Skip to content
Snippets Groups Projects
Commit 4da6d8d1 authored by Holger Brandl's avatar Holger Brandl
Browse files

simplified report generation

parent 0f027c52
No related branches found
No related tags found
No related merge requests found
......@@ -32,27 +32,31 @@ if(is.na(file.info(baseDir)$isdir)){
fastqDataFiles <- list.files(path=baseDir, pattern="^fastqc_data.txt", full.names=TRUE, recursive=T)
fastqDataFiles <- list.files(path=baseDir, pattern="*fastqc.zip", full.names=TRUE, recursive=T)
#echo("files are", fastqDataFiles)
get_zip_pipe <- function(zipFile, fileName){
paste0("unzip -p ",zipFile, " ", trim_ext(basename(zipFile), ".zip"), "/", fileName)
}
#' ## Number of reads per run
#statsFile=fastqDataFiles[1]
readCount <- function(statsFile){
data.frame(
run=trim_ext(basename(dirname(statsFile)), c("_fastqc")),
num_reads=as.numeric(readLines(pipe( paste("grep -F 'Total Sequences ' ", statsFile, " | cut -f2 -d'\t'") )))
run=trim_ext(basename(statsFile), c("_fastqc.zip")),
num_reads=as.numeric(readLines(pipe(paste(get_zip_pipe(statsFile, "fastqc_data.txt"), "| grep -F 'Total Sequences' | cut -f2 -d'\t'"))))
)
}
readCounts <- fastqDataFiles %>% ldply(readCount) %>% print_head()
readCounts <- fastqDataFiles %>% ldply(readCount) # %>% print_head()
require.auto(scales)
gg <- ggplot(readCounts, aes(run, num_reads)) + geom_bar(stat="identity") + coord_flip() + scale_y_continuous(labels=comma) + ggtitle("read counts")
#+ fig.width=12, fig.height=round(nrow(readCounts)/3)
ggplot(readCounts, aes(run, num_reads)) + geom_bar(stat="identity") + coord_flip() + scale_y_continuous(labels=comma) + ggtitle("read counts")
#+ results='asis'
gg %>% ggsave2(width=10, height = round(nrow(readCounts)/4), limitsize=FALSE) %>% paste0("<img src='", ., "'><br>") %>% cat()
# #+ results='asis'
# gg %>% ggsave2(width=10, height = round(nrow(readCounts)/4), limitsize=FALSE) %>% paste0("<img src='", ., "'><br>") %>% cat()
### Create a faill/pass matrix
......@@ -61,17 +65,17 @@ readSummary <- function(statsFile){
# echo("reading", statsFile)
# statsFile="./fastqc/mouse_big_cyst_rep1_fastqc/summary.txt"
fileMapStats <- read.delim(statsFile, header=F) %>%
fileMapStats <- read.delim(pipe(get_zip_pipe(statsFile, "summary.txt")), header=F) %>%
set_names(c("flag", "score", "run")) %>%
mutate(run=trim_ext(run, c(".fastq.gz", "fastq")))
fileMapStats
}
qcSummary <- list.files(path=baseDir, pattern="^summary.txt", full.names=TRUE, recursive=T) %>% ldply(readSummary)
qcSummary <- fastqDataFiles %>% ldply(readSummary)
#' # Base Quality Distribution Summary
#+ fig.height=2+round(nrow(readCounts)/4), fig.width=12
#+ fig.height=2+round(nrow(readCounts)/3), fig.width=12
qcSummary %>% ggplot(aes(score, run, fill=tolower(flag))) +
geom_tile() +
rotXlab() +
......@@ -79,8 +83,26 @@ qcSummary %>% ggplot(aes(score, run, fill=tolower(flag))) +
ggtitle("fastqc passcodes")
#' # Read duplication & Library Complexity
dupLevels <- fastqDataFiles %>% ldply(function(statsFile){
data.frame(
run=trim_ext(basename(statsFile), c("_fastqc.zip")),
dedup_proportion=as.numeric(readLines(pipe(paste(get_zip_pipe(statsFile, "fastqc_data.txt"), "| grep -F 'Total Deduplicated Percentage' | cut -f2 -d'\t'"))))/100
)
})
require.auto(scales)
#+ fig.width=12, fig.height=round(nrow(dupLevels)/3)
ggplot(dupLevels, aes(run, dedup_proportion)) + geom_bar(stat="identity") + coord_flip() + scale_y_continuous(labels=percent) + ggtitle("unique_reads/total_reads") + ylim(0,1)
#' Median qualities per position to compare positional patterns and differences between the runs
#' # Median qualities per position
# Allows to compare positional patterns and differences between the runs
readBaseQualDist <- function(statsFile){
# statsFile="./fastqc/mouse_big_cyst_rep2_fastqc/fastqc_data.txt"
......@@ -89,9 +111,9 @@ readBaseQualDist <- function(statsFile){
# echo("reading", statsFile)
baseStats <- read.delim(pipe(
paste("grep -A30 -F '>>Per base sequence quality' ", statsFile, "| grep -B100 -F '>>END_M' | head -n-1 | tail -n+2 | tr '#' ' '")
paste(get_zip_pipe(statsFile, "fastqc_data.txt"), " | grep -A50 -F '>>Per base sequence quality' | grep -B100 -F '>>END_MODULE' | head -n-1 | tail -n+2 | tr '#' ' '")
)) %>% mutate(
run=trim_ext(basename(dirname(statsFile)), c("_fastqc"))
run=trim_ext(basename(statsFile), c(".zip"))
)
baseStats %>% mutate(base_order=1:n())
......@@ -102,15 +124,16 @@ baseQualities <- fastqDataFiles %>% ldply(readBaseQualDist)
#with(baseQualities, as.data.frame(table(run)))
#+ fig.widthh=20
baseQualities %>% ggplot(aes(reorder(Base, base_order), Mean, group=run, color=run)) + geom_line() + scale_y_continuous(limits=c(2, 40))
#+ warning=FALSE, fig.width=15
runs <- with(baseQualities, as.data.frame(table(run)))
#' # Qualities per run including variance
#' Qualities per run including variance
runs <- with(baseQualities, as.data.frame(table(run)))
#+ warning=FALSE, fig.width=15, fig.height=3*ceiling(nrow(runs)/3)
## http://stackoverflow.com/questions/12518387/can-i-create-an-empty-ggplot2-plot-in-r
baseQualities %>%
......@@ -124,7 +147,7 @@ baseQualities %>%
mapping=aes(x=reorder(Base, base_order), ymin = X10th.Percentile, lower = Lower.Quartile , middle = Median, upper = Upper.Quartile , ymax = X90th.Percentile),
stat = "identity"
) +
facet_wrap(~run) +
facet_wrap(~run, ncol=3) +
theme(axis.text.x=element_blank()) +
scale_y_continuous(limits=c(2, 40)) +
xlab("")
......
#!/usr/bin/env Rscript
#' # FastQC Summary for `r getwd()`
##+ echo=FALSE, message=FALSE
## Note This script is supposed to be knitr::spin'ed
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R")
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R")
## can we access variables from the parent spin.R process?
#echo("rscript is ", r_script)
argv = commandArgs(TRUE)
#echo("argv is ", argv)
#if(str_detect(argv[1], "fastqc_summary")) argv <- argv[-1]
if(length(argv) != 1){
stop("Usage: fastqc_summmary.R <directory with fastqc results>")
echo
}
baseDir=argv[1]
if(is.na(file.info(baseDir)$isdir)){
stop(paste("directory '", baseDir,"'does not exist"))
}
#baseDir="fastqc"
#baseDir="/Volumes/projects/bioinfo/holger/projects/helin/mouse/fastqc"
fastqDataFiles <- list.files(path=baseDir, pattern="^fastqc_data.txt", full.names=TRUE, recursive=T)
#echo("files are", fastqDataFiles)
#' ## Number of reads per run
readCount <- function(statsFile){
data.frame(
run=trim_ext(basename(dirname(statsFile)), c("_fastqc")),
num_reads=as.numeric(readLines(pipe( paste("grep -F 'Total Sequences ' ", statsFile, " | cut -f2 -d'\t'") )))
)
}
readCounts <- fastqDataFiles %>% ldply(readCount) %>% print_head()
require.auto(scales)
gg <- ggplot(readCounts, aes(run, num_reads)) + geom_bar(stat="identity") + coord_flip() + scale_y_continuous(labels=comma) + ggtitle("read counts")
#+ results='asis'
gg %>% ggsave2(width=10, height = round(nrow(readCounts)/4), limitsize=FALSE) %>% paste0("<img src='", ., "'><br>") %>% cat()
### Create a faill/pass matrix
readSummary <- function(statsFile){
# echo("reading", statsFile)
# statsFile="./fastqc/mouse_big_cyst_rep1_fastqc/summary.txt"
fileMapStats <- read.delim(statsFile, header=F) %>%
set_names(c("flag", "score", "run")) %>%
mutate(run=trim_ext(run, c(".fastq.gz", "fastq")))
fileMapStats
}
qcSummary <- list.files(path=baseDir, pattern="^summary.txt", full.names=TRUE, recursive=T) %>% ldply(readSummary)
#' # Base Quality Distribution Summary
#+ fig.height=2+round(nrow(readCounts)/4), fig.width=12
qcSummary %>% ggplot(aes(score, run, fill=tolower(flag))) +
geom_tile() +
rotXlab() +
scale_fill_manual(values = c(fail="darkred", pass="darkgreen", warn="orange")) +
ggtitle("fastqc passcodes")
#' Median qualities per position to compare positional patterns and differences between the runs
readBaseQualDist <- function(statsFile){
# statsFile="./fastqc/mouse_big_cyst_rep2_fastqc/fastqc_data.txt"
# statsFile="./fastqc/mouse_liver_polar_stage3_rep2_fastqc/fastqc_data.txt"
# grep -A30 -F '>>Per base sequence quality' /Volumes/projects/bioinfo/holger/projects/helin/mouse/fastqc/mouse_big_cyst_rep1_fastqc/fastqc_data.txt | grep -B100 -F '>>END_M' | head -n-1 | tail -n+2 | tr '#' ' '
# echo("reading", statsFile)
baseStats <- read.delim(pipe(
paste("grep -A30 -F '>>Per base sequence quality' ", statsFile, "| grep -B100 -F '>>END_M' | head -n-1 | tail -n+2 | tr '#' ' '")
)) %>% mutate(
run=trim_ext(basename(dirname(statsFile)), c("_fastqc"))
)
baseStats %>% mutate(base_order=1:n())
}
baseQualities <- fastqDataFiles %>% ldply(readBaseQualDist)
#with(baseQualities, as.data.frame(table(run)))
baseQualities %>% ggplot(aes(reorder(Base, base_order), Mean, group=run, color=run)) + geom_line() + scale_y_continuous(limits=c(2, 40))
#+ warning=FALSE, fig.width=15
runs <- with(baseQualities, as.data.frame(table(run)))
#' Qualities per run including variance
## http://stackoverflow.com/questions/12518387/can-i-create-an-empty-ggplot2-plot-in-r
baseQualities %>%
# head(30) %>%
ggplot() +
geom_blank(mapping=aes(reorder(Base, base_order))) +
geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=-Inf, ymax=20), data=runs, alpha=0.05, fill="red") +
geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=20, ymax=28), data=runs, alpha=0.05, fill=colors()[654]) +
geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=28, ymax=Inf), data=runs, alpha=0.05, fill="green") +
geom_boxplot(
mapping=aes(x=reorder(Base, base_order), ymin = X10th.Percentile, lower = Lower.Quartile , middle = Median, upper = Upper.Quartile , ymax = X90th.Percentile),
stat = "identity"
) +
facet_wrap(~run) +
theme(axis.text.x=element_blank()) +
scale_y_continuous(limits=c(2, 40)) +
xlab("")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment