Newer
Older
## Note This script is supposed to be knitr::spin'ed
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R")
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R")
## can we access variables from the parent spin.R process?
#echo("rscript is ", r_script)
#if(str_detect(argv[1], "fastqc_summary")) argv <- argv[-1]
if(length(argv) != 1){
stop("Usage: fastqc_summmary.R <directory with fastqc results>")
baseDir=argv[1]
if(is.na(file.info(baseDir)$isdir)){
stop(paste("directory '", baseDir,"'does not exist"))
#baseDir="fastqc"
#baseDir="/Volumes/projects/bioinfo/holger/projects/helin/mouse/fastqc"
fastqDataFiles <- list.files(path=baseDir, pattern="^fastqc_data.txt", full.names=TRUE, recursive=T)
#' ## Number of reads per run
readCount <- function(statsFile){
data.frame(
run=trim_ext(basename(dirname(statsFile)), c("_fastqc")),
num_reads=as.numeric(readLines(pipe( paste("grep -F 'Total Sequences ' ", statsFile, " | cut -f2 -d'\t'") )))
)
}
readCounts <- fastqDataFiles %>% ldply(readCount) %>% print_head()
gg <- ggplot(readCounts, aes(run, num_reads)) + geom_bar(stat="identity") + coord_flip() + scale_y_continuous(labels=comma) + ggtitle("read counts")
#+ results='asis'
gg %>% ggsave2(width=10, height = round(nrow(readCounts)/4), limitsize=FALSE) %>% paste0("<img src='", ., "'><br>") %>% cat()
### Create a faill/pass matrix
readSummary <- function(statsFile){
# echo("reading", statsFile)
# statsFile="./fastqc/mouse_big_cyst_rep1_fastqc/summary.txt"
fileMapStats <- read.delim(statsFile, header=F) %>%
set_names(c("flag", "score", "run")) %>%
mutate(run=trim_ext(run, c(".fastq.gz", "fastq")))
fileMapStats
}
qcSummary <- list.files(path=baseDir, pattern="^summary.txt", full.names=TRUE, recursive=T) %>% ldply(readSummary)
#' # Base Quality Distribution Summary
#+ fig.height=2+round(nrow(readCounts)/4), fig.width=12
qcSummary %>% ggplot(aes(score, run, fill=tolower(flag))) +
scale_fill_manual(values = c(fail="darkred", pass="darkgreen", warn="orange")) +
ggtitle("fastqc passcodes")
#' Median qualities per position to compare positional patterns and differences between the runs
readBaseQualDist <- function(statsFile){
# statsFile="./fastqc/mouse_big_cyst_rep2_fastqc/fastqc_data.txt"
# statsFile="./fastqc/mouse_liver_polar_stage3_rep2_fastqc/fastqc_data.txt"
# grep -A30 -F '>>Per base sequence quality' /Volumes/projects/bioinfo/holger/projects/helin/mouse/fastqc/mouse_big_cyst_rep1_fastqc/fastqc_data.txt | grep -B100 -F '>>END_M' | head -n-1 | tail -n+2 | tr '#' ' '
baseStats <- read.delim(pipe(
paste("grep -A30 -F '>>Per base sequence quality' ", statsFile, "| grep -B100 -F '>>END_M' | head -n-1 | tail -n+2 | tr '#' ' '")
)) %>% mutate(
run=trim_ext(basename(dirname(statsFile)), c("_fastqc"))
)
baseStats %>% mutate(base_order=1:n())
}
baseQualities <- fastqDataFiles %>% ldply(readBaseQualDist)
#with(baseQualities, as.data.frame(table(run)))
baseQualities %>% ggplot(aes(reorder(Base, base_order), Mean, group=run, color=run)) + geom_line() + scale_y_continuous(limits=c(2, 40))
runs <- with(baseQualities, as.data.frame(table(run)))
## http://stackoverflow.com/questions/12518387/can-i-create-an-empty-ggplot2-plot-in-r
baseQualities %>%
ggplot() +
geom_blank(mapping=aes(reorder(Base, base_order))) +
geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=-Inf, ymax=20), data=runs, alpha=0.05, fill="red") +
geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=20, ymax=28), data=runs, alpha=0.05, fill=colors()[654]) +
geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=28, ymax=Inf), data=runs, alpha=0.05, fill="green") +
geom_boxplot(
mapping=aes(x=reorder(Base, base_order), ymin = X10th.Percentile, lower = Lower.Quartile , middle = Median, upper = Upper.Quartile , ymax = X90th.Percentile),
stat = "identity"
) +
facet_wrap(~run) +
theme(axis.text.x=element_blank()) +
scale_y_continuous(limits=c(2, 40)) +
xlab("")