#!/usr/bin/env Rscript #' # FastQC Summary for `r getwd()` ##+ echo=FALSE, message=FALSE ## Note This script is supposed to be knitr::spin'ed devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R") devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R") ## can we access variables from the parent spin.R process? #echo("rscript is ", r_script) argv = commandArgs(TRUE) #echo("argv is ", argv) #if(str_detect(argv[1], "fastqc_summary")) argv <- argv[-1] if(length(argv) != 1){ stop("Usage: fastqc_summmary.R <directory with fastqc results>") echo } baseDir=argv[1] if(is.na(file.info(baseDir)$isdir)){ stop(paste("directory '", baseDir,"'does not exist")) } #baseDir="fastqc" #baseDir="/Volumes/projects/bioinfo/holger/projects/helin/mouse/fastqc" fastqDataFiles <- list.files(path=baseDir, pattern="^fastqc_data.txt", full.names=TRUE, recursive=T) #echo("files are", fastqDataFiles) #' ## Number of reads per run readCount <- function(statsFile){ data.frame( run=trim_ext(basename(dirname(statsFile)), c("_fastqc")), num_reads=as.numeric(readLines(pipe( paste("grep -F 'Total Sequences ' ", statsFile, " | cut -f2 -d'\t'") ))) ) } readCounts <- fastqDataFiles %>% ldply(readCount) %>% print_head() require.auto(scales) ggplot(readCounts, aes(run, num_reads)) + geom_bar(stat="identity") + coord_flip() + scale_y_continuous(labels=comma) ### Create a faill/pass matrix readSummary <- function(statsFile){ # echo("reading", statsFile) # statsFile="./fastqc/mouse_big_cyst_rep1_fastqc/summary.txt" fileMapStats <- read.delim(statsFile, header=F) %>% set_names(c("flag", "score", "run")) %>% mutate(run=trim_ext(run, c(".fastq.gz", "fastq"))) fileMapStats } qcSummary <- list.files(path=baseDir, pattern="^summary.txt", full.names=TRUE, recursive=T) %>% ldply(readSummary) qcSummary %>% ggplot(aes(score, run, fill=tolower(flag))) + geom_tile() + rotXlab() + scale_fill_manual(values = c(fail="darkred", pass="darkgreen", warn="orange")) #' # Base Quality Distribution Summary #' Median qualities per position to compare positional patterns and differences between the runs readBaseQualDist <- function(statsFile){ # statsFile="./fastqc/mouse_big_cyst_rep2_fastqc/fastqc_data.txt" # statsFile="./fastqc/mouse_liver_polar_stage3_rep2_fastqc/fastqc_data.txt" # grep -A30 -F '>>Per base sequence quality' /Volumes/projects/bioinfo/holger/projects/helin/mouse/fastqc/mouse_big_cyst_rep1_fastqc/fastqc_data.txt | grep -B100 -F '>>END_M' | head -n-1 | tail -n+2 | tr '#' ' ' # echo("reading", statsFile) baseStats <- read.delim(pipe( paste("grep -A30 -F '>>Per base sequence quality' ", statsFile, "| grep -B100 -F '>>END_M' | head -n-1 | tail -n+2 | tr '#' ' '") )) %>% mutate( run=trim_ext(basename(dirname(statsFile)), c("_fastqc")) ) baseStats %>% mutate(base_order=1:n()) } baseQualities <- fastqDataFiles %>% ldply(readBaseQualDist) #with(baseQualities, as.data.frame(table(run))) baseQualities %>% ggplot(aes(reorder(Base, base_order), Mean, group=run, color=run)) + geom_line() + scale_y_continuous(limits=c(2, 40)) #+ warning=FALSE, fig.width=15 runs <- with(baseQualities, as.data.frame(table(run))) #' Qualities per run including variance ## http://stackoverflow.com/questions/12518387/can-i-create-an-empty-ggplot2-plot-in-r baseQualities %>% # head(30) %>% ggplot() + geom_blank(mapping=aes(reorder(Base, base_order))) + geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=-Inf, ymax=20), data=runs, alpha=0.05, fill="red") + geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=20, ymax=28), data=runs, alpha=0.05, fill=colors()[654]) + geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=28, ymax=Inf), data=runs, alpha=0.05, fill="green") + geom_boxplot( mapping=aes(x=reorder(Base, base_order), ymin = X10th.Percentile, lower = Lower.Quartile , middle = Median, upper = Upper.Quartile , ymax = X90th.Percentile), stat = "identity" ) + facet_wrap(~run) + theme(axis.text.x=element_blank()) + scale_y_continuous(limits=c(2, 40)) + xlab("")