#!/usr/bin/env Rscript
#' # FastQC Summary for `r getwd()`
##+ echo=FALSE, message=FALSE

## Note This script is supposed to be knitr::spin'ed

devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R")
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R")

## can we access variables from the parent spin.R process?
#echo("rscript is ", r_script)

argv = commandArgs(TRUE)
#echo("argv is ", argv)

#if(str_detect(argv[1], "fastqc_summary")) argv <- argv[-1]

if(length(argv) != 1){
    stop("Usage: fastqc_summmary.R <directory with fastqc results>")
    echo
}

baseDir=argv[1]


if(is.na(file.info(baseDir)$isdir)){
    stop(paste("directory '", baseDir,"'does not exist"))
}

#baseDir="fastqc"
#baseDir="/Volumes/projects/bioinfo/holger/projects/helin/mouse/fastqc"



fastqDataFiles <- list.files(path=baseDir, pattern="^fastqc_data.txt", full.names=TRUE, recursive=T)

#echo("files are", fastqDataFiles)


#' ## Number of reads per run

readCount <- function(statsFile){
    data.frame(
        run=trim_ext(basename(dirname(statsFile)), c("_fastqc")),
        num_reads=as.numeric(readLines(pipe( paste("grep -F 'Total Sequences	' ", statsFile, " | cut -f2 -d'\t'") )))
    )
}

readCounts <- fastqDataFiles %>% ldply(readCount) %>% print_head()

require.auto(scales)
gg <- ggplot(readCounts, aes(run, num_reads)) + geom_bar(stat="identity") + coord_flip() + scale_y_continuous(labels=comma) + ggtitle("read counts")

#+ results='asis'
gg %>%  ggsave2(width=10, height = round(nrow(readCounts)/4), limitsize=FALSE) %>% paste0("<img src='", ., "'><br>") %>% cat()


### Create a faill/pass matrix

readSummary <- function(statsFile){
    # echo("reading", statsFile)
    # statsFile="./fastqc/mouse_big_cyst_rep1_fastqc/summary.txt"

    fileMapStats <- read.delim(statsFile, header=F) %>%
        set_names(c("flag", "score", "run")) %>%
        mutate(run=trim_ext(run, c(".fastq.gz", "fastq")))

    fileMapStats
}

qcSummary <- list.files(path=baseDir, pattern="^summary.txt", full.names=TRUE, recursive=T) %>%  ldply(readSummary)

#' # Base Quality Distribution Summary
#+ fig.height=2+round(nrow(readCounts)/4), fig.width=12
qcSummary %>% ggplot(aes(score, run, fill=tolower(flag))) +
    geom_tile() +
    rotXlab() +
    scale_fill_manual(values = c(fail="darkred", pass="darkgreen", warn="orange")) +
    ggtitle("fastqc passcodes")



#' Median qualities per position to compare positional patterns and differences between the runs

readBaseQualDist <- function(statsFile){
    # statsFile="./fastqc/mouse_big_cyst_rep2_fastqc/fastqc_data.txt"
    # statsFile="./fastqc/mouse_liver_polar_stage3_rep2_fastqc/fastqc_data.txt"
    # grep -A30 -F '>>Per base sequence quality' /Volumes/projects/bioinfo/holger/projects/helin/mouse/fastqc/mouse_big_cyst_rep1_fastqc/fastqc_data.txt | grep -B100 -F '>>END_M' | head -n-1 | tail -n+2 | tr '#' ' '
#    echo("reading", statsFile)

    baseStats <- read.delim(pipe(
            paste("grep -A30 -F '>>Per base sequence quality' ", statsFile, "| grep -B100 -F '>>END_M' | head -n-1 | tail -n+2 | tr '#' ' '")
        )) %>% mutate(
            run=trim_ext(basename(dirname(statsFile)), c("_fastqc"))
        )

    baseStats %>% mutate(base_order=1:n())
}

baseQualities <- fastqDataFiles %>%  ldply(readBaseQualDist)

#with(baseQualities, as.data.frame(table(run)))


baseQualities %>% ggplot(aes(reorder(Base, base_order), Mean, group=run, color=run)) + geom_line() + scale_y_continuous(limits=c(2, 40))


#+ warning=FALSE, fig.width=15

runs <- with(baseQualities, as.data.frame(table(run)))

#' Qualities per run including variance


## http://stackoverflow.com/questions/12518387/can-i-create-an-empty-ggplot2-plot-in-r
baseQualities %>%
#    head(30) %>%
    ggplot() +
    geom_blank(mapping=aes(reorder(Base, base_order))) +
    geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=-Inf, ymax=20), data=runs, alpha=0.05, fill="red") +
    geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=20, ymax=28), data=runs, alpha=0.05, fill=colors()[654]) +
    geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=28, ymax=Inf), data=runs, alpha=0.05, fill="green") +
    geom_boxplot(
            mapping=aes(x=reorder(Base, base_order), ymin = X10th.Percentile, lower = Lower.Quartile , middle = Median, upper = Upper.Quartile , ymax =  X90th.Percentile),
             stat = "identity"
     ) +
    facet_wrap(~run) +
    theme(axis.text.x=element_blank()) +
    scale_y_continuous(limits=c(2, 40)) +
    xlab("")