#!/usr/bin/env Rscript
#' # FastQC Summary for `r getwd()`
##+ echo=FALSE, message=FALSE

## Note This script is supposed to be knitr::spin'ed

devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.9/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.9/R/ggplot_commons.R")

## can we access variables from the parent spin.R process?
#echo("rscript is ", r_script)

argv = commandArgs(TRUE)
#echo("argv is ", argv)

#if(str_detect(argv[1], "fastqc_summary")) argv <- argv[-1]

if(length(argv) != 1){
    stop("Usage: fastqc_summmary.R <directory with fastqc results>")
}

baseDir=argv[1]


if(is.na(file.info(baseDir)$isdir)){
    stop(paste("directory '", baseDir,"'does not exist"))
}

#baseDir="fastqc"
#baseDir="/Volumes/projects/bioinfo/holger/projects/helin/mouse/fastqc"



fastqDataFiles <- list.files(path=baseDir, pattern="*fastqc.zip", full.names=TRUE, recursive=T)

#echo("files are", fastqDataFiles)
get_zip_pipe <- function(zipFile, fileName){
    paste0("unzip -p ",zipFile, " ", trim_ext(basename(zipFile), ".zip"), "/", fileName)
}

#' ## Number of reads per run

#statsFile=fastqDataFiles[1]
readCount <- function(statsFile){
    data.frame(
        run=trim_ext(basename(statsFile), c("_fastqc.zip")),
        num_reads=as.numeric(readLines(pipe(paste(get_zip_pipe(statsFile, "fastqc_data.txt"), "| grep -F 'Total Sequences' | cut -f2 -d'\t'"))))
    )
}

readCounts <- fastqDataFiles %>% ldply(readCount) # %>% print_head()

require.auto(scales)
#+ fig.width=12, fig.height=round(nrow(readCounts)/2)
ggplot(readCounts, aes(run, num_reads)) + geom_bar(stat="identity") + coord_flip() + scale_y_continuous(labels=comma) + ggtitle("read counts")

# #+ results='asis'
# gg %>%  ggsave2(width=10, height = round(nrow(readCounts)/4), limitsize=FALSE) %>% paste0("<img src='", ., "'><br>") %>% cat()


### Create a faill/pass matrix

readSummary <- function(statsFile){
    # echo("reading", statsFile)
    # statsFile="./fastqc/mouse_big_cyst_rep1_fastqc/summary.txt"

    fileMapStats <- read.delim(pipe(get_zip_pipe(statsFile, "summary.txt")), header=F) %>%
        set_names(c("flag", "score", "run")) %>%
        mutate(run=trim_ext(run, c(".fastq.gz", "fastq")))

    fileMapStats
}

qcSummary <- fastqDataFiles %>%  ldply(readSummary)

#' # Base Quality Distribution Summary
#+ fig.height=2+round(nrow(readCounts)/2), fig.width=12
qcSummary %>% ggplot(aes(score, run, fill=tolower(flag))) +
    geom_tile() +
    rotXlab() +
    scale_fill_manual(values = c(fail="darkred", pass="darkgreen", warn="orange")) +
    ggtitle("fastqc passcodes")


#' # Read duplication & Library Complexity


dupLevels <- fastqDataFiles %>% ldply(function(statsFile){
   data.frame(
       run=trim_ext(basename(statsFile), c("_fastqc.zip")),
       dedup_proportion=as.numeric(readLines(pipe(paste(get_zip_pipe(statsFile, "fastqc_data.txt"), "| grep -F 'Total Deduplicated Percentage' | cut -f2 -d'\t'"))))/100
   )
})

require.auto(scales)
#+ fig.width=12, fig.height=round(nrow(dupLevels)/2)
ggplot(dupLevels, aes(run, dedup_proportion)) + geom_bar(stat="identity") + coord_flip() + scale_y_continuous(labels=percent) + ggtitle("unique_reads/total_reads") + ylim(0,1)




#' # Median qualities per position

# Allows to compare positional patterns and differences between the runs

readBaseQualDist <- function(statsFile){
    # statsFile="./fastqc/mouse_big_cyst_rep2_fastqc/fastqc_data.txt"
    # statsFile="./fastqc/mouse_liver_polar_stage3_rep2_fastqc/fastqc_data.txt"
    # grep -A30 -F '>>Per base sequence quality' /Volumes/projects/bioinfo/holger/projects/helin/mouse/fastqc/mouse_big_cyst_rep1_fastqc/fastqc_data.txt | grep -B100 -F '>>END_M' | head -n-1 | tail -n+2 | tr '#' ' '
#    echo("reading", statsFile)

    baseStats <- read.delim(pipe(
            paste(get_zip_pipe(statsFile, "fastqc_data.txt"), " | grep -A60 -F '>>Per base sequence quality' | grep -B100 -F '>>END_MODULE' | head -n-1 | tail -n+2 | tr '#' ' '")
        )) %>% mutate(
            run=trim_ext(basename(statsFile), c(".zip"))
        )

    baseStats %>% mutate(base_order=1:n())
}

baseQualities <- fastqDataFiles %>%  ldply(readBaseQualDist)
statsFile="fastqc/L8038_Track-21511_R1.trim_fastqc.zip"
#with(baseQualities, as.data.frame(table(run)))


#+ fig.widthh=20
baseQualities %>% ggplot(aes(reorder(Base, base_order), Mean, group=run, color=run)) + geom_line() + scale_y_continuous(limits=c(2, 40))



#' # Qualities per run including variance

runs <- with(baseQualities, as.data.frame(table(run)))

#+ warning=FALSE, fig.width=15, fig.height=3*ceiling(nrow(runs)/2)

## http://stackoverflow.com/questions/12518387/can-i-create-an-empty-ggplot2-plot-in-r
baseQualities %>%
#    head(30) %>%
    ggplot() +
    geom_blank(mapping=aes(reorder(Base, base_order))) +
    geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=-Inf, ymax=20), data=runs, alpha=0.05, fill="red") +
    geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=20, ymax=28), data=runs, alpha=0.05, fill=colors()[654]) +
    geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=28, ymax=Inf), data=runs, alpha=0.05, fill="green") +
    geom_boxplot(
            mapping=aes(x=reorder(Base, base_order), ymin = X10th.Percentile, lower = Lower.Quartile , middle = Median, upper = Upper.Quartile , ymax =  X90th.Percentile),
             stat = "identity"
     ) +
    facet_wrap(~run, ncol=3) +
    theme(axis.text.x=element_blank()) +
    scale_y_continuous(limits=c(2, 40)) +
    xlab("")