tophat_qc.R

#!/usr/bin/env Rscript
#+ echo=FALSE, message=FALSE

suppressMessages(library(docopt))

# retrieve and parse the command-line arguments
doc <- '
Create a small summary alignment summary files created when running tophat2.
Usage: tophat_qc.R <base_directory>

Options:
-c        Peform correlation analysis
'

#print(paste("args are:", commandArgs(TRUE)))
opts <- docopt(doc, commandArgs(TRUE))
#opts <- docopt(doc, ". .")

if(!exists("opts")){ stop(doc); return }
#print("doc opts are")
#print(opts)

baseDir <- normalizePath(opts$base_directory)

if(is.na(file.info(baseDir)$isdir)){
    stop(paste("base directory", baseDir, "does not exist"))
}

devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R")
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R")


########################################################################################################################
#' # Mapping Summary for: `r baseDir`

parseAlgnSummary_T2_0_11 <- function(alignSummary){
    #alignSummary="/projects/bioinfo/holger/projects/marta_rnaseq/human_leipzig/mapping/S5382_aRG_1b_rep1/align_summary.txt"
    algnData <- readLines(alignSummary)

    data.frame(
        condition=basename(dirname(alignSummary)),
        num_reads=as.numeric(str_match(algnData[2], " ([0-9]*$)")[,2]),
        mapped_reads=as.numeric(str_match(algnData[3], ":[ ]*([0-9]*) ")[,2][1])
    ) %>% transform(mapping_efficiency=100*mapped_reads/num_reads)
}


algnSummary <- ldply(list.files(".", "align_summary.txt", full.names=TRUE, recursive=T), parseAlgnSummary_T2_0_11)
write.delim(algnSummary, file="tophat_mapping_stats.txt")
# algnSummary <- read.delim("algnSummary.txt")
#'  [Tophat Mapping Statistics](tophat_mapping_stats.txt)

scale_fill_discrete <- function (...){ scale_color_brewer(..., type = "seq", palette="Set1", "fill", na.value = "grey50") }


ggplot(algnSummary, aes(condition, mapping_efficiency)) +
    geom_bar(stat="identity") +
    coord_flip() +
    ylim(0,100) +
    ggtitle("mapping efficiency")

ggplot(algnSummary, aes(condition, num_reads)) +
    geom_bar(stat="identity") +
    coord_flip() +
    ggtitle("read counts") +scale_y_continuous(labels=comma)

ggplot(algnSummary, aes(condition, mapped_reads)) +
    geom_bar(stat="identity") + coord_flip() +
    ggtitle("alignments counts") +
    scale_y_continuous(labels=comma)


#ggplot(melt(algnSummary), aes(condition, value)) + geom_bar(stat="identity") +facet_wrap(~variable, scales="free") + ggtitle("mapping summary") + scale_y_continuous(labels=comma)  + theme(axis.text.x=element_text(angle=90, hjust=0))
#ggsave2(w=10, h=10, p="mapstats")

##todo multimapper counting

########################################################################################################################
#' ## Alignment Correlation

#' Without using any transcriptome as reference, the genome can be binned and alignment counts per bin can be used to perform a correlation analysis.
#' Used tool [deepTools](https://github.com/fidelram/deepTools)

### todo integrate bam correlation analysis