star_qc.R 5.21 KB
Newer Older
Holger Brandl's avatar
Holger Brandl committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
#!/usr/bin/env Rscript
#+ echo=FALSE, message=FALSE

suppressMessages(library(docopt))

# retrieve and parse the command-line arguments
doc <- '
Create a small summary alignment summary files created when running star.
Usage: star_qc.R <base_directory>
'

#print(paste("args are:", commandArgs(TRUE)))
opts <- docopt(doc, commandArgs(TRUE))
#opts <- docopt(doc, ".")

if(!exists("opts")){ stop(doc); return }
#print(opts)

base_directory <- normalizePath(opts$base_directory)

if(is.na(file.info(base_directory)$isdir)){
    stop(paste("base directory", base_directory, "does not exist"))
}

25 26
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.22/R/core_commons.R")
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.22/R/ggplot_commons.R")
27
require_auto(knitr)
Holger Brandl's avatar
Holger Brandl committed
28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58

########################################################################################################################
#' # STAR Mapping Summary

#' Data directory: `r base_directory`

renaming_scheme=c("%" = "Perc", "/" = " by ", " "="_")


read_star_summary <- function(summaryFile){
    #DEBUG summaryFile="control_4.Log.final.out"

   read.delim(summaryFile, sep="|", header=F) %>%
        mutate_each(funs(str_trim)) %>%
        set_names("variable", "value") %>%
        ## remove dates
        filter(!str_detect(value, " ")) %>% 
        mutate(value=value %>% str_replace(fixed("%"), "") %>% as.numeric()) %>%
        mutate(variable=str_replace_all(variable, renaming_scheme)) %>%
        filter(!is.na(value)) %>%
        mutate(sample=basename(summaryFile) %>% trim_ext(".Log.final.out"))
}
#read_star_summary("control_4.Log.final.out")



algnSummaryLong <- ldply(list.files(base_directory, "final.out", full.names=TRUE, recursive=T), read_star_summary)

algnSummary <- algnSummaryLong %>% dcast(sample ~ variable)
algnSummary %<>% mutate(mapping_efficiency=Perc_of_reads_mapped_to_multiple_loci + Uniquely_mapped_reads_Perc)

59 60
#require_auto(DT)
#datatable(algnSummary)
Holger Brandl's avatar
Holger Brandl committed
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124


write.delim(algnSummary, file="star_mapping_stats.txt")
# algnSummary <- read.delim("star_mapping_stats.txt")
#'  [STAR Mapping Statistics](star_mapping_stats.txt)

#scale_fill_discrete <- function (...){ scale_color_brewer(..., type = "seq", palette="Set1", "fill", na.value = "grey50") }


#+ fig.height=nrow(algnSummary)/2.5+2, fig.width=12
ggplot(algnSummary, aes(sample, Number_of_input_reads)) +
    geom_bar(stat="identity") +
    coord_flip() +
    ggtitle("read counts") +scale_y_continuous(labels=comma)


algnSummaryLong %>% filter(str_detect(variable, "Uniquely_mapped_reads_Perc|Perc_of_reads_mapped_to_multiple_loci")) %>%
    ggplot(aes(sample, value, fill=variable)) +
    geom_bar(stat="identity") +
    coord_flip()   +
    ylim(0,100) +
    ggtitle("mapping efficiency") +
    theme(legend.position='bottom')
#    guides(fill = guide_legend(position="left"))

#    guide_legend(position="left")

#plotData <- algnSummaryLong %>% filter(str_detect(variable, "Uniquely_mapped_reads_Perc|Perc_of_reads_mapped_to_multiple_loci"))
#    ggplot(plotData, aes(sample, value, color=ac(variable))) +
#    geom_bar(stat="identity", position="dodge") +
#    coord_flip()  +
#    ylim(0,100) +
#    ggtitle("mapping efficiency") +
#    guides(fill = guide_legend(position="bottom"))
#
#
#ggplot(algnSummary, aes(sample, mapping_efficiency)) +
#    geom_bar(stat="identity") +
#    coord_flip() +
#    ylim(0,100) +
#    ggtitle("mapping efficiency")
#
#ggplot(algnSummary, aes(sample, Uniquely_mapped_reads_Perc)) +
#    geom_bar(stat="identity") +
#    coord_flip() +
#    ylim(0,100) +
#    ggtitle("Unique mapper proportions")
#
#ggplot(algnSummary, aes(sample, Perc_of_reads_mapped_to_multiple_loci)) +
#    geom_bar(stat="identity") +
#    coord_flip() +
#    ylim(0,100) +
#    ggtitle("multi mapper proportions")


#ggplot(algnSummary, aes(condition, mapped_reads)) +
#    geom_bar(stat="identity") + coord_flip() +
#    ggtitle("alignments counts") +
#    scale_y_continuous(labels=comma)

########################################################################################################################
#' ## Alignment Correlation

#' Without using any transcriptome as reference, the genome can be binned and alignment counts per bin can be used to perform a correlation analysis.
125
#' Here we use [deepTools](https://github.com/fidelram/deepTools) `bamCorrelate` to do so, which computes the overall similarity between two or more BAM files based on read coverage (number of reads) within genomic regions, i.e. for each pair of BAM files, the reads overlapping with the same genomic intervals are counted. Then, the correlation methods are used to determine how similar the counts are for the pairs of BAM files, i.e. if a region contains many reads in file A, does it also contain many reads in file B.
Holger Brandl's avatar
Holger Brandl committed
126

127 128 129 130 131 132 133 134 135 136 137 138 139
#' **[Open Correlation Matrix](bc.pdf)**

###' <a href="bc.pdf"/>
##+ results="asis", echo=FALSE
##cat('<script type="text/javascript">
##$( document ).ready(function() {
##    console.log( "ready!" );
##    $( "#alignment-correlation > p" ).append("<embed src=\\"./bc.pdf\\' width=\\"1100px\\' height=\\'1000px\\'>");
##    $( "#alignment-correlation > p" ).append("<a href=\\"bc.pdf\\">Open Correlation Matrix</a>");
##});
##</script>
## '
### http://stackoverflow.com/questions/291813/recommended-way-to-embed-pdf-in-html