Skip to content
Snippets Groups Projects
bowtie2_qc.R 2.71 KiB
#!/usr/bin/env Rscript
#+ echo=FALSE, message=FALSE

suppressMessages(library(docopt))

# retrieve and parse the command-line arguments
doc <- '
Create a small summary for bam-files
Usage: bams_qc.R <base_directory>

Options:
-c        Peform correlation analysis
'

opts <- docopt(doc, commandArgs(TRUE))
#opts <- docopt(doc, ".")

if(!exists("opts")){ stop(doc); return }
#print(opts)

baseDir <- normalizePath(opts$base_directory)

if(is.na(file.info(baseDir)$isdir)){
    stop(paste("base directory", baseDir, "does not exist"))
}

devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R")


########################################################################################################################
#' # Mapping Summary for: `r baseDir`



logSuffix=".bowtie.log"
parseAlgnSummary <- function(alignSummary){
    #alignSummary="/lustre/projects/bioinfo/holger/projects/khan_chipseq_h1/alignments/H1L_Shield.bowtie.log"
    algnData <- readLines(alignSummary)

    data.frame(
        sample=trimEnd(basename(alignSummary), logSuffix),
        num_reads=as.numeric(str_split_fixed(algnData[1], " ", 2)[1]),
        mapping_efficiency=as.numeric(str_replace(str_split_fixed(algnData[6], " ", 2)[1], "%", "")),

        unique_mappers=as.numeric(str_split_fixed(str_trim(algnData[4]), " ", 2)[1]),
        unique_mapper_prop=str_match(algnData[4], "[(]([0-9.]*)[%)]*")  %>% subset(select=2) %>% as.numeric(),

        multi_mappers=as.numeric(str_split_fixed(str_trim(algnData[5]), " ", 2)[1]),
        multi_mappers_prop=str_match(algnData[5], "[(]([0-9.]*)[%)]*")  %>% subset(select=2) %>% as.numeric()
    )
}

algnSummary <- list.files(baseDir, logSuffix, full.names=TRUE, recursive=T) %>% ldply(parseAlgnSummary)
write.delim(algnSummary, file="bowtie2_qc.txt")

scale_fill_discrete <- function (...){ scale_color_brewer(..., type = "seq", palette="Set1", "fill", na.value = "grey50") }


#+ fig.height=nrow(algnSummary)/3
ggplot(algnSummary, aes(sample, mapping_efficiency)) +
    geom_bar(stat="identity") +
    coord_flip() +
    ylim(0,100) +
    ggtitle("mapping efficiency")

ggplot(algnSummary, aes(sample, num_reads)) +
    geom_bar(stat="identity") +
    coord_flip() +
    ggtitle("read counts") +scale_y_continuous(labels=comma)

ggplot(algnSummary, aes(sample, unique_mapper_prop)) +
    geom_bar(stat="identity") + coord_flip() +
    ggtitle("unique-mapper proportions") +
    scale_y_continuous(labels=comma)

ggplot(algnSummary, aes(sample, multi_mappers_prop)) +
    geom_bar(stat="identity") + coord_flip() +
    ggtitle("multi-mapper proportions") +
    scale_y_continuous(labels=comma)