cont chipseq workflow

8e6e0e04 · Holger Brandl · 37bf0348 · 8e6e0e04 · 8e6e0e04 · 8e6e0e04
Commit 8e6e0e04 authored 10 years ago by Holger Brandl
--- a/chipseq_workflow/chipseq_utils.sh
+++ b/chipseq_workflow/chipseq_utils.sh
+## enable snippet spinning
+source <(curl https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/spinr/spin_utils.sh 2>&1 2>/dev/null)
+
+
+## configure paths
+#python setup_w_cython.py install --prefix /home/brandl/bin/macs2
+
+export PATH=/home/brandl/bin/macs2/bin:$PATH
+export PYTHONPATH=/home/brandl/bin/macs2/lib:$PYTHONPATH
+export PYTHONPATH=/home/brandl/bin/macs2/lib/python2.7/site-packages:$PYTHONPATH
+
+#/home/brandl/bin/macs2/bin/macs2 -h
+
+
+cs_bowtie_qc(){
+
+echo '
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R")
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R")
+
+########################################################################################################################
+#> # Bowtie2 Mapping Summary
+
+#> Directory: `r getwd()`
+
+logSuffix=".bowtie.log"
+parseAlgnSummary <- function(alignSummary){
+    #alignSummary="/lustre/projects/bioinfo/holger/projects/khan_chipseq_h1/alignments/H1L_Shield.bowtie.log"
+    algnData <- readLines(alignSummary)
+
+    data.frame(
+        condition=trimEnd(basename(alignSummary), logSuffix),
+        num_reads=as.numeric(str_split_fixed(algnData[1], " ", 2)[1]),
+        mapping_efficiency=as.numeric(str_replace(str_split_fixed(algnData[6], " ", 2)[1], "%", "")),
+
+        unique_mappers=as.numeric(str_split_fixed(str_trim(algnData[4]), " ", 2)[1]),
+        unique_mapper_prop=str_match(algnData[4], "[(]([0-9.]*)[%)]*")  %>% subset(select=2) %>% as.numeric(),
+
+        multi_mappers=as.numeric(str_split_fixed(str_trim(algnData[5]), " ", 2)[1]),
+        multi_mappers_prop=str_match(algnData[5], "[(]([0-9.]*)[%)]*")  %>% subset(select=2) %>% as.numeric()
+    )
+}
+
+algnSummary <- list.files(".", logSuffix, full.names=TRUE, recursive=T) %>% ldply(parseAlgnSummary)
+write.delim(algnSummary, file="bowtie2_qc.txt")
+
+scale_fill_discrete <- function (...){ scale_color_brewer(..., type = "seq", palette="Set1", "fill", na.value = "grey50") }
+
+
+#+ fig.height=nrow(algnSummary)
+ggplot(algnSummary, aes(condition, mapping_efficiency)) +
+    geom_bar(stat="identity") +
+    coord_flip() +
+    ylim(0,100) +
+    ggtitle("mapping efficiency")
+
+ggplot(algnSummary, aes(condition, num_reads)) +
+    geom_bar(stat="identity") +
+    coord_flip() +
+    ggtitle("read counts") +scale_y_continuous(labels=comma)
+
+ggplot(algnSummary, aes(condition, unique_mapper_prop)) +
+    geom_bar(stat="identity") + coord_flip() +
+    ggtitle("unique-mapper proportions") +
+    scale_y_continuous(labels=comma)
+
+ggplot(algnSummary, aes(condition, multi_mappers_prop)) +
+    geom_bar(stat="identity") + coord_flip() +
+    ggtitle("multi-mapper proportions") +
+    scale_y_continuous(labels=comma)
+
+#> ## Bam Correlation
+
+#> Using simple genome binning, we can performa a simple clustering/corrlation analysis
+
+## http://stackoverflow.com/questions/291813/recommended-way-to-embed-pdf-in-html
+#+ results="asis"
+cat("<embed src=\"./bc.pdf\" width=\"900px\" height=\"700px\">")
+
+' | spinsnip "bowtie2_qc"
+
+}
\ No newline at end of file
--- a/chipseq_workflow/cs_snippets.sh
+++ b/chipseq_workflow/cs_snippets.sh
--- a/dge_workflow/bams_qc.R
+++ b/dge_workflow/bams_qc.R
@@ -5,19 +5,17 @@ suppressMessages(library(docopt))

 # retrieve and parse the command-line arguments
 doc <- '
-Create a small summary for bam-files in a directory
+Create a small summary for bam-files
 Usage: bams_qc.R <base_directory>

 Options:
 -c        Peform correlation analysis
 '

-#print(paste("args are:", commandArgs(TRUE)))
 opts <- docopt(doc, commandArgs(TRUE))
-#opts <- docopt(doc, ". .")
+#opts <- docopt(doc, ".")

 if(!exists("opts")){ stop(doc); return }
-#print("doc opts are")
 #print(opts)

 baseDir <- normalizePath(opts$base_directory)
@@ -26,33 +24,40 @@ if(is.na(file.info(baseDir)$isdir)){
    stop(paste("base directory", baseDir, "does not exist"))
 }

-devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R")
-devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R")
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R")
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R")


 ########################################################################################################################
 #' # Mapping Summary for: `r baseDir`

-parseAlgnSummary_T2_0_11 <- function(alignSummary){
-    #alignSummary="/projects/bioinfo/holger/projects/marta_rnaseq/human_leipzig/mapping/S5382_aRG_1b_rep1/align_summary.txt"
+
+
+logSuffix=".bowtie.log"
+parseAlgnSummary <- function(alignSummary){
+    #alignSummary="/lustre/projects/bioinfo/holger/projects/khan_chipseq_h1/alignments/H1L_Shield.bowtie.log"
    algnData <- readLines(alignSummary)

    data.frame(
-        condition=basename(dirname(alignSummary)),
-        num_reads=as.numeric(str_match(algnData[2], " ([0-9]*$)")[,2]),
-        mapped_reads=as.numeric(str_match(algnData[3], ":[ ]*([0-9]*) ")[,2][1])
-    ) %>% transform(mapping_efficiency=100*mapped_reads/num_reads)
-}
+        condition=trimEnd(basename(alignSummary), logSuffix),
+        num_reads=as.numeric(str_split_fixed(algnData[1], " ", 2)[1]),
+        mapping_efficiency=as.numeric(str_replace(str_split_fixed(algnData[6], " ", 2)[1], "%", "")),

+        unique_mappers=as.numeric(str_split_fixed(str_trim(algnData[4]), " ", 2)[1]),
+        unique_mapper_prop=str_match(algnData[4], "[(]([0-9.]*)[%)]*")  %>% subset(select=2) %>% as.numeric(),

-algnSummary <- ldply(list.files(".", "align_summary.txt", full.names=TRUE, recursive=T), parseAlgnSummary_T2_0_11)
-write.delim(algnSummary, file="tophat_mapping_stats.txt")
-# algnSummary <- read.delim("algnSummary.txt")
-#'  [Tophat Mapping Statistics](tophat_mapping_stats.txt)
+        multi_mappers=as.numeric(str_split_fixed(str_trim(algnData[5]), " ", 2)[1]),
+        multi_mappers_prop=str_match(algnData[5], "[(]([0-9.]*)[%)]*")  %>% subset(select=2) %>% as.numeric()
+    )
+}
+
+algnSummary <- list.files(baseDir, logSuffix, full.names=TRUE, recursive=T) %>% ldply(parseAlgnSummary)
+write.delim(algnSummary, file="bowtie2_qc.txt")

 scale_fill_discrete <- function (...){ scale_color_brewer(..., type = "seq", palette="Set1", "fill", na.value = "grey50") }


+#+ fig.height=nrow(algnSummary)/3
 ggplot(algnSummary, aes(condition, mapping_efficiency)) +
    geom_bar(stat="identity") +
    coord_flip() +
@@ -64,21 +69,13 @@ ggplot(algnSummary, aes(condition, num_reads)) +
    coord_flip() +
    ggtitle("read counts") +scale_y_continuous(labels=comma)

-ggplot(algnSummary, aes(condition, mapped_reads)) +
+ggplot(algnSummary, aes(condition, unique_mapper_prop)) +
    geom_bar(stat="identity") + coord_flip() +
-    ggtitle("alignments counts") +
+    ggtitle("unique-mapper proportions") +
    scale_y_continuous(labels=comma)

+ggplot(algnSummary, aes(condition, multi_mappers_prop)) +
+    geom_bar(stat="identity") + coord_flip() +
+    ggtitle("multi-mapper proportions") +
+    scale_y_continuous(labels=comma)

-#ggplot(melt(algnSummary), aes(condition, value)) + geom_bar(stat="identity") +facet_wrap(~variable, scales="free") + ggtitle("mapping summary") + scale_y_continuous(labels=comma)  + theme(axis.text.x=element_text(angle=90, hjust=0))
-#ggsave2(w=10, h=10, p="mapstats")
-
-##todo multimapper counting
-
-########################################################################################################################
-#' ## Alignment Correlation
-
-#' Without using any transcriptome as reference, the genome can be binned and alignment counts per bin can be used to perform a correlation analysis.
-#' Used tool [deepTools](https://github.com/fidelram/deepTools)
-
-### todo integrate bam correlation analysis
\ No newline at end of file
--- a/dge_workflow/tophat_qc.R
+++ b/dge_workflow/tophat_qc.R
@@ -6,7 +6,7 @@ suppressMessages(library(docopt))
 # retrieve and parse the command-line arguments
 doc <- '
 Create a small summary alignment summary files created when running tophat2.
-Usage: bams_qc.R <base_directory>
+Usage: bowtie2_qc.R <base_directory>

 Options:
 -c        Peform correlation analysis