From 8e6e0e0496812918a0aa9ac92ffb91a01daf21ec Mon Sep 17 00:00:00 2001
From: Holger Brandl <brandl@mpi-cbg.de>
Date: Mon, 9 Mar 2015 12:42:29 +0100
Subject: [PATCH] cont chipseq workflow

---
 chipseq_workflow/chipseq_utils.sh | 82 ++++++++++++++++++++++++++++++
 chipseq_workflow/cs_snippets.sh   |  0
 dge_workflow/bams_qc.R            | 84 -------------------------------
 dge_workflow/bowtie2_qc.R         | 81 +++++++++++++++++++++++++++++
 dge_workflow/tophat_qc.R          |  2 +-
 5 files changed, 164 insertions(+), 85 deletions(-)
 create mode 100755 chipseq_workflow/chipseq_utils.sh
 create mode 100755 chipseq_workflow/cs_snippets.sh
 delete mode 100755 dge_workflow/bams_qc.R
 create mode 100755 dge_workflow/bowtie2_qc.R

diff --git a/chipseq_workflow/chipseq_utils.sh b/chipseq_workflow/chipseq_utils.sh
new file mode 100755
index 0000000..9f72999
--- /dev/null
+++ b/chipseq_workflow/chipseq_utils.sh
@@ -0,0 +1,82 @@
+## enable snippet spinning
+source <(curl https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/spinr/spin_utils.sh 2>&1 2>/dev/null)
+
+
+## configure paths
+#python setup_w_cython.py install --prefix /home/brandl/bin/macs2
+
+export PATH=/home/brandl/bin/macs2/bin:$PATH
+export PYTHONPATH=/home/brandl/bin/macs2/lib:$PYTHONPATH
+export PYTHONPATH=/home/brandl/bin/macs2/lib/python2.7/site-packages:$PYTHONPATH
+
+#/home/brandl/bin/macs2/bin/macs2 -h
+
+
+cs_bowtie_qc(){
+
+echo '
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R")
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R")
+
+########################################################################################################################
+#> # Bowtie2 Mapping Summary
+
+#> Directory: `r getwd()`
+
+logSuffix=".bowtie.log"
+parseAlgnSummary <- function(alignSummary){
+    #alignSummary="/lustre/projects/bioinfo/holger/projects/khan_chipseq_h1/alignments/H1L_Shield.bowtie.log"
+    algnData <- readLines(alignSummary)
+
+    data.frame(
+        condition=trimEnd(basename(alignSummary), logSuffix),
+        num_reads=as.numeric(str_split_fixed(algnData[1], " ", 2)[1]),
+        mapping_efficiency=as.numeric(str_replace(str_split_fixed(algnData[6], " ", 2)[1], "%", "")),
+
+        unique_mappers=as.numeric(str_split_fixed(str_trim(algnData[4]), " ", 2)[1]),
+        unique_mapper_prop=str_match(algnData[4], "[(]([0-9.]*)[%)]*")  %>% subset(select=2) %>% as.numeric(),
+
+        multi_mappers=as.numeric(str_split_fixed(str_trim(algnData[5]), " ", 2)[1]),
+        multi_mappers_prop=str_match(algnData[5], "[(]([0-9.]*)[%)]*")  %>% subset(select=2) %>% as.numeric()
+    )
+}
+
+algnSummary <- list.files(".", logSuffix, full.names=TRUE, recursive=T) %>% ldply(parseAlgnSummary)
+write.delim(algnSummary, file="bowtie2_qc.txt")
+
+scale_fill_discrete <- function (...){ scale_color_brewer(..., type = "seq", palette="Set1", "fill", na.value = "grey50") }
+
+
+#+ fig.height=nrow(algnSummary)
+ggplot(algnSummary, aes(condition, mapping_efficiency)) +
+    geom_bar(stat="identity") +
+    coord_flip() +
+    ylim(0,100) +
+    ggtitle("mapping efficiency")
+
+ggplot(algnSummary, aes(condition, num_reads)) +
+    geom_bar(stat="identity") +
+    coord_flip() +
+    ggtitle("read counts") +scale_y_continuous(labels=comma)
+
+ggplot(algnSummary, aes(condition, unique_mapper_prop)) +
+    geom_bar(stat="identity") + coord_flip() +
+    ggtitle("unique-mapper proportions") +
+    scale_y_continuous(labels=comma)
+
+ggplot(algnSummary, aes(condition, multi_mappers_prop)) +
+    geom_bar(stat="identity") + coord_flip() +
+    ggtitle("multi-mapper proportions") +
+    scale_y_continuous(labels=comma)
+
+#> ## Bam Correlation
+
+#> Using simple genome binning, we can performa a simple clustering/corrlation analysis
+
+## http://stackoverflow.com/questions/291813/recommended-way-to-embed-pdf-in-html
+#+ results="asis"
+cat("<embed src=\"./bc.pdf\" width=\"900px\" height=\"700px\">")
+
+' | spinsnip "bowtie2_qc"
+
+}
\ No newline at end of file
diff --git a/chipseq_workflow/cs_snippets.sh b/chipseq_workflow/cs_snippets.sh
new file mode 100755
index 0000000..e69de29
diff --git a/dge_workflow/bams_qc.R b/dge_workflow/bams_qc.R
deleted file mode 100755
index bd86f99..0000000
--- a/dge_workflow/bams_qc.R
+++ /dev/null
@@ -1,84 +0,0 @@
-#!/usr/bin/env Rscript
-#+ echo=FALSE, message=FALSE
-
-suppressMessages(library(docopt))
-
-# retrieve and parse the command-line arguments
-doc <- '
-Create a small summary for bam-files in a directory
-Usage: bams_qc.R <base_directory>
-
-Options:
--c        Peform correlation analysis
-'
-
-#print(paste("args are:", commandArgs(TRUE)))
-opts <- docopt(doc, commandArgs(TRUE))
-#opts <- docopt(doc, ". .")
-
-if(!exists("opts")){ stop(doc); return }
-#print("doc opts are")
-#print(opts)
-
-baseDir <- normalizePath(opts$base_directory)
-
-if(is.na(file.info(baseDir)$isdir)){
-    stop(paste("base directory", baseDir, "does not exist"))
-}
-
-devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R")
-devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R")
-
-
-########################################################################################################################
-#' # Mapping Summary for: `r baseDir`
-
-parseAlgnSummary_T2_0_11 <- function(alignSummary){
-    #alignSummary="/projects/bioinfo/holger/projects/marta_rnaseq/human_leipzig/mapping/S5382_aRG_1b_rep1/align_summary.txt"
-    algnData <- readLines(alignSummary)
-
-    data.frame(
-        condition=basename(dirname(alignSummary)),
-        num_reads=as.numeric(str_match(algnData[2], " ([0-9]*$)")[,2]),
-        mapped_reads=as.numeric(str_match(algnData[3], ":[ ]*([0-9]*) ")[,2][1])
-    ) %>% transform(mapping_efficiency=100*mapped_reads/num_reads)
-}
-
-
-algnSummary <- ldply(list.files(".", "align_summary.txt", full.names=TRUE, recursive=T), parseAlgnSummary_T2_0_11)
-write.delim(algnSummary, file="tophat_mapping_stats.txt")
-# algnSummary <- read.delim("algnSummary.txt")
-#'  [Tophat Mapping Statistics](tophat_mapping_stats.txt)
-
-scale_fill_discrete <- function (...){ scale_color_brewer(..., type = "seq", palette="Set1", "fill", na.value = "grey50") }
-
-
-ggplot(algnSummary, aes(condition, mapping_efficiency)) +
-    geom_bar(stat="identity") +
-    coord_flip() +
-    ylim(0,100) +
-    ggtitle("mapping efficiency")
-
-ggplot(algnSummary, aes(condition, num_reads)) +
-    geom_bar(stat="identity") +
-    coord_flip() +
-    ggtitle("read counts") +scale_y_continuous(labels=comma)
-
-ggplot(algnSummary, aes(condition, mapped_reads)) +
-    geom_bar(stat="identity") + coord_flip() +
-    ggtitle("alignments counts") +
-    scale_y_continuous(labels=comma)
-
-
-#ggplot(melt(algnSummary), aes(condition, value)) + geom_bar(stat="identity") +facet_wrap(~variable, scales="free") + ggtitle("mapping summary") + scale_y_continuous(labels=comma)  + theme(axis.text.x=element_text(angle=90, hjust=0))
-#ggsave2(w=10, h=10, p="mapstats")
-
-##todo multimapper counting
-
-########################################################################################################################
-#' ## Alignment Correlation
-
-#' Without using any transcriptome as reference, the genome can be binned and alignment counts per bin can be used to perform a correlation analysis.
-#' Used tool [deepTools](https://github.com/fidelram/deepTools)
-
-### todo integrate bam correlation analysis
\ No newline at end of file
diff --git a/dge_workflow/bowtie2_qc.R b/dge_workflow/bowtie2_qc.R
new file mode 100755
index 0000000..35d3bc1
--- /dev/null
+++ b/dge_workflow/bowtie2_qc.R
@@ -0,0 +1,81 @@
+#!/usr/bin/env Rscript
+#+ echo=FALSE, message=FALSE
+
+suppressMessages(library(docopt))
+
+# retrieve and parse the command-line arguments
+doc <- '
+Create a small summary for bam-files
+Usage: bams_qc.R <base_directory>
+
+Options:
+-c        Peform correlation analysis
+'
+
+opts <- docopt(doc, commandArgs(TRUE))
+#opts <- docopt(doc, ".")
+
+if(!exists("opts")){ stop(doc); return }
+#print(opts)
+
+baseDir <- normalizePath(opts$base_directory)
+
+if(is.na(file.info(baseDir)$isdir)){
+    stop(paste("base directory", baseDir, "does not exist"))
+}
+
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R")
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R")
+
+
+########################################################################################################################
+#' # Mapping Summary for: `r baseDir`
+
+
+
+logSuffix=".bowtie.log"
+parseAlgnSummary <- function(alignSummary){
+    #alignSummary="/lustre/projects/bioinfo/holger/projects/khan_chipseq_h1/alignments/H1L_Shield.bowtie.log"
+    algnData <- readLines(alignSummary)
+
+    data.frame(
+        condition=trimEnd(basename(alignSummary), logSuffix),
+        num_reads=as.numeric(str_split_fixed(algnData[1], " ", 2)[1]),
+        mapping_efficiency=as.numeric(str_replace(str_split_fixed(algnData[6], " ", 2)[1], "%", "")),
+
+        unique_mappers=as.numeric(str_split_fixed(str_trim(algnData[4]), " ", 2)[1]),
+        unique_mapper_prop=str_match(algnData[4], "[(]([0-9.]*)[%)]*")  %>% subset(select=2) %>% as.numeric(),
+
+        multi_mappers=as.numeric(str_split_fixed(str_trim(algnData[5]), " ", 2)[1]),
+        multi_mappers_prop=str_match(algnData[5], "[(]([0-9.]*)[%)]*")  %>% subset(select=2) %>% as.numeric()
+    )
+}
+
+algnSummary <- list.files(baseDir, logSuffix, full.names=TRUE, recursive=T) %>% ldply(parseAlgnSummary)
+write.delim(algnSummary, file="bowtie2_qc.txt")
+
+scale_fill_discrete <- function (...){ scale_color_brewer(..., type = "seq", palette="Set1", "fill", na.value = "grey50") }
+
+
+#+ fig.height=nrow(algnSummary)/3
+ggplot(algnSummary, aes(condition, mapping_efficiency)) +
+    geom_bar(stat="identity") +
+    coord_flip() +
+    ylim(0,100) +
+    ggtitle("mapping efficiency")
+
+ggplot(algnSummary, aes(condition, num_reads)) +
+    geom_bar(stat="identity") +
+    coord_flip() +
+    ggtitle("read counts") +scale_y_continuous(labels=comma)
+
+ggplot(algnSummary, aes(condition, unique_mapper_prop)) +
+    geom_bar(stat="identity") + coord_flip() +
+    ggtitle("unique-mapper proportions") +
+    scale_y_continuous(labels=comma)
+
+ggplot(algnSummary, aes(condition, multi_mappers_prop)) +
+    geom_bar(stat="identity") + coord_flip() +
+    ggtitle("multi-mapper proportions") +
+    scale_y_continuous(labels=comma)
+
diff --git a/dge_workflow/tophat_qc.R b/dge_workflow/tophat_qc.R
index 55c66d4..75cbd4f 100755
--- a/dge_workflow/tophat_qc.R
+++ b/dge_workflow/tophat_qc.R
@@ -6,7 +6,7 @@ suppressMessages(library(docopt))
 # retrieve and parse the command-line arguments
 doc <- '
 Create a small summary alignment summary files created when running tophat2.
-Usage: bams_qc.R <base_directory>
+Usage: bowtie2_qc.R <base_directory>
 
 Options:
 -c        Peform correlation analysis
-- 
GitLab