started chipseq workflow

37bf0348 · Holger Brandl · d6cd88e6 · 37bf0348 · 37bf0348 · 37bf0348
Commit 37bf0348 authored 10 years ago by Holger Brandl
--- a/dge_workflow/bam_qc.R
+++ b/dge_workflow/bam_qc.R
@@ -6,7 +6,7 @@ suppressMessages(library(docopt))
 # retrieve and parse the command-line arguments
 doc <- '
 Create a small summary for bam-files in a directory
-Usage: bam_qc.R <base_directory>
+Usage: bams_qc.R <base_directory>
 Options:
 -c        Peform correlation analysis

--- a/dge_workflow/dge_analysis.R
+++ b/dge_workflow/dge_analysis.R
@@ -19,7 +19,7 @@ Options:
 # Stefania: opts <- docopt(doc, "--undirected --pcutoff 0.05  ..")
 # opts <- docopt(doc, "--undirected  ..")
 # opts <- docopt(doc, paste(Sys.getenv("DGE_PARAMS"), ".."))
+## todo use commandArgs here!!
 opts <- docopt(doc, paste(Sys.getenv("DGE_PARAMS"), paste(commandArgs(TRUE), collapse=" ")))
 #opts
@@ -38,11 +38,11 @@ if(is.na(file.info(cuffdb_directory)$isdir)){
 }
-devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R")
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R")
-devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R")
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R")
 require(cummeRbund)
-devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/bio/diffex_commons.R")
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/bio/diffex_commons.R")
 #knitr::opts_knit$set(root.dir = getwd())
@@ -348,6 +348,7 @@ sigEnrResults %>% do({
 	    print(logPlot)
 })
+ggsave2()
 #+ include=FALSE
 #ggsave2(ggplot(sigEnrResults, aes(set)) + geom_bar() + ggtitle("enriched term frequencies")) # + facet_wrap(~site_type))

--- a/dge_workflow/dge_utils.sh
+++ b/dge_workflow/dge_utils.sh
@@ -162,7 +162,7 @@ ziprm tophat_logs ${project}__tophat__*.log
 dge_bam_correlate .
 ## create tophat mapping report
-spin.R $DGE_HOME/bam_qc.R .
+spin.R $DGE_HOME/tophat_qc.R .
 mailme "$project: tophat done in $(pwd)"
@@ -194,7 +194,7 @@ export -f dge_bam_correlate
 ## Merge technical replicatews
 dge_merge_treps(){
-usage="Usage: dge_bam_correlate <bam_directory> <comma_sep_biosample_spec>"
+usage="Usage: dge_merge_treps <bam_directory> <comma_sep_biosample_spec>"
 if [ $# -ne 2 ]; then
    echo $usage >&2 ; return;
@@ -221,6 +221,7 @@ for sample in $(echo $bio_samples | tr ",", " "); do
    sampleBams=$(find $bam_dir -name '*bam'  | grep -v unmapped | grep $sample)
    echo "merging $sample with $sampleBams"
+    ## todo add read-groups to bam files
    if [ 1 -eq $(echo "$sampleBams" | wc -l) ]; then
        cp $sampleBams $sample.bam
    else
@@ -239,6 +240,26 @@ export -f dge_merge_treps
+#dge_cd(){
+#( ## required to work around docopt sys.exit
+#usage='
+#Use cuffdiff2 to performa a differntial expression analysis
+#Usage: dge_cuffdiff [--exclude=<regex>] <gtfFile> <bamDir> <labels>
+#
+#Options:
+#--exclude=<regex>   Exclude patterns (grep regex matching bam file paths to be excluded)
+#'
+#echo "$usage" | ~/bin/docopts/docopts -h -  : $*
+#eval "$(echo "$usage" | ~/bin/docopts/docopts -h -  : $*)"
+#
+#)
+#}
+#dge_cd $gtfFilePC $baseDir/mapped $labels
+#dge_cd --help
 dge_cuffdiff(){
 local gtfFile=$1
@@ -259,6 +280,12 @@ fi
 ## collect all bam-files and group them
 allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | sort)
+if [ -n "$EXCLUDE_BAMS_PATTERN" ]; then
+    echo "excluding bams with $EXCLUDE_BAMS_PATTERN"
+    allBams=$(echo "$allBams" | grep -v "$EXCLUDE_BAMS_PATTERN")
+fi
 ## todo use optional argument here --> default "unmapped" --> allow user to add others
 #allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | grep -v "1029" | sort)
 #allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | grep -v "Ctrl_1029" | sort)

--- a/dge_workflow/tophat_qc.R
+++ b/dge_workflow/tophat_qc.R
+#!/usr/bin/env Rscript
+#+ echo=FALSE, message=FALSE
+suppressMessages(library(docopt))
+# retrieve and parse the command-line arguments
+doc <- '
+Create a small summary alignment summary files created when running tophat2.
+Usage: bams_qc.R <base_directory>
+Options:
+-c        Peform correlation analysis
+'
+#print(paste("args are:", commandArgs(TRUE)))
+opts <- docopt(doc, commandArgs(TRUE))
+#opts <- docopt(doc, ". .")
+if(!exists("opts")){ stop(doc); return }
+#print("doc opts are")
+#print(opts)
+baseDir <- normalizePath(opts$base_directory)
+if(is.na(file.info(baseDir)$isdir)){
+    stop(paste("base directory", baseDir, "does not exist"))
+}
+devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R")
+devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R")
+########################################################################################################################
+#' # Mapping Summary for: `r baseDir`
+parseAlgnSummary_T2_0_11 <- function(alignSummary){
+    #alignSummary="/projects/bioinfo/holger/projects/marta_rnaseq/human_leipzig/mapping/S5382_aRG_1b_rep1/align_summary.txt"
+    algnData <- readLines(alignSummary)
+    data.frame(
+        condition=basename(dirname(alignSummary)),
+        num_reads=as.numeric(str_match(algnData[2], " ([0-9]*$)")[,2]),
+        mapped_reads=as.numeric(str_match(algnData[3], ":[ ]*([0-9]*) ")[,2][1])
+    ) %>% transform(mapping_efficiency=100*mapped_reads/num_reads)
+}
+algnSummary <- ldply(list.files(".", "align_summary.txt", full.names=TRUE, recursive=T), parseAlgnSummary_T2_0_11)
+write.delim(algnSummary, file="tophat_mapping_stats.txt")
+# algnSummary <- read.delim("algnSummary.txt")
+#'  [Tophat Mapping Statistics](tophat_mapping_stats.txt)
+scale_fill_discrete <- function (...){ scale_color_brewer(..., type = "seq", palette="Set1", "fill", na.value = "grey50") }
+ggplot(algnSummary, aes(condition, mapping_efficiency)) +
+    geom_bar(stat="identity") +
+    coord_flip() +
+    ylim(0,100) +
+    ggtitle("mapping efficiency")
+ggplot(algnSummary, aes(condition, num_reads)) +
+    geom_bar(stat="identity") +
+    coord_flip() +
+    ggtitle("read counts") +scale_y_continuous(labels=comma)
+ggplot(algnSummary, aes(condition, mapped_reads)) +
+    geom_bar(stat="identity") + coord_flip() +
+    ggtitle("alignments counts") +
+    scale_y_continuous(labels=comma)
+#ggplot(melt(algnSummary), aes(condition, value)) + geom_bar(stat="identity") +facet_wrap(~variable, scales="free") + ggtitle("mapping summary") + scale_y_continuous(labels=comma)  + theme(axis.text.x=element_text(angle=90, hjust=0))
+#ggsave2(w=10, h=10, p="mapstats")
+##todo multimapper counting
+########################################################################################################################
+#' ## Alignment Correlation
+#' Without using any transcriptome as reference, the genome can be binned and alignment counts per bin can be used to perform a correlation analysis.
+#' Used tool [deepTools](https://github.com/fidelram/deepTools)
+### todo integrate bam correlation analysis
\ No newline at end of file