From 8e6e0e0496812918a0aa9ac92ffb91a01daf21ec Mon Sep 17 00:00:00 2001 From: Holger Brandl <brandl@mpi-cbg.de> Date: Mon, 9 Mar 2015 12:42:29 +0100 Subject: [PATCH] cont chipseq workflow --- chipseq_workflow/chipseq_utils.sh | 82 ++++++++++++++++++++++++++++++ chipseq_workflow/cs_snippets.sh | 0 dge_workflow/bams_qc.R | 84 ------------------------------- dge_workflow/bowtie2_qc.R | 81 +++++++++++++++++++++++++++++ dge_workflow/tophat_qc.R | 2 +- 5 files changed, 164 insertions(+), 85 deletions(-) create mode 100755 chipseq_workflow/chipseq_utils.sh create mode 100755 chipseq_workflow/cs_snippets.sh delete mode 100755 dge_workflow/bams_qc.R create mode 100755 dge_workflow/bowtie2_qc.R diff --git a/chipseq_workflow/chipseq_utils.sh b/chipseq_workflow/chipseq_utils.sh new file mode 100755 index 0000000..9f72999 --- /dev/null +++ b/chipseq_workflow/chipseq_utils.sh @@ -0,0 +1,82 @@ +## enable snippet spinning +source <(curl https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/spinr/spin_utils.sh 2>&1 2>/dev/null) + + +## configure paths +#python setup_w_cython.py install --prefix /home/brandl/bin/macs2 + +export PATH=/home/brandl/bin/macs2/bin:$PATH +export PYTHONPATH=/home/brandl/bin/macs2/lib:$PYTHONPATH +export PYTHONPATH=/home/brandl/bin/macs2/lib/python2.7/site-packages:$PYTHONPATH + +#/home/brandl/bin/macs2/bin/macs2 -h + + +cs_bowtie_qc(){ + +echo ' +devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R") +devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R") + +######################################################################################################################## +#> # Bowtie2 Mapping Summary + +#> Directory: `r getwd()` + +logSuffix=".bowtie.log" +parseAlgnSummary <- function(alignSummary){ + #alignSummary="/lustre/projects/bioinfo/holger/projects/khan_chipseq_h1/alignments/H1L_Shield.bowtie.log" + algnData <- readLines(alignSummary) + + data.frame( + condition=trimEnd(basename(alignSummary), logSuffix), + num_reads=as.numeric(str_split_fixed(algnData[1], " ", 2)[1]), + mapping_efficiency=as.numeric(str_replace(str_split_fixed(algnData[6], " ", 2)[1], "%", "")), + + unique_mappers=as.numeric(str_split_fixed(str_trim(algnData[4]), " ", 2)[1]), + unique_mapper_prop=str_match(algnData[4], "[(]([0-9.]*)[%)]*") %>% subset(select=2) %>% as.numeric(), + + multi_mappers=as.numeric(str_split_fixed(str_trim(algnData[5]), " ", 2)[1]), + multi_mappers_prop=str_match(algnData[5], "[(]([0-9.]*)[%)]*") %>% subset(select=2) %>% as.numeric() + ) +} + +algnSummary <- list.files(".", logSuffix, full.names=TRUE, recursive=T) %>% ldply(parseAlgnSummary) +write.delim(algnSummary, file="bowtie2_qc.txt") + +scale_fill_discrete <- function (...){ scale_color_brewer(..., type = "seq", palette="Set1", "fill", na.value = "grey50") } + + +#+ fig.height=nrow(algnSummary) +ggplot(algnSummary, aes(condition, mapping_efficiency)) + + geom_bar(stat="identity") + + coord_flip() + + ylim(0,100) + + ggtitle("mapping efficiency") + +ggplot(algnSummary, aes(condition, num_reads)) + + geom_bar(stat="identity") + + coord_flip() + + ggtitle("read counts") +scale_y_continuous(labels=comma) + +ggplot(algnSummary, aes(condition, unique_mapper_prop)) + + geom_bar(stat="identity") + coord_flip() + + ggtitle("unique-mapper proportions") + + scale_y_continuous(labels=comma) + +ggplot(algnSummary, aes(condition, multi_mappers_prop)) + + geom_bar(stat="identity") + coord_flip() + + ggtitle("multi-mapper proportions") + + scale_y_continuous(labels=comma) + +#> ## Bam Correlation + +#> Using simple genome binning, we can performa a simple clustering/corrlation analysis + +## http://stackoverflow.com/questions/291813/recommended-way-to-embed-pdf-in-html +#+ results="asis" +cat("<embed src=\"./bc.pdf\" width=\"900px\" height=\"700px\">") + +' | spinsnip "bowtie2_qc" + +} \ No newline at end of file diff --git a/chipseq_workflow/cs_snippets.sh b/chipseq_workflow/cs_snippets.sh new file mode 100755 index 0000000..e69de29 diff --git a/dge_workflow/bams_qc.R b/dge_workflow/bams_qc.R deleted file mode 100755 index bd86f99..0000000 --- a/dge_workflow/bams_qc.R +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env Rscript -#+ echo=FALSE, message=FALSE - -suppressMessages(library(docopt)) - -# retrieve and parse the command-line arguments -doc <- ' -Create a small summary for bam-files in a directory -Usage: bams_qc.R <base_directory> - -Options: --c Peform correlation analysis -' - -#print(paste("args are:", commandArgs(TRUE))) -opts <- docopt(doc, commandArgs(TRUE)) -#opts <- docopt(doc, ". .") - -if(!exists("opts")){ stop(doc); return } -#print("doc opts are") -#print(opts) - -baseDir <- normalizePath(opts$base_directory) - -if(is.na(file.info(baseDir)$isdir)){ - stop(paste("base directory", baseDir, "does not exist")) -} - -devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R") -devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R") - - -######################################################################################################################## -#' # Mapping Summary for: `r baseDir` - -parseAlgnSummary_T2_0_11 <- function(alignSummary){ - #alignSummary="/projects/bioinfo/holger/projects/marta_rnaseq/human_leipzig/mapping/S5382_aRG_1b_rep1/align_summary.txt" - algnData <- readLines(alignSummary) - - data.frame( - condition=basename(dirname(alignSummary)), - num_reads=as.numeric(str_match(algnData[2], " ([0-9]*$)")[,2]), - mapped_reads=as.numeric(str_match(algnData[3], ":[ ]*([0-9]*) ")[,2][1]) - ) %>% transform(mapping_efficiency=100*mapped_reads/num_reads) -} - - -algnSummary <- ldply(list.files(".", "align_summary.txt", full.names=TRUE, recursive=T), parseAlgnSummary_T2_0_11) -write.delim(algnSummary, file="tophat_mapping_stats.txt") -# algnSummary <- read.delim("algnSummary.txt") -#' [Tophat Mapping Statistics](tophat_mapping_stats.txt) - -scale_fill_discrete <- function (...){ scale_color_brewer(..., type = "seq", palette="Set1", "fill", na.value = "grey50") } - - -ggplot(algnSummary, aes(condition, mapping_efficiency)) + - geom_bar(stat="identity") + - coord_flip() + - ylim(0,100) + - ggtitle("mapping efficiency") - -ggplot(algnSummary, aes(condition, num_reads)) + - geom_bar(stat="identity") + - coord_flip() + - ggtitle("read counts") +scale_y_continuous(labels=comma) - -ggplot(algnSummary, aes(condition, mapped_reads)) + - geom_bar(stat="identity") + coord_flip() + - ggtitle("alignments counts") + - scale_y_continuous(labels=comma) - - -#ggplot(melt(algnSummary), aes(condition, value)) + geom_bar(stat="identity") +facet_wrap(~variable, scales="free") + ggtitle("mapping summary") + scale_y_continuous(labels=comma) + theme(axis.text.x=element_text(angle=90, hjust=0)) -#ggsave2(w=10, h=10, p="mapstats") - -##todo multimapper counting - -######################################################################################################################## -#' ## Alignment Correlation - -#' Without using any transcriptome as reference, the genome can be binned and alignment counts per bin can be used to perform a correlation analysis. -#' Used tool [deepTools](https://github.com/fidelram/deepTools) - -### todo integrate bam correlation analysis \ No newline at end of file diff --git a/dge_workflow/bowtie2_qc.R b/dge_workflow/bowtie2_qc.R new file mode 100755 index 0000000..35d3bc1 --- /dev/null +++ b/dge_workflow/bowtie2_qc.R @@ -0,0 +1,81 @@ +#!/usr/bin/env Rscript +#+ echo=FALSE, message=FALSE + +suppressMessages(library(docopt)) + +# retrieve and parse the command-line arguments +doc <- ' +Create a small summary for bam-files +Usage: bams_qc.R <base_directory> + +Options: +-c Peform correlation analysis +' + +opts <- docopt(doc, commandArgs(TRUE)) +#opts <- docopt(doc, ".") + +if(!exists("opts")){ stop(doc); return } +#print(opts) + +baseDir <- normalizePath(opts$base_directory) + +if(is.na(file.info(baseDir)$isdir)){ + stop(paste("base directory", baseDir, "does not exist")) +} + +devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R") +devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R") + + +######################################################################################################################## +#' # Mapping Summary for: `r baseDir` + + + +logSuffix=".bowtie.log" +parseAlgnSummary <- function(alignSummary){ + #alignSummary="/lustre/projects/bioinfo/holger/projects/khan_chipseq_h1/alignments/H1L_Shield.bowtie.log" + algnData <- readLines(alignSummary) + + data.frame( + condition=trimEnd(basename(alignSummary), logSuffix), + num_reads=as.numeric(str_split_fixed(algnData[1], " ", 2)[1]), + mapping_efficiency=as.numeric(str_replace(str_split_fixed(algnData[6], " ", 2)[1], "%", "")), + + unique_mappers=as.numeric(str_split_fixed(str_trim(algnData[4]), " ", 2)[1]), + unique_mapper_prop=str_match(algnData[4], "[(]([0-9.]*)[%)]*") %>% subset(select=2) %>% as.numeric(), + + multi_mappers=as.numeric(str_split_fixed(str_trim(algnData[5]), " ", 2)[1]), + multi_mappers_prop=str_match(algnData[5], "[(]([0-9.]*)[%)]*") %>% subset(select=2) %>% as.numeric() + ) +} + +algnSummary <- list.files(baseDir, logSuffix, full.names=TRUE, recursive=T) %>% ldply(parseAlgnSummary) +write.delim(algnSummary, file="bowtie2_qc.txt") + +scale_fill_discrete <- function (...){ scale_color_brewer(..., type = "seq", palette="Set1", "fill", na.value = "grey50") } + + +#+ fig.height=nrow(algnSummary)/3 +ggplot(algnSummary, aes(condition, mapping_efficiency)) + + geom_bar(stat="identity") + + coord_flip() + + ylim(0,100) + + ggtitle("mapping efficiency") + +ggplot(algnSummary, aes(condition, num_reads)) + + geom_bar(stat="identity") + + coord_flip() + + ggtitle("read counts") +scale_y_continuous(labels=comma) + +ggplot(algnSummary, aes(condition, unique_mapper_prop)) + + geom_bar(stat="identity") + coord_flip() + + ggtitle("unique-mapper proportions") + + scale_y_continuous(labels=comma) + +ggplot(algnSummary, aes(condition, multi_mappers_prop)) + + geom_bar(stat="identity") + coord_flip() + + ggtitle("multi-mapper proportions") + + scale_y_continuous(labels=comma) + diff --git a/dge_workflow/tophat_qc.R b/dge_workflow/tophat_qc.R index 55c66d4..75cbd4f 100755 --- a/dge_workflow/tophat_qc.R +++ b/dge_workflow/tophat_qc.R @@ -6,7 +6,7 @@ suppressMessages(library(docopt)) # retrieve and parse the command-line arguments doc <- ' Create a small summary alignment summary files created when running tophat2. -Usage: bams_qc.R <base_directory> +Usage: bowtie2_qc.R <base_directory> Options: -c Peform correlation analysis -- GitLab