diff --git a/dge_workflow/bam_qc.R b/dge_workflow/bams_qc.R similarity index 98% rename from dge_workflow/bam_qc.R rename to dge_workflow/bams_qc.R index 5213ec59ba0282367a86573a02973b1a70fc9a83..bd86f995fdf8575a38c4dfcae178753445b90517 100755 --- a/dge_workflow/bam_qc.R +++ b/dge_workflow/bams_qc.R @@ -6,7 +6,7 @@ suppressMessages(library(docopt)) # retrieve and parse the command-line arguments doc <- ' Create a small summary for bam-files in a directory -Usage: bam_qc.R <base_directory> +Usage: bams_qc.R <base_directory> Options: -c Peform correlation analysis diff --git a/dge_workflow/dge_analysis.R b/dge_workflow/dge_analysis.R index 998558367cbc1c8867cadbf76ffdb55815e661da..70322b30fa13a3bcb6c693100697327b32cda14b 100755 --- a/dge_workflow/dge_analysis.R +++ b/dge_workflow/dge_analysis.R @@ -19,7 +19,7 @@ Options: # Stefania: opts <- docopt(doc, "--undirected --pcutoff 0.05 ..") # opts <- docopt(doc, "--undirected ..") # opts <- docopt(doc, paste(Sys.getenv("DGE_PARAMS"), "..")) - +## todo use commandArgs here!! opts <- docopt(doc, paste(Sys.getenv("DGE_PARAMS"), paste(commandArgs(TRUE), collapse=" "))) #opts @@ -38,11 +38,11 @@ if(is.na(file.info(cuffdb_directory)$isdir)){ } -devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R") -devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R") +devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R") +devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R") require(cummeRbund) -devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/bio/diffex_commons.R") +devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/bio/diffex_commons.R") #knitr::opts_knit$set(root.dir = getwd()) @@ -348,6 +348,7 @@ sigEnrResults %>% do({ print(logPlot) }) +ggsave2() #+ include=FALSE #ggsave2(ggplot(sigEnrResults, aes(set)) + geom_bar() + ggtitle("enriched term frequencies")) # + facet_wrap(~site_type)) diff --git a/dge_workflow/dge_utils.sh b/dge_workflow/dge_utils.sh index 4e6cfb3ccd1bbb0d56bd127e6565c6a5dfaf4c11..69506411953ae4958d9d1ee9f2d6a5e9b1bd6f3b 100755 --- a/dge_workflow/dge_utils.sh +++ b/dge_workflow/dge_utils.sh @@ -162,7 +162,7 @@ ziprm tophat_logs ${project}__tophat__*.log dge_bam_correlate . ## create tophat mapping report -spin.R $DGE_HOME/bam_qc.R . +spin.R $DGE_HOME/tophat_qc.R . mailme "$project: tophat done in $(pwd)" @@ -194,7 +194,7 @@ export -f dge_bam_correlate ## Merge technical replicatews dge_merge_treps(){ -usage="Usage: dge_bam_correlate <bam_directory> <comma_sep_biosample_spec>" +usage="Usage: dge_merge_treps <bam_directory> <comma_sep_biosample_spec>" if [ $# -ne 2 ]; then echo $usage >&2 ; return; @@ -221,6 +221,7 @@ for sample in $(echo $bio_samples | tr ",", " "); do sampleBams=$(find $bam_dir -name '*bam' | grep -v unmapped | grep $sample) echo "merging $sample with $sampleBams" + ## todo add read-groups to bam files if [ 1 -eq $(echo "$sampleBams" | wc -l) ]; then cp $sampleBams $sample.bam else @@ -239,6 +240,26 @@ export -f dge_merge_treps +#dge_cd(){ +#( ## required to work around docopt sys.exit +#usage=' +#Use cuffdiff2 to performa a differntial expression analysis +#Usage: dge_cuffdiff [--exclude=<regex>] <gtfFile> <bamDir> <labels> +# +#Options: +#--exclude=<regex> Exclude patterns (grep regex matching bam file paths to be excluded) +#' +#echo "$usage" | ~/bin/docopts/docopts -h - : $* +#eval "$(echo "$usage" | ~/bin/docopts/docopts -h - : $*)" +# +#) +#} + + +#dge_cd $gtfFilePC $baseDir/mapped $labels +#dge_cd --help + + dge_cuffdiff(){ local gtfFile=$1 @@ -259,6 +280,12 @@ fi ## collect all bam-files and group them allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | sort) + +if [ -n "$EXCLUDE_BAMS_PATTERN" ]; then + echo "excluding bams with $EXCLUDE_BAMS_PATTERN" + allBams=$(echo "$allBams" | grep -v "$EXCLUDE_BAMS_PATTERN") +fi + ## todo use optional argument here --> default "unmapped" --> allow user to add others #allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | grep -v "1029" | sort) #allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | grep -v "Ctrl_1029" | sort) diff --git a/dge_workflow/tophat_qc.R b/dge_workflow/tophat_qc.R new file mode 100755 index 0000000000000000000000000000000000000000..55c66d43c2f34d60f5ec81c9cf82239f3b00896a --- /dev/null +++ b/dge_workflow/tophat_qc.R @@ -0,0 +1,84 @@ +#!/usr/bin/env Rscript +#+ echo=FALSE, message=FALSE + +suppressMessages(library(docopt)) + +# retrieve and parse the command-line arguments +doc <- ' +Create a small summary alignment summary files created when running tophat2. +Usage: bams_qc.R <base_directory> + +Options: +-c Peform correlation analysis +' + +#print(paste("args are:", commandArgs(TRUE))) +opts <- docopt(doc, commandArgs(TRUE)) +#opts <- docopt(doc, ". .") + +if(!exists("opts")){ stop(doc); return } +#print("doc opts are") +#print(opts) + +baseDir <- normalizePath(opts$base_directory) + +if(is.na(file.info(baseDir)$isdir)){ + stop(paste("base directory", baseDir, "does not exist")) +} + +devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R") +devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R") + + +######################################################################################################################## +#' # Mapping Summary for: `r baseDir` + +parseAlgnSummary_T2_0_11 <- function(alignSummary){ + #alignSummary="/projects/bioinfo/holger/projects/marta_rnaseq/human_leipzig/mapping/S5382_aRG_1b_rep1/align_summary.txt" + algnData <- readLines(alignSummary) + + data.frame( + condition=basename(dirname(alignSummary)), + num_reads=as.numeric(str_match(algnData[2], " ([0-9]*$)")[,2]), + mapped_reads=as.numeric(str_match(algnData[3], ":[ ]*([0-9]*) ")[,2][1]) + ) %>% transform(mapping_efficiency=100*mapped_reads/num_reads) +} + + +algnSummary <- ldply(list.files(".", "align_summary.txt", full.names=TRUE, recursive=T), parseAlgnSummary_T2_0_11) +write.delim(algnSummary, file="tophat_mapping_stats.txt") +# algnSummary <- read.delim("algnSummary.txt") +#' [Tophat Mapping Statistics](tophat_mapping_stats.txt) + +scale_fill_discrete <- function (...){ scale_color_brewer(..., type = "seq", palette="Set1", "fill", na.value = "grey50") } + + +ggplot(algnSummary, aes(condition, mapping_efficiency)) + + geom_bar(stat="identity") + + coord_flip() + + ylim(0,100) + + ggtitle("mapping efficiency") + +ggplot(algnSummary, aes(condition, num_reads)) + + geom_bar(stat="identity") + + coord_flip() + + ggtitle("read counts") +scale_y_continuous(labels=comma) + +ggplot(algnSummary, aes(condition, mapped_reads)) + + geom_bar(stat="identity") + coord_flip() + + ggtitle("alignments counts") + + scale_y_continuous(labels=comma) + + +#ggplot(melt(algnSummary), aes(condition, value)) + geom_bar(stat="identity") +facet_wrap(~variable, scales="free") + ggtitle("mapping summary") + scale_y_continuous(labels=comma) + theme(axis.text.x=element_text(angle=90, hjust=0)) +#ggsave2(w=10, h=10, p="mapstats") + +##todo multimapper counting + +######################################################################################################################## +#' ## Alignment Correlation + +#' Without using any transcriptome as reference, the genome can be binned and alignment counts per bin can be used to perform a correlation analysis. +#' Used tool [deepTools](https://github.com/fidelram/deepTools) + +### todo integrate bam correlation analysis \ No newline at end of file