Skip to content
Snippets Groups Projects
Commit 37bf0348 authored by Holger Brandl's avatar Holger Brandl
Browse files

started chipseq workflow

parent d6cd88e6
No related branches found
No related tags found
No related merge requests found
......@@ -6,7 +6,7 @@ suppressMessages(library(docopt))
# retrieve and parse the command-line arguments
doc <- '
Create a small summary for bam-files in a directory
Usage: bam_qc.R <base_directory>
Usage: bams_qc.R <base_directory>
Options:
-c Peform correlation analysis
......
......@@ -19,7 +19,7 @@ Options:
# Stefania: opts <- docopt(doc, "--undirected --pcutoff 0.05 ..")
# opts <- docopt(doc, "--undirected ..")
# opts <- docopt(doc, paste(Sys.getenv("DGE_PARAMS"), ".."))
## todo use commandArgs here!!
opts <- docopt(doc, paste(Sys.getenv("DGE_PARAMS"), paste(commandArgs(TRUE), collapse=" ")))
#opts
......@@ -38,11 +38,11 @@ if(is.na(file.info(cuffdb_directory)$isdir)){
}
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R")
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R")
require(cummeRbund)
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/bio/diffex_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/bio/diffex_commons.R")
#knitr::opts_knit$set(root.dir = getwd())
......@@ -348,6 +348,7 @@ sigEnrResults %>% do({
print(logPlot)
})
ggsave2()
#+ include=FALSE
#ggsave2(ggplot(sigEnrResults, aes(set)) + geom_bar() + ggtitle("enriched term frequencies")) # + facet_wrap(~site_type))
......
......@@ -162,7 +162,7 @@ ziprm tophat_logs ${project}__tophat__*.log
dge_bam_correlate .
## create tophat mapping report
spin.R $DGE_HOME/bam_qc.R .
spin.R $DGE_HOME/tophat_qc.R .
mailme "$project: tophat done in $(pwd)"
......@@ -194,7 +194,7 @@ export -f dge_bam_correlate
## Merge technical replicatews
dge_merge_treps(){
usage="Usage: dge_bam_correlate <bam_directory> <comma_sep_biosample_spec>"
usage="Usage: dge_merge_treps <bam_directory> <comma_sep_biosample_spec>"
if [ $# -ne 2 ]; then
echo $usage >&2 ; return;
......@@ -221,6 +221,7 @@ for sample in $(echo $bio_samples | tr ",", " "); do
sampleBams=$(find $bam_dir -name '*bam' | grep -v unmapped | grep $sample)
echo "merging $sample with $sampleBams"
## todo add read-groups to bam files
if [ 1 -eq $(echo "$sampleBams" | wc -l) ]; then
cp $sampleBams $sample.bam
else
......@@ -239,6 +240,26 @@ export -f dge_merge_treps
#dge_cd(){
#( ## required to work around docopt sys.exit
#usage='
#Use cuffdiff2 to performa a differntial expression analysis
#Usage: dge_cuffdiff [--exclude=<regex>] <gtfFile> <bamDir> <labels>
#
#Options:
#--exclude=<regex> Exclude patterns (grep regex matching bam file paths to be excluded)
#'
#echo "$usage" | ~/bin/docopts/docopts -h - : $*
#eval "$(echo "$usage" | ~/bin/docopts/docopts -h - : $*)"
#
#)
#}
#dge_cd $gtfFilePC $baseDir/mapped $labels
#dge_cd --help
dge_cuffdiff(){
local gtfFile=$1
......@@ -259,6 +280,12 @@ fi
## collect all bam-files and group them
allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | sort)
if [ -n "$EXCLUDE_BAMS_PATTERN" ]; then
echo "excluding bams with $EXCLUDE_BAMS_PATTERN"
allBams=$(echo "$allBams" | grep -v "$EXCLUDE_BAMS_PATTERN")
fi
## todo use optional argument here --> default "unmapped" --> allow user to add others
#allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | grep -v "1029" | sort)
#allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | grep -v "Ctrl_1029" | sort)
......
#!/usr/bin/env Rscript
#+ echo=FALSE, message=FALSE
suppressMessages(library(docopt))
# retrieve and parse the command-line arguments
doc <- '
Create a small summary alignment summary files created when running tophat2.
Usage: bams_qc.R <base_directory>
Options:
-c Peform correlation analysis
'
#print(paste("args are:", commandArgs(TRUE)))
opts <- docopt(doc, commandArgs(TRUE))
#opts <- docopt(doc, ". .")
if(!exists("opts")){ stop(doc); return }
#print("doc opts are")
#print(opts)
baseDir <- normalizePath(opts$base_directory)
if(is.na(file.info(baseDir)$isdir)){
stop(paste("base directory", baseDir, "does not exist"))
}
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R")
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R")
########################################################################################################################
#' # Mapping Summary for: `r baseDir`
parseAlgnSummary_T2_0_11 <- function(alignSummary){
#alignSummary="/projects/bioinfo/holger/projects/marta_rnaseq/human_leipzig/mapping/S5382_aRG_1b_rep1/align_summary.txt"
algnData <- readLines(alignSummary)
data.frame(
condition=basename(dirname(alignSummary)),
num_reads=as.numeric(str_match(algnData[2], " ([0-9]*$)")[,2]),
mapped_reads=as.numeric(str_match(algnData[3], ":[ ]*([0-9]*) ")[,2][1])
) %>% transform(mapping_efficiency=100*mapped_reads/num_reads)
}
algnSummary <- ldply(list.files(".", "align_summary.txt", full.names=TRUE, recursive=T), parseAlgnSummary_T2_0_11)
write.delim(algnSummary, file="tophat_mapping_stats.txt")
# algnSummary <- read.delim("algnSummary.txt")
#' [Tophat Mapping Statistics](tophat_mapping_stats.txt)
scale_fill_discrete <- function (...){ scale_color_brewer(..., type = "seq", palette="Set1", "fill", na.value = "grey50") }
ggplot(algnSummary, aes(condition, mapping_efficiency)) +
geom_bar(stat="identity") +
coord_flip() +
ylim(0,100) +
ggtitle("mapping efficiency")
ggplot(algnSummary, aes(condition, num_reads)) +
geom_bar(stat="identity") +
coord_flip() +
ggtitle("read counts") +scale_y_continuous(labels=comma)
ggplot(algnSummary, aes(condition, mapped_reads)) +
geom_bar(stat="identity") + coord_flip() +
ggtitle("alignments counts") +
scale_y_continuous(labels=comma)
#ggplot(melt(algnSummary), aes(condition, value)) + geom_bar(stat="identity") +facet_wrap(~variable, scales="free") + ggtitle("mapping summary") + scale_y_continuous(labels=comma) + theme(axis.text.x=element_text(angle=90, hjust=0))
#ggsave2(w=10, h=10, p="mapstats")
##todo multimapper counting
########################################################################################################################
#' ## Alignment Correlation
#' Without using any transcriptome as reference, the genome can be binned and alignment counts per bin can be used to perform a correlation analysis.
#' Used tool [deepTools](https://github.com/fidelram/deepTools)
### todo integrate bam correlation analysis
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment