Skip to content
Snippets Groups Projects
Commit 8e6e0e04 authored by Holger Brandl's avatar Holger Brandl
Browse files

cont chipseq workflow

parent 37bf0348
No related branches found
No related tags found
No related merge requests found
## enable snippet spinning
source <(curl https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/spinr/spin_utils.sh 2>&1 2>/dev/null)
## configure paths
#python setup_w_cython.py install --prefix /home/brandl/bin/macs2
export PATH=/home/brandl/bin/macs2/bin:$PATH
export PYTHONPATH=/home/brandl/bin/macs2/lib:$PYTHONPATH
export PYTHONPATH=/home/brandl/bin/macs2/lib/python2.7/site-packages:$PYTHONPATH
#/home/brandl/bin/macs2/bin/macs2 -h
cs_bowtie_qc(){
echo '
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R")
########################################################################################################################
#> # Bowtie2 Mapping Summary
#> Directory: `r getwd()`
logSuffix=".bowtie.log"
parseAlgnSummary <- function(alignSummary){
#alignSummary="/lustre/projects/bioinfo/holger/projects/khan_chipseq_h1/alignments/H1L_Shield.bowtie.log"
algnData <- readLines(alignSummary)
data.frame(
condition=trimEnd(basename(alignSummary), logSuffix),
num_reads=as.numeric(str_split_fixed(algnData[1], " ", 2)[1]),
mapping_efficiency=as.numeric(str_replace(str_split_fixed(algnData[6], " ", 2)[1], "%", "")),
unique_mappers=as.numeric(str_split_fixed(str_trim(algnData[4]), " ", 2)[1]),
unique_mapper_prop=str_match(algnData[4], "[(]([0-9.]*)[%)]*") %>% subset(select=2) %>% as.numeric(),
multi_mappers=as.numeric(str_split_fixed(str_trim(algnData[5]), " ", 2)[1]),
multi_mappers_prop=str_match(algnData[5], "[(]([0-9.]*)[%)]*") %>% subset(select=2) %>% as.numeric()
)
}
algnSummary <- list.files(".", logSuffix, full.names=TRUE, recursive=T) %>% ldply(parseAlgnSummary)
write.delim(algnSummary, file="bowtie2_qc.txt")
scale_fill_discrete <- function (...){ scale_color_brewer(..., type = "seq", palette="Set1", "fill", na.value = "grey50") }
#+ fig.height=nrow(algnSummary)
ggplot(algnSummary, aes(condition, mapping_efficiency)) +
geom_bar(stat="identity") +
coord_flip() +
ylim(0,100) +
ggtitle("mapping efficiency")
ggplot(algnSummary, aes(condition, num_reads)) +
geom_bar(stat="identity") +
coord_flip() +
ggtitle("read counts") +scale_y_continuous(labels=comma)
ggplot(algnSummary, aes(condition, unique_mapper_prop)) +
geom_bar(stat="identity") + coord_flip() +
ggtitle("unique-mapper proportions") +
scale_y_continuous(labels=comma)
ggplot(algnSummary, aes(condition, multi_mappers_prop)) +
geom_bar(stat="identity") + coord_flip() +
ggtitle("multi-mapper proportions") +
scale_y_continuous(labels=comma)
#> ## Bam Correlation
#> Using simple genome binning, we can performa a simple clustering/corrlation analysis
## http://stackoverflow.com/questions/291813/recommended-way-to-embed-pdf-in-html
#+ results="asis"
cat("<embed src=\"./bc.pdf\" width=\"900px\" height=\"700px\">")
' | spinsnip "bowtie2_qc"
}
\ No newline at end of file
......@@ -5,19 +5,17 @@ suppressMessages(library(docopt))
# retrieve and parse the command-line arguments
doc <- '
Create a small summary for bam-files in a directory
Create a small summary for bam-files
Usage: bams_qc.R <base_directory>
Options:
-c Peform correlation analysis
'
#print(paste("args are:", commandArgs(TRUE)))
opts <- docopt(doc, commandArgs(TRUE))
#opts <- docopt(doc, ". .")
#opts <- docopt(doc, ".")
if(!exists("opts")){ stop(doc); return }
#print("doc opts are")
#print(opts)
baseDir <- normalizePath(opts$base_directory)
......@@ -26,33 +24,40 @@ if(is.na(file.info(baseDir)$isdir)){
stop(paste("base directory", baseDir, "does not exist"))
}
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R")
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R")
########################################################################################################################
#' # Mapping Summary for: `r baseDir`
parseAlgnSummary_T2_0_11 <- function(alignSummary){
#alignSummary="/projects/bioinfo/holger/projects/marta_rnaseq/human_leipzig/mapping/S5382_aRG_1b_rep1/align_summary.txt"
logSuffix=".bowtie.log"
parseAlgnSummary <- function(alignSummary){
#alignSummary="/lustre/projects/bioinfo/holger/projects/khan_chipseq_h1/alignments/H1L_Shield.bowtie.log"
algnData <- readLines(alignSummary)
data.frame(
condition=basename(dirname(alignSummary)),
num_reads=as.numeric(str_match(algnData[2], " ([0-9]*$)")[,2]),
mapped_reads=as.numeric(str_match(algnData[3], ":[ ]*([0-9]*) ")[,2][1])
) %>% transform(mapping_efficiency=100*mapped_reads/num_reads)
}
condition=trimEnd(basename(alignSummary), logSuffix),
num_reads=as.numeric(str_split_fixed(algnData[1], " ", 2)[1]),
mapping_efficiency=as.numeric(str_replace(str_split_fixed(algnData[6], " ", 2)[1], "%", "")),
unique_mappers=as.numeric(str_split_fixed(str_trim(algnData[4]), " ", 2)[1]),
unique_mapper_prop=str_match(algnData[4], "[(]([0-9.]*)[%)]*") %>% subset(select=2) %>% as.numeric(),
algnSummary <- ldply(list.files(".", "align_summary.txt", full.names=TRUE, recursive=T), parseAlgnSummary_T2_0_11)
write.delim(algnSummary, file="tophat_mapping_stats.txt")
# algnSummary <- read.delim("algnSummary.txt")
#' [Tophat Mapping Statistics](tophat_mapping_stats.txt)
multi_mappers=as.numeric(str_split_fixed(str_trim(algnData[5]), " ", 2)[1]),
multi_mappers_prop=str_match(algnData[5], "[(]([0-9.]*)[%)]*") %>% subset(select=2) %>% as.numeric()
)
}
algnSummary <- list.files(baseDir, logSuffix, full.names=TRUE, recursive=T) %>% ldply(parseAlgnSummary)
write.delim(algnSummary, file="bowtie2_qc.txt")
scale_fill_discrete <- function (...){ scale_color_brewer(..., type = "seq", palette="Set1", "fill", na.value = "grey50") }
#+ fig.height=nrow(algnSummary)/3
ggplot(algnSummary, aes(condition, mapping_efficiency)) +
geom_bar(stat="identity") +
coord_flip() +
......@@ -64,21 +69,13 @@ ggplot(algnSummary, aes(condition, num_reads)) +
coord_flip() +
ggtitle("read counts") +scale_y_continuous(labels=comma)
ggplot(algnSummary, aes(condition, mapped_reads)) +
ggplot(algnSummary, aes(condition, unique_mapper_prop)) +
geom_bar(stat="identity") + coord_flip() +
ggtitle("alignments counts") +
ggtitle("unique-mapper proportions") +
scale_y_continuous(labels=comma)
ggplot(algnSummary, aes(condition, multi_mappers_prop)) +
geom_bar(stat="identity") + coord_flip() +
ggtitle("multi-mapper proportions") +
scale_y_continuous(labels=comma)
#ggplot(melt(algnSummary), aes(condition, value)) + geom_bar(stat="identity") +facet_wrap(~variable, scales="free") + ggtitle("mapping summary") + scale_y_continuous(labels=comma) + theme(axis.text.x=element_text(angle=90, hjust=0))
#ggsave2(w=10, h=10, p="mapstats")
##todo multimapper counting
########################################################################################################################
#' ## Alignment Correlation
#' Without using any transcriptome as reference, the genome can be binned and alignment counts per bin can be used to perform a correlation analysis.
#' Used tool [deepTools](https://github.com/fidelram/deepTools)
### todo integrate bam correlation analysis
\ No newline at end of file
......@@ -6,7 +6,7 @@ suppressMessages(library(docopt))
# retrieve and parse the command-line arguments
doc <- '
Create a small summary alignment summary files created when running tophat2.
Usage: bams_qc.R <base_directory>
Usage: bowtie2_qc.R <base_directory>
Options:
-c Peform correlation analysis
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment