Commit a5fd244b authored by brandl's avatar brandl

cleanup and prepared for handover

parent f26f0d50
Copyright 2017 Max Planck Institute of Molecular Cell Biology and Genetics, Dresden, Germany
Copyright 2018 Max Planck Institute of Molecular Cell Biology and Genetics, Dresden, Germany
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
......
#!/usr/bin/env Rscript
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.4/R/core_commons.R")
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.4/R/ggplot_commons.R")
## todo replace with ngs_tools/dge_workflow/diffex_commons.R
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.40/R/bio/diffex_commons.R")
require(tidyr)
require(knitr)
require.auto(digest)
#' # Region Count Analysis
## spin.R
geneInfo <- quote({
mart <- biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl"))
c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>%
biomaRt::getBM(mart=mart)
}) %>%
cache_it()
## todo use docopt here
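## a hedged, minimal sketch for the docopt todo above, guarded with if(FALSE) so it does not run yet;
## the script name, argument and option names are assumptions and not part of the current workflow
if(FALSE){
suppressMessages(require(docopt))

doc <- '
Differential binding analysis for region counts
Usage: region_count_analysis.R [options] <count_matrix_tsv>

Options:
--contrasts <file>  Optional two-column table (sample_1, sample_2) defining the contrasts to test
'
opts <- docopt(doc, commandArgs(TRUE))

countData <- read.delim(opts[["<count_matrix_tsv>"]])
contrasts_file <- opts[["--contrasts"]]       ## would feed into hasContrastsFile below
hasContrastsFile <- !is.null(contrasts_file)
}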
countData <- read.delim("replicate_counts.tss_2kb.txt")
names(countData) <- names(countData) %>% str_replace("[.]1", "")
countMatrix <- countData %>% column2rownames("ensembl_gene_id") %>% as.matrix()
########################################################################################################################
#' # Differential binding analysis
hasContrastsFile=F
if(hasContrastsFile){
contrasts <- read.csv(contrasts_file, header=F) %>% set_names(c("sample_1", "sample_2"))
}else{
## enumerate all pairwise contrasts between the count matrix columns
contrasts <- data.frame(sample=colnames(countMatrix)) %>%
merge(.,., suffixes=c("_1", "_2"), by=NULL) %>%
filter(ac(sample_1)>ac(sample_2)) %>%
# filter(ac(sample_1)!=ac(sample_2)) %>%
fac2char
write.delim(contrasts, file="contrasts.txt")
}
#+ results='asis'
#' The contrasts used for the differential binding tests are
contrasts %>% kable()
# See the DESeq2 [docs](http://master.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf) for details
require.auto(DESeq2)
require.auto(gplots)
## todo make sure that control comes first to get fold-changes right
#Note: In order to benefit from the default settings of the package, you should put the variable of interest
#at the end of the formula and make sure the control level is the first level.
colData <- data.frame(condition=colnames(countMatrix))
dds <- DESeqDataSetFromMatrix(countMatrix, colData, formula(~ condition))
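## hedged sketch for the control-level note above, guarded with if(FALSE) so it does not run here;
## the control condition name is an assumption and must match one of the count matrix columns
if(FALSE){
controlCondition <- "input"   ## hypothetical name of the control/input sample
colData$condition <- relevel(factor(colData$condition), ref=controlCondition)
dds <- DESeqDataSetFromMatrix(countMatrix, colData, formula(~ condition))
}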
#dds <- DESeq(dds)
dds <- DESeq(dds, fitType='local')
res <- results(dds)
resultsNames(dds)
summary(res)
deResults <- adply(contrasts, 1, splat(function(sample_1, sample_2){
# browser()
results(dds, contrast=c("condition",sample_1,sample_2)) %>%
rownames2column("ensembl_gene_id") %>%
as.data.frame() %>%
mutate(
sample_1=sample_1,
sample_2=sample_2,
sample_1_overex=log2FoldChange<0
)
}), .progress="text")
deResults %>% with(as.data.frame(table(sample_1, sample_2)))
#+ fig.width=20, fig.height=18
deResults %>% ggplot(aes(log2FoldChange)) +
geom_histogram(binwidth=0.1) +
facet_grid(sample_1 ~ sample_2) + geom_vline(xintercept=0, color="blue") +
xlim(-4,4)
deResults %>% ggplot(aes(pvalue)) +
geom_histogram() +
facet_grid(sample_1 ~ sample_2) + geom_vline(xintercept=0.01, color="blue")
### see https://www.biostars.org/p/80448/ for Pairwise Comparison
#res <- results(dds, contrast=c(c("condition","H1M_Dome","H1M_Shield")))
#res <- results(dds, contrast=c(c("condition","H3K4_Shield","H1M_Shield")))
#res <- results(dds, contrast=c(c("condition","H1M_Shield", "H3K4_Shield")))
deResults %<>% mutate(is_hit=pvalue<0.01)
write.delim(deResults, file="deResults.txt")
# deResults <- read.delim("deResults.txt")
#' [deResults](deResults.txt)
degs <- deResults %>% filter(is_hit)
ggplot(degs, aes(paste(sample_1, "vs", sample_2))) + geom_bar() + xlab(NULL) + ylab("# DBGs") + ggtitle("DBG count summary") + coord_flip()
ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=sample_1_overex)) + geom_bar(position="dodge") + xlab(NULL) + ylab("# DBGs") + ggtitle("DBG count summary by direction of expression") + coord_flip()
## just needed to restore environment easily
save(degs, file=".degs.RData")
# degs <- local(get(load(".degs.RData")))
#res %>% as.df() %>% ggplot(aes(pvalue)) + geom_histogram()
#res %>% as.df() %>% ggplot(aes(padj)) + geom_histogram()
## todo add gene info
degs %>% inner_join(geneInfo) %>% write.delim(file="degs.txt")
# degs <- read.delim("degs.txt")
#' [degs](degs.txt)
plotMA(res, main="DESeq2", ylim=c(-2,2))
## note see ?DESeq section on 'Experiments without replicates'
## how to specify which conditions to run the test on
## http://seqanswers.com/forums/showthread.php?t=28350
#For making comparisons of multiple conditions (not only against the base level of a condition), we have recently implemented contrasts in the development branch. This allows one to fit a single model, then generate log2 fold change estimates, standard errors and tests of null hypotheses for other comparisons.
#+ eval=FALSE, echo=FALSE, include=F
## DEBUG start
if(F){
## DEBUG end
## base means (see http://seqanswers.com/forums/showthread.php?t=28350&page=2)
baseMeanPerLvl <- sapply(levels(colData(dds)$condition), function(lvl) rowMeans(counts(dds,normalized=TRUE)[,colData(dds)$condition == lvl]))
counts(dds,normalized=TRUE) %>% as.df() %>% set_names(colData(dds)$condition) %>%
rownames2column("ensembl_gene_id") %>%
filter(ensembl_gene_id=="ENSDARG00000098036")
countDataNorm %>% filter(ensembl_gene_id=="ENSDARG00000000324", sample %in% c("H3HA_Oblong", "H3HA_Dome"), feature_type==regionTypeDBA)
#baseMean log2FoldChange lfcSE stat pvalue padj ensembl_gene_id sample_1 sample_2 sample_1_overex
#19 12.529275 0.95396312 0.6345064 1.5034728 0.13271717 0.8131601 ENSDARG00000000324 H3HA_Oblong H3HA_Dome FALSE
}
########################################################################################################################
#' ## Term enrichment
#+ echo=FALSE
#' This analysis was performed using [DAVID](http://david.abcc.ncifcrf.gov/). The following ontologies were tested: `r paste(ontologies, collapse=', ')`
geneLists <- degs %>%
# transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2, "ovex", sample_1_overex, sep="_"))
transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2))
split_hit_list <- F
grpdDegs <- if(split_hit_list){
degs %>% group_by(sample_1, sample_2, sample_1_overex)
}else{
degs %>% group_by(sample_1, sample_2)
}
enrResults <- grpdDegs %>% do(davidAnnotationChart(.$ensembl_gene_id))
write.delim(enrResults, file="enrResults.txt")
# enrResults <- read.delim("enrResults.txt")
#' [Enrichment Results](enrResults.txt)
sigEnrResults <- subset(enrResults, Bonferroni <0.01)
write.delim(sigEnrResults, file="sigEnrResults.txt")
# sigEnrResults <- read.delim("sigEnrResults.txt")
#' [Very Significant Terms](sigEnrResults.txt)
## plot the enrichment results
#sigEnrResults %>% group_by(Category, add=T) %>% do({
# logPlot <- . %>% ggplot(aes(Term, PValue)) +
# geom_bar(stat="identity")+coord_flip() +
# xlab("Enriched Terms") +
# ggtitle(.$Category[1]) +
# scale_y_log10()
# print(logPlot)
#})
sigEnrResults %>% do({
enrResultsGrp <- .
## DEBUG enrResultsGrp <- sigEnrResults
label <-
dplyr::select(enrResultsGrp, matches(ifelse(split_hit_list, "sample_1|sample_2|sample_1_overex", "sample_1|sample_2"))) %>%
head(1) %>% apply(1, paste, collapse="_vs_")
echo("processing", label)
logPlot <- enrResultsGrp %>%
## fix factor order
mutate(Term=reorder(Term, -PValue) %>% reorder(as.integer(Category))) %>%
ggplot(aes(Term, PValue, fill=Category)) +
geom_bar(stat="identity")+
coord_flip() +
xlab("Enriched Terms") +
ggtitle(label) +
scale_y_log10()
ggsave(paste0(label, " enriched terms.pdf"))
print(logPlot)
})
#ggsave2()
########################################################################################################################
### Chromosome Sorting
## By default the chromosome ordering from samtools faidx is not ideal, but we can re-sort it, and samtools sort will respect this sorting order
## To fix existing alignments either use ReorderSam or reheader | resort with samtools
## check for a consistent sorting order
#samtools view /projects/bioinfo/holger/projects/krause_chipseq/alignments_mmfilt/H3HA_Sphere_mmf.bam | cut -f 3 | uniq
#sort -k1,1g -k2,2n $regionModel.bed | cut -f1 | uniq
### --> use reorder sam http://sourceforge.net/p/samtools/mailman/samtools-help/thread/4DB6F0CD.6050403@umdnj.edu/
sort -k1V $bowtieIndex.fa.dict_tmp > $bowtieIndex.dict
samtools view -h $bamFile | removeMultiMappers > ${bamBaseName}_mmf.tmp.bam
samtools index ${bamBaseName}_mmf.tmp.bam # because ReorderSam runs substantially faster if the input is an indexed BAM file.
# Use Picard ReorderSam for re-sorting --> this will sort according to the chromosome order in the reference fasta only
picard ReorderSam I=${bamBaseName}_mmf.tmp.bam O=${bamBaseName}_mmf.bam REFERENCE=$bowtieIndex.fa
## samtools view -h $bamFile | removeMultiMappers | picard ReorderSam I=/dev/stdin O=${bamBaseName}_mmf.bam REFERENCE=$bowtieIndex.fa
samtools index ${bamBaseName}_mmf.bam
## extract chromosome from bam file
samtools view -bo test.bam /home/brandl/mnt/chip-seq_study/ChIPSeq_February_2014/alignments_trimmed_nomulti_pooled/H2Az.bam 1
########################################################################################################################
## macs2 playground
## incomplete scratch notes: peak-calling name and effective genome size still to be filled in
#macs2 callpeak -n <name> --broad --gsize <genome size>
#macs2 callpeak -n <name> --gsize <genome size>
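## hedged sketch of a complete callpeak invocation; the treatment/control bam variables and the
## effective genome size are assumptions, not values taken from this project
macs2 callpeak -t $bamFile -c $ctrlBam -n $(basename $bamFile .bam) --broad --gsize 1.4e9 --outdir macs2_broad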
########################################################################################################################
### deeptools
## https://github.com/fidelram/deepTools/wiki/All-command-line-options#bamCorrelate
#bamCorrelate bins --bamfiles $bamFiles --region 10:1:100000 --plotFile="bam_correlation.png" --numberOfProcessors=4 --corMethod spearman
## see how well bam files correlate using untrimmed data
bamCorrelate bins --bamfiles $bamFiles --plotFile="bam_correlation_untrimmed.png" --numberOfProcessors=4 --corMethod spearman
########################################################################################################################
### bam-index all files in directory
#for bamFile in $(ls $baseDir/alignments_trimmed_nomulti_pooled/*.bam); do (samtools index $bamFile ) & done
########################################################################################################################
### Peak calling with SPP
Rscript /home/brandl/bin/phantompeakqualtools/run_spp.R -c="../alignments_untrimmed/H2Az_Rep1_Lane2_Lib4454.0001.bam" -savp -out="test.txt"
########################################################################################################################
### other qc
## http://www.nature.com/nmeth/journal/v11/n1/full/nmeth.2786.html
# Teytelman et al. discovered that highly expressed loci were always enriched in ChIP peaks, regardless of which protein was pulled down
#########################################################################################################################
#### CHANCE
mcdir $baseDir/qc_chance
## convert example to sam format
bamToBed -i $baseDir/alignments_trimmed/H2Az_Rep1_Lane1_Lib4454_ca.bam > H2Az_Rep1_Lane1_Lib4454_ca.bed
samtools view -h -o H2Az_Rep1_Lane1_Lib4454_ca.sam $baseDir/alignments_trimmed/H2Az_Rep1_Lane1_Lib4454_ca.bam
/local/home/henry/bin/CHANCE/run_chance_linux.sh /usr/local/MATLAB/MATLAB_Compiler_Runtime/v717/
########################################################################################################################
## Use phantompeakqualtools to calculate quality tag (strand cross-correlation of peak)
# did the antibody-treatment enrich sufficiently so that the ChIP signal can be separated from the background signal?
## --> use https://code.google.com/p/phantompeakqualtools/
mcdir $baseDir/qc_phantompeaks
#Rscript /local/home/brandl/bin/phantompeakqualtools/run_spp.R
#Rscript run_spp.R <bamfile> -savp -out=<outfile>
ctrlBam=$baseDir/alignments_untrimmed/H3-3_Rep1_Lane1_Lib4453.bam
#DEBUG ctrlBam=$baseDir/alignments_untrimmed/H3K4me3_Rep2_Lane2_Lib4455.bam
Rscript /projects/bioinfo/holger/bin/phantompeakqualtools/run_spp.R
for bamFile in $baseDir/alignments_untrimmed/*bam ; do
echo "processing $bamFile..."
# Rscript /home/brandl/bin/phantompeakqualtools/run_spp.R -c=$bamFile -savp -out=$(basename $bamFile).pt.log
## one output file is enough because the output is tabular and includes the input file name
# Rscript /home/brandl/bin/phantompeakqualtools/run_spp.R -c=$bamFile -i=$ctrlBam -savp -savd -odir=$(pwd) -out=phantom_qc.txt
## with control
# ( Rscript /home/brandl/bin/phantompeakqualtools/run_spp.R -c=$bamFile -i=$ctrlBam -savp -savd -odir=$(pwd) -out=phantom_qc.txt &> $(basename $bamFile).pt.log ) &
## without control
# ( Rscript /home/brandl/bin/phantompeakqualtools/run_spp.R -c=$bamFile -savp -savd -odir=$(pwd) -out=phantom_qc_noctrl.txt &> $(basename $bamFile).pt.log ) &
done
wait
## todo visualize results
#phantom output format
#format: Filename<tab>numReads<tab>estFragLen<tab>corr_estFragLen<tab>PhantomPeak<tab>corr_phantomPeak<tab>argmin_corr<tab>min_corr<tab>Normalized SCC (NSC)<tab>Relative SCC (RSC)<tab>QualityTag
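## hedged sketch for the visualization todo above: tabulate sample name, NSC and RSC from the
## phantompeak output files (column positions follow the format line above)
cut -f1,9,10 phantom_qc*.txt | column -t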
mailme "phantom qc done"
########################################################################################################################
### sorted counting
#
## perform the counts analysis (see https://www.biostars.org/p/13980/)
## presort the bed files for faster counting
for bedFile in $(ls *bed | grep -v sortedbam | grep -v sorted | grep -v pooled); do
sort -k1,1 -k2,2n $bedFile > ${bedFile%%.bed}.sorted.bed
done
ll $bamFiles
## convert the bams to sorted bed for fast counting
for bamFile in $bamFiles; do
# DEBUG bamFile=/projects/bioinfo/holger/projects/khan_chipsseq_h1/alignments_mmfilt/H1M_Dome_mmf.bam
bamBaseName=$(basename $(basename $bamFile .bam) _mmf)
mysub "${project}__ib__${bamBaseName}" " bamToBed -i $bamFile | sort -k1,1 -k2,2n > ${bamBaseName}.sortedbam.bed" | joblist .bam2bed
done
wait4jobs .bam2bed
for bamBedFile in $(ls *sortedbam.bed); do
for sortedBed in $(ls *sorted.bed); do
# for sortedBed in zv9_ens79_promoters1kb.sorted.bed; do
# for sortedBed in zv9_ens79_firstex.sorted.bed zv9_ens79_lastex.sorted.bed; do
# DEBUG bamBedFile=H3K4_Shield.sortedbam.bed
# DEBUG sortedBed=zv9_ens79_genebodies.sorted.bed
(
bamBaseName=$(basename $bamBedFile .sortedbam.bed)
bedBaseName=$(basename $sortedBed .sorted.bed | sed 's/zv9_ens79_//g')
## note bam-mode not yet supported for sorted
# mysub "${project}__ib__${bamBaseName}" "
intersectBed -c -sorted -a $sortedBed -b $bamBedFile > ${bamBaseName}.${bedBaseName}.region_counts.txt
# " | joblist .counts
# bamToBed -i $bamFile | sort -k1,1 -k2,2n | intersectBed -c -sorted -a $sortedBed -b stdin > ${bamBaseName}.${bedBaseName}.region_counts.txt
# bamToBed -i $bamFile | sort -k1,1 -k2,2n > ${bamBaseName}.sorted.bed
# intersectBed -c -sorted -a $regionModel.sorted.bed -b ${bamBaseName}.sorted.bed > ${bamBaseName}.region_counts.txt
)&
done
done
wait
#wait4jobs .counts
mailme "${project} counts analysis done"
########################################################################################################################
### motifs
## run meme-chip
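## hedged sketch: extract peak sequences and run meme-chip on them; the peak bed file and output
## directory are placeholders, only $bowtieIndex.fa is reused from the alignment section above
bedtools getfasta -fi $bowtieIndex.fa -bed peaks.bed -fo peaks.fa
meme-chip -oc meme_chip_out peaks.fa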
########################################################################################################################
### profiles
## try deepTools
## differential binding diffbind
diffReps
"Negative Binomial tests implemented in diffReps (Shen et al. 2013) were used to detect differential histone
modification regions using a sliding window of 600bp (H3K4me3) or 5000bp (H3K27me3), p-value cutoff 1e-6,
sharp peaks mode for H3K4me3 (--nsb 20) and broad peak mode for H3K27me3 (--nsb 2), hg19 as reference
genome, and an average fragment size of 250bp (rest of parameters default). Differential histone modifications
regions not overlapping (at least 1bp) significant chromatin marks previously detected during peak calling at
least in one of conditions under comparison were removed. Regions were ranked by their fold-change (FC), and
reported as differentially enriched only if the absolute FC≥1.5, and Benjamini-Hochberg corrected p-value≤1e-3.
As described on the figures, figure legends and results, for Fig. S6C and S6D only the thresholds for
statistical significance were relaxed as it follows: binomial test p-value cutoff 1e-4 (diffReps standard); absolute
FC≥1.25, and Benjamini-Hochberg corrected p-value≤1e-2 (FDR 1%)."
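## hedged sketch of a diffReps call roughly matching the quoted H3K4me3 settings; the bed file
## names are placeholders and the genome name comes from the quote, so both need to be adapted,
## and the sharp/broad preset flag is omitted here (check the diffReps docs for its exact name)
diffReps.pl --treatment H3K4me3_cond1.bed --control H3K4me3_cond2.bed --report diffreps_H3K4me3.txt \
    --gname hg19 --meth nb --window 600 --frag 250 --pval 1e-6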
......@@ -4,6 +4,8 @@
suppressMessages(require(docopt))
# todo refactor to use tsv input along with a user-provided grouping attribute
doc <- '
Perform an enrichment analysis for a set of genes
Usage: david_enrichment.R [options] <grouped_gene_lists_rdata>
......@@ -20,6 +22,10 @@ devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.12/R/core_
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.12/R/ggplot_commons.R")
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.12/R/bio/diffex_commons.R")
source(file.path(Sys.getenv("NGS_TOOLS"), "common/david_enrichment_util.R"))
## todo use tagged version instead
# devtools::source_url("https://git.mpi-cbg.de/bioinfo/ngs_tools/raw/v12/common/david_enrichment_util.RR")
require.auto(knitr)
## to fix child-document support issue with knitr, see also
......
......@@ -6,6 +6,7 @@
## http://www.bioconductor.org/packages/release/bioc/vignettes/RDAVIDWebService/inst/doc/RDavidWS-vignette.pdf
## e.g. getClusterReport --> plot2D
## complete list can be fetched using getAllAnnotationCategoryNames(david)
DEF_DAVID_ONTOLOGIES <- ontologies <- c("GOTERM_CC_FAT", "GOTERM_MF_FAT", "GOTERM_BP_FAT", "PANTHER_PATHWAY", "PANTHER_FAMILY", "KEGG_PATHWAY", "REACTOME_PATHWAY")
davidAnnotationChart <- function( someGenes, ontologies=DEF_DAVID_ONTOLOGIES ){
......@@ -18,10 +19,12 @@ davidAnnotationChart <- function( someGenes, ontologies=DEF_DAVID_ONTOLOGIES ){
if(length(someGenes)>1500){
# stop("it's up to you to reduce your input to 1500 genes")
warning("trimming david query list to 1500 entries")
someGenes <- sample(someGenes) %>% head(1500)
}
david<-DAVIDWebService$new(email="brandl@mpi-cbg.de")
david<-DAVIDWebService$new(email="bioinformatics@mpi-cbg.de")
# ## list all ontologies
# getAllAnnotationCategoryNames(david)
......
......@@ -8,11 +8,6 @@ Links & Todo
Other enrichment tools
* http://amp.pharm.mssm.edu/Enrichr/
* http://amp.pharm.mssm.edu/Harmonizome/
Other interesting tools to perform enrichment testing
-----------------------------------------------------
* [piano](http://www.bioconductor.org/packages/release/bioc/html/piano.html)
......
#!/usr/bin/env Rscript
#' # Quality Control Summary for `r getwd()`
##+ echo=FALSE, message=FALSE
## Note This script is supposed to be knitr::spin'ed
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.46/R/core_commons.R")
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.46/R/ggplot_commons.R")
load_pack(knitr)
## can we access variables from the parent spin.R process?
#echo("rscript is ", r_script)
argv = commandArgs(TRUE)
#echo("argv is ", argv)
#if(str_detect(argv[1], "fastqc_summary")) argv <- argv[-1]
if(length(argv) != 1){
stop("Usage: cutadapt_summary.R <directory with cutadapt log files>")
}
baseDir=argv[1]
#baseDir="."
if(is.na(file.info(baseDir)$isdir)){
stop(paste("directory '", baseDir,"'does not exist"))
}
# baseDir="/home/mel/MPI-Bioinf/Project1_reads/141126_cutadapt_logs"
logDataFiles <- list.files(path=file.path(baseDir, ".logs"), pattern="__ca__.*.out.log", full.names=TRUE, recursive=T)
#echo("files are", logDataFiles)
#' ## General information
info1=readLines(pipe( paste("grep -F 'cutadapt' ", logDataFiles[1]) ))
echo(info1)
info2=readLines(pipe( paste("grep -F 'Maximum error rate' ", logDataFiles[1]) ))
echo(info2)
info3=readLines(pipe( paste("grep -F 'No. of adapters' ", logDataFiles[1]) ))
echo(info3)
#' ## Cutadapt Parameters:
parameters=readLines(pipe( paste("grep -F 'Command line parameters' ", logDataFiles[1]) ))
# #' `r parameters`
#' #### Some explanations:
if (grepl("-a", parameters) ==TRUE)
echo("-a indicates that the following is a 3' end adapter.")
if (grepl("-g", parameters) ==TRUE)
echo("-g indicates that the following is a 5' end adapter.")
if (grepl("-b", parameters) ==TRUE)
echo("-b: indicates that the adapter is located at the 3' or 5' end (both possible).")
if (grepl("-m", parameters) ==TRUE)
echo("Reads shorter than -m bases are thrown away.")
if (grepl("-q:", parameters) ==TRUE)
echo("Quality trimming is done with a threshold specified after -q.")
if (grepl("-p:", parameters) ==TRUE)
echo("option 'paired output' is used.")
if (grepl("-e:", parameters) ==TRUE)
echo("-e changes the error tolerance. (The default maximum error rate is 0.1)")
if (grepl("-O:", parameters) ==TRUE)
echo("The minimum overlap length is changed using -O.")
if (grepl("-N", parameters) ==TRUE)
echo("Wildcard characters in the adapter are enabled by -N.")
#' #### For more detailed information on cutadapt go to https://cutadapt.readthedocs.org/en/latest/index.html
tidySamples <- function(sample) str_replace(sample, ".*__ca__", "")
#' ## Trimming Overview
readSummaryStats <- function(logFile){
data.frame(
Run=sub("^([^.]*).*", "\\1", basename(logFile)) %>% tidySamples,
No_of_processed_Reads=(paste("grep -F 'Processed reads' ", logFile ) %>% pipe() %>% readLines() %>% strsplit( "[^0-9]+") %>% unlist() %>% as.numeric())[2],
No_of_processed_Bases=(paste("grep -F 'Processed bases' ", logFile ) %>% pipe() %>% readLines() %>% strsplit( "[^0-9]+") %>% unlist() %>% as.numeric())[2],
Trimmed_Reads=(paste("grep -F 'Trimmed reads' ", logFile ) %>% pipe() %>% readLines() %>% strsplit( "[^0-9\\.]+") %>% unlist() %>% as.numeric())[3],
Quality_trimmed=(paste("grep -F 'Quality-trimmed' ", logFile ) %>% pipe() %>% readLines() %>% strsplit( "[^0-9\\.]+") %>% unlist() %>% as.numeric())[4],
Trimmed_Bases=(paste("grep -F 'Trimmed bases' ", logFile ) %>% pipe() %>% readLines() %>% strsplit( "[^0-9\\.]+") %>% unlist() %>% as.numeric())[4],
Too_short_Reads=(paste("grep -F 'Too short reads' ", logFile ) %>% pipe() %>% readLines() %>% strsplit( "[^0-9\\.]+") %>% unlist() %>% as.numeric())[3]
)
}
trimmingStats <- logDataFiles %>% map_df(readSummaryStats)
#+ results = 'asis'
trimmingStats %>% head() %>% kable()
write_tsv(trimmingStats, file="cutadapt_summary.trimmingStats.txt")
#' [Trimming Statistics](cutadapt_summary.trimmingStats.txt)
## rescale the trimming statistics so that they can be plotted with percent axis labels below
myfun <- function(x) x/100
trimmingStats <- mutate_each(trimmingStats, funs(myfun), Trimmed_Reads, Quality_trimmed, Trimmed_Bases, Too_short_Reads)
#+ fig.height=nrow(trimmingStats)/3, fig.width=12
require(scales)
ggplot(trimmingStats, aes(Run, No_of_processed_Reads)) + geom_bar(stat='identity') + coord_flip() + scale_y_continuous(labels=comma)
#ggplot(trimmingStats, aes(Run, No_of_processed_Bases)) + geom_bar(stat='identity') + coord_flip() + scale_y_continuous(labels=comma)
ggplot(trimmingStats, aes(Run, Trimmed_Reads)) +
geom_bar(stat='identity') +
coord_flip() +
scale_y_continuous(labels=percent) +
## other layer with quality trimmed ones
geom_bar(aes(Run, Quality_trimmed), stat='identity', color='red', alpha=0.4) +
ylab("Trimmed reads proportion (quality-related colored in in red)")
#ggplot(trimmingStats, aes(Run, Quality_trimmed)) + geom_bar(stat='identity') + coord_flip() + scale_y_continuous(labels=percent)
#ggplot(trimmingStats, aes(Run, Trimmed_Bases)) + geom_bar(stat='identity') + coord_flip() + scale_y_continuous(labels=percent)
ggplot(trimmingStats, aes(Run, Too_short_Reads)) +
geom_bar(stat='identity') +
coord_flip() +
scale_y_continuous(labels=percent) +
ggtitle("Discarded Reads Proportion")
#' ## Adapter trimming statistics
readAdapterStats <- function(logFile){
#browser()
data.frame(
Run=sub("^([^.]*).*", "\\1", basename(logFile)) %>% tidySamples,
Adapter=(paste("grep -F '=== Adapter ' ", logFile ) %>% pipe() %>% readLines() %>% str_split_fixed("'", 3))[,2],
Trimmed=(paste("grep -F '; Trimmed: ' ", logFile ) %>% pipe() %>% readLines() %>% str_split_fixed( "[^0-9]+", 6))[,5] %>% as.numeric(),
Overlapped_at_5prime=(paste("grep -F 'overlapped the 5' ", logFile ) %>% pipe() %>% readLines() %>% str_split_fixed( "[^0-9]+", 2) )[,1] %>% as.numeric(),
Overlapped_at_3prime=(paste("grep -F 'overlapped the 3' ", logFile ) %>% pipe() %>% readLines() %>% str_split_fixed( "[^0-9]+", 2) )[,1] %>% as.numeric()
)
}
adapterTrimmingStats <- logDataFiles %>% map_df(readAdapterStats)
#+ results = 'asis'
adapterTrimmingStats %>% head() %>% kable()
#+
write_tsv(adapterTrimmingStats, file="cutadapt_summary.adapterTrimmingStats.txt")
#' [Adapter Statistics](cutadapt_summary.adapterTrimmingStats.txt)
#with(adapterTrimmingStats, Trimmed==Overlapped_at_3prime+Overlapped_at_5prime)
#ggplot(adapterTrimmingStats, aes(Run, Trimmed)) + geom_bar(stat='identity') + facet_wrap(~Adapter) + coord_flip() + scale_y_continuous(labels=comma)
#ggplot(adapterTrimmingStats, aes(Run, Overlapped_at_5prime)) + geom_bar(stat='identity') + facet_wrap(~Adapter) + coord_flip() + scale_y_continuous(labels=comma)
#ggplot(adapterTrimmingStats, aes(Run, Overlapped_at_3prime)) + geom_bar(stat='identity') + facet_wrap(~Adapter) + coord_flip() + scale_y_continuous(labels=comma)
#+ fig.height=nrow(trimmingStats), fig.width=12
adapterTrimmingStats %>% select(-Trimmed) %>% melt() %>%
mutate(overlap_at=str_replace(variable, "Overlapped_at_", "")) %>%
ggplot(aes(Run, value, fill=overlap_at)) +
geom_bar(stat='identity') +
facet_wrap(~Adapter) +
coord_flip() +
scale_y_continuous(labels=comma) +
ylab("# num reads") +
ggtitle("Adapter trimming by overlap")
#!/usr/bin/env Rscript
#' # Quality Control Summary for `r getwd()`
##+ echo=FALSE, message=FALSE
## Note This script is supposed to be knitr::spin'ed
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.46/R/core_commons.R")
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.46/R/ggplot_commons.R")
load_pack(knitr)
## can we access variables from the parent spin.R process?
#echo("rscript is ", r_script)
argv = commandArgs(TRUE)
#echo("argv is ", argv)
#if(str_detect(argv[1], "fastqc_summary")) argv <- argv[-1]
if(length(argv) != 1){
stop("Usage: cutadapt_summary.R <directory with cutadapt log files>")
}
baseDir=argv[1]
#baseDir="."
if(is.na(file.info(baseDir)$isdir)){
stop(paste("directory '", baseDir,"'does not exist"))
}
# baseDir="/home/mel/MPI-Bioinf/Project1_reads/141126_cutadapt_logs"
logDataFiles <- list.files(path=file.path(baseDir, ".logs"), pattern="__ca__.*.out.log", full.names=TRUE, recursive=T)
#echo("files are", logDataFiles)
#' ## General information
info1=readLines(pipe( paste("grep -F 'cutadapt' ", logDataFiles[1]) ))
echo(info1)
# info2=readLines(pipe( paste("grep -F 'Maximum error rate' ", logDataFiles[1]) ))
# echo(info2)
info3=readLines(pipe( paste("grep -F 'adapters with' ", logDataFiles[1]) ))
echo(info3)
#' ## Cutadapt Parameters:
parameters=readLines(pipe( paste("grep -F 'Command line parameters' ", logDataFiles[1]) ))
# #' cutadapt parameters were `r parameters`
#' #### Some explanations:
if (grepl("-a", parameters) ==TRUE)
echo("-a indicates that the following is a 3' end adapter.")
if (grepl("-g", parameters) ==TRUE)
echo("-g indicates that the following is a 5' end adapter.")
if (grepl("-b", parameters) ==TRUE)
echo("-b: indicates that the adapter is located at the 3' or 5' end (both possible).")
if (grepl("-m", parameters) ==TRUE)
echo("Reads shorter than -m bases are thrown away.")
if (grepl("-q:", parameters) ==TRUE)
echo("Quality trimming is done with a threshold specified after -q.")
if (grepl("-p:", parameters) ==TRUE)
echo("option 'paired output' is used.")
if (grepl("-e:", parameters) ==TRUE)
echo("-e changes the error tolerance. (The default maximum error rate is 0.1)")
if (grepl("-O:", parameters) ==TRUE)