Commit 3b6bc737 authored by Holger Brandl's avatar Holger Brandl

integrated cutadapt summary report

parent 9dac4f98
......@@ -31,7 +31,7 @@ if(is.na(file.info(baseDir)$isdir)){
# baseDir="/home/mel/MPI-Bioinf/Project1_reads/141126_cutadapt_logs"
logDataFiles <- list.files(path=baseDir, pattern="ca.fastq.gz.ca.log", full.names=TRUE, recursive=T)
logDataFiles <- list.files(path=baseDir, pattern="__ca__.*.out.log", full.names=TRUE, recursive=T)
#echo("files are", logDataFiles)
......@@ -45,7 +45,8 @@ echo(info3)
#' ## Cutadapt Parameters:
parameters=readLines(pipe( paste("grep -F 'Command line parameters' ", logDataFiles[1]) ))
echo(parameters)
# #' `r parameters`
#' #### Some explanations:
if (grepl("-a", parameters) ==TRUE)
......@@ -53,16 +54,16 @@ if (grepl("-a", parameters) ==TRUE)
if (grepl("-g", parameters) ==TRUE)
echo("-g indicates that the following is a 5' end adapter.")
if (grepl("-b", parameters) ==TRUE)
echo("-b indicates that the adapter is located at the 3' or 5' end (both possible).")
echo("-b: indicates that the adapter is located at the 3' or 5' end (both possible).")
if (grepl("-m", parameters) ==TRUE)
echo("Reads shorter than -m bases are thrown away.")
if (grepl("-q", parameters) ==TRUE)
if (grepl("-q:", parameters) ==TRUE)
echo("Quality trimming is done with a threshold specified after -q.")
if (grepl("-p", parameters) ==TRUE)
if (grepl("-p:", parameters) ==TRUE)
echo("option 'paired output' is used.")
if (grepl("-e", parameters) ==TRUE)
if (grepl("-e:", parameters) ==TRUE)
echo("-e changes the error tolerance. (The default maximum error rate is 0.1)")
if (grepl("-O", parameters) ==TRUE)
if (grepl("-O:", parameters) ==TRUE)
echo("The minimum overlap length is changed using -O.")
if (grepl("-N", parameters) ==TRUE)
echo("Wildcard characters in the adapter are enabled by -N.")
......@@ -70,10 +71,12 @@ if (grepl("-N", parameters) ==TRUE)
#' #### For more detailed information on cutadapt go to https://cutadapt.readthedocs.org/en/latest/index.html
tidySamples <- function(sample) str_replace(sample, ".*__ca__", "")
#' ## Trimming Overview
genTable1 <- function(logFile){
readSummaryStats <- function(logFile){
data.frame(
Run=sub("^([^.]*).*", "\\1", basename(logFile)),
Run=sub("^([^.]*).*", "\\1", basename(logFile)) %>% tidySamples,
No_of_processed_Reads=(paste("grep -F 'Processed reads' ", logFile ) %>% pipe() %>% readLines() %>% strsplit( "[^0-9]+") %>% unlist() %>% as.numeric())[2],
No_of_processed_Bases=(paste("grep -F 'Processed bases' ", logFile ) %>% pipe() %>% readLines() %>% strsplit( "[^0-9]+") %>% unlist() %>% as.numeric())[2],
Trimmed_Reads=(paste("grep -F 'Trimmed reads' ", logFile ) %>% pipe() %>% readLines() %>% strsplit( "[^0-9\\.]+") %>% unlist() %>% as.numeric())[3],
......@@ -82,7 +85,7 @@ genTable1 <- function(logFile){
Too_short_Reads=(paste("grep -F 'Too short reads' ", logFile ) %>% pipe() %>% readLines() %>% strsplit( "[^0-9\\.]+") %>% unlist() %>% as.numeric())[3]
)
}
trimmingStats <- logDataFiles %>% ldply(genTable1)
trimmingStats <- logDataFiles %>% ldply(readSummaryStats)
#+ results = 'asis'
trimmingStats %>% head() %>% kable()
......@@ -90,18 +93,46 @@ trimmingStats %>% head() %>% kable()
write.delim(trimmingStats, file="cutadapt_summary.trimmingStats.txt")
#' [Trimming Statistics](cutadapt_summary.trimmingStats.txt)
myfun <- function(x) x/100
trimmingStats <- mutate_each(trimmingStats, funs(myfun), Trimmed_Reads, Quality_trimmed, Trimmed_Bases, Too_short_Reads)
#+ fig.height=nrow(trimmingStats)/3, fig.width=12
require(scales)
ggplot(trimmingStats, aes(Run, No_of_processed_Reads)) + geom_bar(stat='identity') + coord_flip() + scale_y_continuous(labels=comma)
#ggplot(trimmingStats, aes(Run, No_of_processed_Bases)) + geom_bar(stat='identity') + coord_flip() + scale_y_continuous(labels=comma)
ggplot(trimmingStats, aes(Run, Trimmed_Reads)) +
geom_bar(stat='identity') +
coord_flip() +
scale_y_continuous(labels=percent) +
## other layer with quality trimmed ones
geom_bar(aes(Run, Quality_trimmed), stat='identity', color='red', alpha=0.4) +
ylab("Trimmed reads proportion (quality-related colored in in red)")
#ggplot(trimmingStats, aes(Run, Quality_trimmed)) + geom_bar(stat='identity') + coord_flip() + scale_y_continuous(labels=percent)
#ggplot(trimmingStats, aes(Run, Trimmed_Bases)) + geom_bar(stat='identity') + coord_flip() + scale_y_continuous(labels=percent)
ggplot(trimmingStats, aes(Run, Too_short_Reads)) +
geom_bar(stat='identity') +
coord_flip() +
scale_y_continuous(labels=percent) +
ggtitle("Discarded Reads Proportion")
#' ## Adapter trimming statistics
genTable2 <- function(logFile){
readAdapterStats <- function(logFile){
#browser()
data.frame(
Run=sub("^([^.]*).*", "\\1", basename(logFile)),
Run=sub("^([^.]*).*", "\\1", basename(logFile)) %>% tidySamples,
Adapter=(paste("grep -F '=== Adapter ' ", logFile ) %>% pipe() %>% readLines() %>% str_split_fixed("'", 3))[,2],
Trimmed=(paste("grep -F '; Trimmed: ' ", logFile ) %>% pipe() %>% readLines() %>% str_split_fixed( "[^0-9]+", 6))[,5] %>% as.numeric(),
Overlapped_at_5prime=(paste("grep -F 'overlapped the 5' ", logFile ) %>% pipe() %>% readLines() %>% str_split_fixed( "[^0-9]+", 2) )[,1] %>% as.numeric(),
Overlapped_at_3prime=(paste("grep -F 'overlapped the 3' ", logFile ) %>% pipe() %>% readLines() %>% str_split_fixed( "[^0-9]+", 2) )[,1] %>% as.numeric()
)
}
adapterTrimmingStats <- logDataFiles %>% ldply(genTable2)
adapterTrimmingStats <- logDataFiles %>% ldply(readAdapterStats)
#+ results = 'asis'
adapterTrimmingStats %>% head() %>% kable()
......@@ -110,14 +141,22 @@ adapterTrimmingStats %>% head() %>% kable()
write.delim(adapterTrimmingStats, file="cutadapt_summary.adapterTrimmingStats.txt")
#' [Adapter Statistics](cutadapt_summary.adapterTrimmingStats.txt)
#+ fig.height=10, fig.width=10
ggplot(trimmingStats, aes(Run, No_of_processed_Reads)) + geom_bar(stat='identity') + coord_flip()
ggplot(trimmingStats, aes(Run, No_of_processed_Bases)) + geom_bar(stat='identity') + coord_flip()
ggplot(trimmingStats, aes(Run, Trimmed_Reads)) + geom_bar(stat='identity') + coord_flip()
ggplot(trimmingStats, aes(Run, Quality_trimmed)) + geom_bar(stat='identity') + coord_flip()
ggplot(trimmingStats, aes(Run, Trimmed_Bases)) + geom_bar(stat='identity') + coord_flip()
ggplot(trimmingStats, aes(Run, Too_short_Reads)) + geom_bar(stat='identity') + coord_flip()
ggplot(adapterTrimmingStats, aes(Run, Trimmed)) + geom_bar(stat='identity') + facet_wrap(~Adapter) + coord_flip()
ggplot(adapterTrimmingStats, aes(Run, Overlapped_at_5prime)) + geom_bar(stat='identity') + facet_wrap(~Adapter) + coord_flip()
ggplot(adapterTrimmingStats, aes(Run, Overlapped_at_3prime)) + geom_bar(stat='identity') + facet_wrap(~Adapter) + coord_flip()
#with(adapterTrimmingStats, Trimmed==Overlapped_at_3prime+Overlapped_at_5prime)
#ggplot(adapterTrimmingStats, aes(Run, Trimmed)) + geom_bar(stat='identity') + facet_wrap(~Adapter) + coord_flip() + scale_y_continuous(labels=comma)
#ggplot(adapterTrimmingStats, aes(Run, Overlapped_at_5prime)) + geom_bar(stat='identity') + facet_wrap(~Adapter) + coord_flip() + scale_y_continuous(labels=comma)
#ggplot(adapterTrimmingStats, aes(Run, Overlapped_at_3prime)) + geom_bar(stat='identity') + facet_wrap(~Adapter) + coord_flip() + scale_y_continuous(labels=comma)
#+ fig.height=nrow(trimmingStats), fig.width=12
adapterTrimmingStats %>% select(-Trimmed) %>% melt() %>%
mutate(overlap_at=str_replace(variable, "Overlapped_at_", "")) %>%
ggplot(aes(Run, value, fill=overlap_at)) +
geom_bar(stat='identity') +
facet_wrap(~Adapter) +
coord_flip() +
scale_y_continuous(labels=comma) +
ylab("# num reads") +
ggtitle("Adapter trimming by overlap")
......@@ -10,6 +10,7 @@ export PATH=/projects/bioinfo/holger/bin/tophat-2.0.13.Linux_x86_64:$PATH
export PATH=/home/brandl/bin/cufflinks-2.2.1.Linux_x86_64:$PATH
export PATH=/sw/apps/python/current/bin:$PATH
export PATH=/home/brandl/bin/deepTools/bin:$PATH
export PATH=/projects/bioinfo/holger/bin/FastQC_0.11.2:$PATH
# which tophat; which bowtie2; which cuffdiff
......@@ -78,7 +79,7 @@ for fastqFile in $* ; do
echo "cutadapting $caFastq into $caFastq"
#todo use a more specific trimming model (trim just correct part on each side without using reverse complements
mysub "$project__ca__$(basename $fastqFile .fastq.gz)" "cutadapt -b file:$Ill_ADAPTERS -m 20 -q 25 -o $caFastq $fastqFile" -q long | joblist .cajobs
mysub "${project}__ca__$(basename $fastqFile .fastq.gz)" "cutadapt -b file:$Ill_ADAPTERS -m 20 -q 25 -o $caFastq $fastqFile" -q long | joblist .cajobs
done
wait4jobs .cajobs
......
......@@ -52,7 +52,7 @@ require.auto(scales)
gg <- ggplot(readCounts, aes(run, num_reads)) + geom_bar(stat="identity") + coord_flip() + scale_y_continuous(labels=comma) + ggtitle("read counts")
#+ results='asis'
gg %>% ggsave2(width=15, height = round(nrow(readCounts)/4), limitsize=FALSE) %>% paste0("<img src='", ., "'><br>") %>% cat()
gg %>% ggsave2(width=10, height = round(nrow(readCounts)/4), limitsize=FALSE) %>% paste0("<img src='", ., "'><br>") %>% cat()
### Create a faill/pass matrix
......@@ -70,18 +70,15 @@ readSummary <- function(statsFile){
qcSummary <- list.files(path=baseDir, pattern="^summary.txt", full.names=TRUE, recursive=T) %>% ldply(readSummary)
#+ fig.height=20
gg <- qcSummary %>% ggplot(aes(score, run, fill=tolower(flag))) +
#' # Base Quality Distribution Summary
#+ fig.height=2+round(nrow(readCounts)/4), fig.width=12
qcSummary %>% ggplot(aes(score, run, fill=tolower(flag))) +
geom_tile() +
rotXlab() +
scale_fill_manual(values = c(fail="darkred", pass="darkgreen", warn="orange")) +
ggtitle("fastqc passcodes")
#+ results='asis'
gg %>% ggsave2(width=15, height = round(nrow(readCounts)/4), limitsize=FALSE) %>% paste0("<img src='", ., "'><br>") %>% cat()
#' # Base Quality Distribution Summary
#' Median qualities per position to compare positional patterns and differences between the runs
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment