Skip to content
Snippets Groups Projects
Commit 3b6bc737 authored by Holger Brandl's avatar Holger Brandl
Browse files

integrated cutadapt summary report

parent 9dac4f98
No related branches found
No related tags found
No related merge requests found
......@@ -31,7 +31,7 @@ if(is.na(file.info(baseDir)$isdir)){
# baseDir="/home/mel/MPI-Bioinf/Project1_reads/141126_cutadapt_logs"
logDataFiles <- list.files(path=baseDir, pattern="ca.fastq.gz.ca.log", full.names=TRUE, recursive=T)
logDataFiles <- list.files(path=baseDir, pattern="__ca__.*.out.log", full.names=TRUE, recursive=T)
#echo("files are", logDataFiles)
......@@ -45,7 +45,8 @@ echo(info3)
#' ## Cutadapt Parameters:
parameters=readLines(pipe( paste("grep -F 'Command line parameters' ", logDataFiles[1]) ))
echo(parameters)
# #' `r parameters`
#' #### Some explanations:
if (grepl("-a", parameters) ==TRUE)
......@@ -53,16 +54,16 @@ if (grepl("-a", parameters) ==TRUE)
if (grepl("-g", parameters) ==TRUE)
echo("-g indicates that the following is a 5' end adapter.")
if (grepl("-b", parameters) ==TRUE)
echo("-b indicates that the adapter is located at the 3' or 5' end (both possible).")
echo("-b: indicates that the adapter is located at the 3' or 5' end (both possible).")
if (grepl("-m", parameters) ==TRUE)
echo("Reads shorter than -m bases are thrown away.")
if (grepl("-q", parameters) ==TRUE)
if (grepl("-q:", parameters) ==TRUE)
echo("Quality trimming is done with a threshold specified after -q.")
if (grepl("-p", parameters) ==TRUE)
if (grepl("-p:", parameters) ==TRUE)
echo("option 'paired output' is used.")
if (grepl("-e", parameters) ==TRUE)
if (grepl("-e:", parameters) ==TRUE)
echo("-e changes the error tolerance. (The default maximum error rate is 0.1)")
if (grepl("-O", parameters) ==TRUE)
if (grepl("-O:", parameters) ==TRUE)
echo("The minimum overlap length is changed using -O.")
if (grepl("-N", parameters) ==TRUE)
echo("Wildcard characters in the adapter are enabled by -N.")
......@@ -70,10 +71,12 @@ if (grepl("-N", parameters) ==TRUE)
#' #### For more detailed information on cutadapt go to https://cutadapt.readthedocs.org/en/latest/index.html
tidySamples <- function(sample) str_replace(sample, ".*__ca__", "")
#' ## Trimming Overview
genTable1 <- function(logFile){
readSummaryStats <- function(logFile){
data.frame(
Run=sub("^([^.]*).*", "\\1", basename(logFile)),
Run=sub("^([^.]*).*", "\\1", basename(logFile)) %>% tidySamples,
No_of_processed_Reads=(paste("grep -F 'Processed reads' ", logFile ) %>% pipe() %>% readLines() %>% strsplit( "[^0-9]+") %>% unlist() %>% as.numeric())[2],
No_of_processed_Bases=(paste("grep -F 'Processed bases' ", logFile ) %>% pipe() %>% readLines() %>% strsplit( "[^0-9]+") %>% unlist() %>% as.numeric())[2],
Trimmed_Reads=(paste("grep -F 'Trimmed reads' ", logFile ) %>% pipe() %>% readLines() %>% strsplit( "[^0-9\\.]+") %>% unlist() %>% as.numeric())[3],
......@@ -82,7 +85,7 @@ genTable1 <- function(logFile){
Too_short_Reads=(paste("grep -F 'Too short reads' ", logFile ) %>% pipe() %>% readLines() %>% strsplit( "[^0-9\\.]+") %>% unlist() %>% as.numeric())[3]
)
}
trimmingStats <- logDataFiles %>% ldply(genTable1)
trimmingStats <- logDataFiles %>% ldply(readSummaryStats)
#+ results = 'asis'
trimmingStats %>% head() %>% kable()
......@@ -90,18 +93,46 @@ trimmingStats %>% head() %>% kable()
write.delim(trimmingStats, file="cutadapt_summary.trimmingStats.txt")
#' [Trimming Statistics](cutadapt_summary.trimmingStats.txt)
myfun <- function(x) x/100
trimmingStats <- mutate_each(trimmingStats, funs(myfun), Trimmed_Reads, Quality_trimmed, Trimmed_Bases, Too_short_Reads)
#+ fig.height=nrow(trimmingStats)/3, fig.width=12
require(scales)
ggplot(trimmingStats, aes(Run, No_of_processed_Reads)) + geom_bar(stat='identity') + coord_flip() + scale_y_continuous(labels=comma)
#ggplot(trimmingStats, aes(Run, No_of_processed_Bases)) + geom_bar(stat='identity') + coord_flip() + scale_y_continuous(labels=comma)
ggplot(trimmingStats, aes(Run, Trimmed_Reads)) +
geom_bar(stat='identity') +
coord_flip() +
scale_y_continuous(labels=percent) +
## other layer with quality trimmed ones
geom_bar(aes(Run, Quality_trimmed), stat='identity', color='red', alpha=0.4) +
ylab("Trimmed reads proportion (quality-related colored in in red)")
#ggplot(trimmingStats, aes(Run, Quality_trimmed)) + geom_bar(stat='identity') + coord_flip() + scale_y_continuous(labels=percent)
#ggplot(trimmingStats, aes(Run, Trimmed_Bases)) + geom_bar(stat='identity') + coord_flip() + scale_y_continuous(labels=percent)
ggplot(trimmingStats, aes(Run, Too_short_Reads)) +
geom_bar(stat='identity') +
coord_flip() +
scale_y_continuous(labels=percent) +
ggtitle("Discarded Reads Proportion")
#' ## Adapter trimming statistics
genTable2 <- function(logFile){
readAdapterStats <- function(logFile){
#browser()
data.frame(
Run=sub("^([^.]*).*", "\\1", basename(logFile)),
Run=sub("^([^.]*).*", "\\1", basename(logFile)) %>% tidySamples,
Adapter=(paste("grep -F '=== Adapter ' ", logFile ) %>% pipe() %>% readLines() %>% str_split_fixed("'", 3))[,2],
Trimmed=(paste("grep -F '; Trimmed: ' ", logFile ) %>% pipe() %>% readLines() %>% str_split_fixed( "[^0-9]+", 6))[,5] %>% as.numeric(),
Overlapped_at_5prime=(paste("grep -F 'overlapped the 5' ", logFile ) %>% pipe() %>% readLines() %>% str_split_fixed( "[^0-9]+", 2) )[,1] %>% as.numeric(),
Overlapped_at_3prime=(paste("grep -F 'overlapped the 3' ", logFile ) %>% pipe() %>% readLines() %>% str_split_fixed( "[^0-9]+", 2) )[,1] %>% as.numeric()
)
}
adapterTrimmingStats <- logDataFiles %>% ldply(genTable2)
adapterTrimmingStats <- logDataFiles %>% ldply(readAdapterStats)
#+ results = 'asis'
adapterTrimmingStats %>% head() %>% kable()
......@@ -110,14 +141,22 @@ adapterTrimmingStats %>% head() %>% kable()
write.delim(adapterTrimmingStats, file="cutadapt_summary.adapterTrimmingStats.txt")
#' [Adapter Statistics](cutadapt_summary.adapterTrimmingStats.txt)
#+ fig.height=10, fig.width=10
ggplot(trimmingStats, aes(Run, No_of_processed_Reads)) + geom_bar(stat='identity') + coord_flip()
ggplot(trimmingStats, aes(Run, No_of_processed_Bases)) + geom_bar(stat='identity') + coord_flip()
ggplot(trimmingStats, aes(Run, Trimmed_Reads)) + geom_bar(stat='identity') + coord_flip()
ggplot(trimmingStats, aes(Run, Quality_trimmed)) + geom_bar(stat='identity') + coord_flip()
ggplot(trimmingStats, aes(Run, Trimmed_Bases)) + geom_bar(stat='identity') + coord_flip()
ggplot(trimmingStats, aes(Run, Too_short_Reads)) + geom_bar(stat='identity') + coord_flip()
ggplot(adapterTrimmingStats, aes(Run, Trimmed)) + geom_bar(stat='identity') + facet_wrap(~Adapter) + coord_flip()
ggplot(adapterTrimmingStats, aes(Run, Overlapped_at_5prime)) + geom_bar(stat='identity') + facet_wrap(~Adapter) + coord_flip()
ggplot(adapterTrimmingStats, aes(Run, Overlapped_at_3prime)) + geom_bar(stat='identity') + facet_wrap(~Adapter) + coord_flip()
#with(adapterTrimmingStats, Trimmed==Overlapped_at_3prime+Overlapped_at_5prime)
#ggplot(adapterTrimmingStats, aes(Run, Trimmed)) + geom_bar(stat='identity') + facet_wrap(~Adapter) + coord_flip() + scale_y_continuous(labels=comma)
#ggplot(adapterTrimmingStats, aes(Run, Overlapped_at_5prime)) + geom_bar(stat='identity') + facet_wrap(~Adapter) + coord_flip() + scale_y_continuous(labels=comma)
#ggplot(adapterTrimmingStats, aes(Run, Overlapped_at_3prime)) + geom_bar(stat='identity') + facet_wrap(~Adapter) + coord_flip() + scale_y_continuous(labels=comma)
#+ fig.height=nrow(trimmingStats), fig.width=12
adapterTrimmingStats %>% select(-Trimmed) %>% melt() %>%
mutate(overlap_at=str_replace(variable, "Overlapped_at_", "")) %>%
ggplot(aes(Run, value, fill=overlap_at)) +
geom_bar(stat='identity') +
facet_wrap(~Adapter) +
coord_flip() +
scale_y_continuous(labels=comma) +
ylab("# num reads") +
ggtitle("Adapter trimming by overlap")
......@@ -10,6 +10,7 @@ export PATH=/projects/bioinfo/holger/bin/tophat-2.0.13.Linux_x86_64:$PATH
export PATH=/home/brandl/bin/cufflinks-2.2.1.Linux_x86_64:$PATH
export PATH=/sw/apps/python/current/bin:$PATH
export PATH=/home/brandl/bin/deepTools/bin:$PATH
export PATH=/projects/bioinfo/holger/bin/FastQC_0.11.2:$PATH
# which tophat; which bowtie2; which cuffdiff
......@@ -78,7 +79,7 @@ for fastqFile in $* ; do
echo "cutadapting $caFastq into $caFastq"
#todo use a more specific trimming model (trim just correct part on each side without using reverse complements
mysub "$project__ca__$(basename $fastqFile .fastq.gz)" "cutadapt -b file:$Ill_ADAPTERS -m 20 -q 25 -o $caFastq $fastqFile" -q long | joblist .cajobs
mysub "${project}__ca__$(basename $fastqFile .fastq.gz)" "cutadapt -b file:$Ill_ADAPTERS -m 20 -q 25 -o $caFastq $fastqFile" -q long | joblist .cajobs
done
wait4jobs .cajobs
......
......@@ -52,7 +52,7 @@ require.auto(scales)
gg <- ggplot(readCounts, aes(run, num_reads)) + geom_bar(stat="identity") + coord_flip() + scale_y_continuous(labels=comma) + ggtitle("read counts")
#+ results='asis'
gg %>% ggsave2(width=15, height = round(nrow(readCounts)/4), limitsize=FALSE) %>% paste0("<img src='", ., "'><br>") %>% cat()
gg %>% ggsave2(width=10, height = round(nrow(readCounts)/4), limitsize=FALSE) %>% paste0("<img src='", ., "'><br>") %>% cat()
### Create a faill/pass matrix
......@@ -70,18 +70,15 @@ readSummary <- function(statsFile){
qcSummary <- list.files(path=baseDir, pattern="^summary.txt", full.names=TRUE, recursive=T) %>% ldply(readSummary)
#+ fig.height=20
gg <- qcSummary %>% ggplot(aes(score, run, fill=tolower(flag))) +
#' # Base Quality Distribution Summary
#+ fig.height=2+round(nrow(readCounts)/4), fig.width=12
qcSummary %>% ggplot(aes(score, run, fill=tolower(flag))) +
geom_tile() +
rotXlab() +
scale_fill_manual(values = c(fail="darkred", pass="darkgreen", warn="orange")) +
ggtitle("fastqc passcodes")
#+ results='asis'
gg %>% ggsave2(width=15, height = round(nrow(readCounts)/4), limitsize=FALSE) %>% paste0("<img src='", ., "'><br>") %>% cat()
#' # Base Quality Distribution Summary
#' Median qualities per position to compare positional patterns and differences between the runs
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment