Commit a7a09f11 authored by Holger Brandl's avatar Holger Brandl

impl cutadapt report for pe data

parent 4ee94457
#!/usr/bin/env Rscript
#' # Quality Control Summary for `r getwd()`
##+ echo=FALSE, message=FALSE
## Note This script is supposed to be knitr::spin'ed
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.46/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.46/R/ggplot_commons.R")
load_pack(knitr)
## can we access variables from the parent spin.R process?
#echo("rscript is ", r_script)
argv = commandArgs(TRUE)
#echo("argv is ", argv)
#if(str_detect(argv[1], "fastqc_summary")) argv <- argv[-1]
if(length(argv) != 1){
stop("Usage: cutadapt_summary.R <directory with cutadapt log files>")
}
baseDir=argv[1]
#baseDir="."
if(is.na(file.info(baseDir)$isdir)){
stop(paste("directory '", baseDir,"'does not exist"))
}
# baseDir="/home/mel/MPI-Bioinf/Project1_reads/141126_cutadapt_logs"
logDataFiles <- list.files(path=file.path(baseDir, ".logs"), pattern="__ca__.*.out.log", full.names=TRUE, recursive=T)
#echo("files are", logDataFiles)
#' ## General information
info1=readLines(pipe( paste("grep -F 'cutadapt' ", logDataFiles[1]) ))
echo(info1)
# info2=readLines(pipe( paste("grep -F 'Maximum error rate' ", logDataFiles[1]) ))
# echo(info2)
info3=readLines(pipe( paste("grep -F 'adapters with' ", logDataFiles[1]) ))
echo(info3)
#' ## Cutadapt Parameters:
parameters=readLines(pipe( paste("grep -F 'Command line parameters' ", logDataFiles[1]) ))
# #' cutadatpt parameters were`r parameters`
#' #### Some explanations:
if (grepl("-a", parameters) ==TRUE)
echo("-a indicates that the following is a 3' end adapter.")
if (grepl("-g", parameters) ==TRUE)
echo("-g indicates that the following is a 5' end adapter.")
if (grepl("-b", parameters) ==TRUE)
echo("-b: indicates that the adapter is located at the 3' or 5' end (both possible).")
if (grepl("-m", parameters) ==TRUE)
echo("Reads shorter than -m bases are thrown away.")
if (grepl("-q:", parameters) ==TRUE)
echo("Quality trimming is done with a threshold specified after -q.")
if (grepl("-p:", parameters) ==TRUE)
echo("option 'paired output' is used.")
if (grepl("-e:", parameters) ==TRUE)
echo("-e changes the error tolerance. (The default maximum error rate is 0.1)")
if (grepl("-O:", parameters) ==TRUE)
echo("The minimum overlap length is changed using -O.")
if (grepl("-N", parameters) ==TRUE)
echo("Wildcard characters in the adapter are enabled by -N.")
#' #### For more detailed information on cutadapt go to https://cutadapt.readthedocs.org/en/latest/index.html
tidySamples <- function(sample) str_replace(sample, ".*__ca__", "")
#' ## Trimming Overview
readSummaryStats <- function(logFile){
# DEBUG logFile=logDataFiles[1]
data.frame(
Run=sub("^([^.]*).*", "\\1", basename(logFile)) %>% tidySamples,
No_of_processed_Reads=(paste("grep -F 'Total read' ", logFile ) %>% pipe() %>% readLines() %>% strsplit( "[^0-9]+") %>% unlist() %>% as.numeric())[2],
No_of_processed_Bases=(paste("grep -F 'Total basepairs processed' ", logFile ) %>% pipe() %>% readLines() %>% strsplit( "[^0-9]+") %>% unlist() %>% as.numeric())[2],
Trimmed_Reads=(paste("grep -F 'Trimmed reads' ", logFile ) %>% pipe() %>% readLines() %>% strsplit( "[^0-9\\.]+") %>% unlist() %>% as.numeric())[3],
Quality_trimmed=(paste("grep -F 'Quality-trimmed' ", logFile ) %>% pipe() %>% readLines() %>% strsplit( "[^0-9\\.]+") %>% unlist() %>% as.numeric())[4],
Trimmed_Bases=(paste("grep -F 'Trimmed bases' ", logFile ) %>% pipe() %>% readLines() %>% strsplit( "[^0-9\\.]+") %>% unlist() %>% as.numeric())[4],
Too_short_Reads=(paste("grep -F 'Too short reads' ", logFile ) %>% pipe() %>% readLines() %>% strsplit( "[^0-9\\.]+") %>% unlist() %>% as.numeric())[3]
)
}
trimmingStats <- logDataFiles %>% map_df(readSummaryStats)
#+ results = 'asis'
trimmingStats %>% head() %>% kable()
write_tsv(trimmingStats, file="cutadapt_summary.trimmingStats.txt")
#' [Trimming Statistics](cutadapt_summary.trimmingStats.txt)
myfun <- function(x) x/100
trimmingStats <- mutate_each(trimmingStats, funs(myfun), Trimmed_Reads, Quality_trimmed, Trimmed_Bases, Too_short_Reads)
#+ fig.height=nrow(trimmingStats)/3, fig.width=12
require(scales)
ggplot(trimmingStats, aes(Run, No_of_processed_Reads)) + geom_bar(stat='identity') + coord_flip() + scale_y_continuous(labels=comma)
#ggplot(trimmingStats, aes(Run, No_of_processed_Bases)) + geom_bar(stat='identity') + coord_flip() + scale_y_continuous(labels=comma)
ggplot(trimmingStats, aes(Run, Trimmed_Reads)) +
geom_bar(stat='identity') +
coord_flip() +
scale_y_continuous(labels=percent) +
## other layer with quality trimmed ones
geom_bar(aes(Run, Quality_trimmed), stat='identity', color='red', alpha=0.4) +
ylab("Trimmed reads proportion (quality-related colored in in red)")
#ggplot(trimmingStats, aes(Run, Quality_trimmed)) + geom_bar(stat='identity') + coord_flip() + scale_y_continuous(labels=percent)
#ggplot(trimmingStats, aes(Run, Trimmed_Bases)) + geom_bar(stat='identity') + coord_flip() + scale_y_continuous(labels=percent)
ggplot(trimmingStats, aes(Run, Too_short_Reads)) +
geom_bar(stat='identity') +
coord_flip() +
scale_y_continuous(labels=percent) +
ggtitle("Discarded Reads Proportion")
#' ## Adapter trimming statistics
readAdapterStats <- function(logFile){
#browser()
data.frame(
Run=sub("^([^.]*).*", "\\1", basename(logFile)) %>% tidySamples,
Adapter=(paste("grep -F '=== Adapter ' ", logFile ) %>% pipe() %>% readLines() %>% str_split_fixed("'", 3))[,2],
Trimmed=(paste("grep -F '; Trimmed: ' ", logFile ) %>% pipe() %>% readLines() %>% str_split_fixed( "[^0-9]+", 6))[,5] %>% as.numeric(),
Overlapped_at_5prime=(paste("grep -F 'overlapped the 5' ", logFile ) %>% pipe() %>% readLines() %>% str_split_fixed( "[^0-9]+", 2) )[,1] %>% as.numeric(),
Overlapped_at_3prime=(paste("grep -F 'overlapped the 3' ", logFile ) %>% pipe() %>% readLines() %>% str_split_fixed( "[^0-9]+", 2) )[,1] %>% as.numeric()
)
}
adapterTrimmingStats <- logDataFiles %>% map_df(readAdapterStats)
#+ results = 'asis'
adapterTrimmingStats %>% head() %>% kable()
#+
write_tsv(adapterTrimmingStats, file="cutadapt_summary.adapterTrimmingStats.txt")
#' [Adapter Statistics](cutadapt_summary.adapterTrimmingStats.txt)
#with(adapterTrimmingStats, Trimmed==Overlapped_at_3prime+Overlapped_at_5prime)
#ggplot(adapterTrimmingStats, aes(Run, Trimmed)) + geom_bar(stat='identity') + facet_wrap(~Adapter) + coord_flip() + scale_y_continuous(labels=comma)
#ggplot(adapterTrimmingStats, aes(Run, Overlapped_at_5prime)) + geom_bar(stat='identity') + facet_wrap(~Adapter) + coord_flip() + scale_y_continuous(labels=comma)
#ggplot(adapterTrimmingStats, aes(Run, Overlapped_at_3prime)) + geom_bar(stat='identity') + facet_wrap(~Adapter) + coord_flip() + scale_y_continuous(labels=comma)
#+ fig.height=nrow(trimmingStats), fig.width=12
adapterTrimmingStats %>% select(-Trimmed) %>% melt() %>%
mutate(overlap_at=str_replace(variable, "Overlapped_at_", "")) %>%
ggplot(aes(Run, value, fill=overlap_at)) +
geom_bar(stat='identity') +
facet_wrap(~Adapter) +
coord_flip() +
scale_y_continuous(labels=comma) +
ylab("# num reads") +
ggtitle("Adapter trimming by overlap")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment