cutadapt_summary.R 7.03 KB
Newer Older
Melanie Schneider's avatar
Melanie Schneider committed
1
#!/usr/bin/env Rscript
2
#' # Quality Control Summary for `r getwd()`
Melanie Schneider's avatar
Melanie Schneider committed
3 4 5 6
##+ echo=FALSE, message=FALSE

## Note This script is supposed to be knitr::spin'ed

7 8
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.46/R/core_commons.R")
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.46/R/ggplot_commons.R")
9
load_pack(knitr)
Melanie Schneider's avatar
Melanie Schneider committed
10 11 12 13 14 15 16 17 18 19

## can we access variables from the parent spin.R process?
#echo("rscript is ", r_script)

argv = commandArgs(TRUE)
#echo("argv is ", argv)

#if(str_detect(argv[1], "fastqc_summary")) argv <- argv[-1]

if(length(argv) != 1){
20
  stop("Usage: cutadapt_summary.R <directory with cutadapt log files>")
Melanie Schneider's avatar
Melanie Schneider committed
21 22 23
}

baseDir=argv[1]
24
#baseDir="."
Melanie Schneider's avatar
Melanie Schneider committed
25 26 27 28 29 30


if(is.na(file.info(baseDir)$isdir)){
  stop(paste("directory '", baseDir,"'does not exist"))
}

31
# baseDir="/home/mel/MPI-Bioinf/Project1_reads/141126_cutadapt_logs"
Melanie Schneider's avatar
Melanie Schneider committed
32 33


34
logDataFiles <- list.files(path=file.path(baseDir, ".logs"), pattern="__ca__.*.out.log", full.names=TRUE, recursive=T)
Melanie Schneider's avatar
Melanie Schneider committed
35 36 37 38 39 40 41 42 43 44 45

#echo("files are", logDataFiles)

#' ## General information
info1=readLines(pipe( paste("grep -F 'cutadapt' ", logDataFiles[1]) ))
echo(info1)
info2=readLines(pipe( paste("grep -F 'Maximum error rate' ", logDataFiles[1]) ))
echo(info2)
info3=readLines(pipe( paste("grep -F 'No. of adapters' ", logDataFiles[1]) ))
echo(info3)

46
#' ## Cutadapt Parameters:
Melanie Schneider's avatar
Melanie Schneider committed
47
parameters=readLines(pipe( paste("grep -F 'Command line parameters' ", logDataFiles[1]) ))
48 49

# #' `r parameters`
Melanie Schneider's avatar
Melanie Schneider committed
50

Melanie Schneider's avatar
Melanie Schneider committed
51
#' #### Some explanations:
Melanie Schneider's avatar
Melanie Schneider committed
52
if (grepl("-a", parameters) ==TRUE)
Melanie Schneider's avatar
Melanie Schneider committed
53
  echo("-a indicates that the following is a 3' end adapter.")
Melanie Schneider's avatar
Melanie Schneider committed
54
if (grepl("-g", parameters) ==TRUE)
Melanie Schneider's avatar
Melanie Schneider committed
55
  echo("-g indicates that the following is a 5' end adapter.")
Melanie Schneider's avatar
Melanie Schneider committed
56
if (grepl("-b", parameters) ==TRUE)
57
  echo("-b: indicates that the adapter is located at the 3' or 5' end (both possible).")
Melanie Schneider's avatar
Melanie Schneider committed
58 59
if (grepl("-m", parameters) ==TRUE)
  echo("Reads shorter than -m bases are thrown away.")
60
if (grepl("-q:", parameters) ==TRUE)
Melanie Schneider's avatar
Melanie Schneider committed
61
  echo("Quality trimming is done with a threshold specified after -q.")
62
if (grepl("-p:", parameters) ==TRUE)
Melanie Schneider's avatar
Melanie Schneider committed
63
  echo("option 'paired output' is used.")
64
if (grepl("-e:", parameters) ==TRUE)
Melanie Schneider's avatar
Melanie Schneider committed
65
  echo("-e changes the error tolerance. (The default maximum error rate is 0.1)")
66
if (grepl("-O:", parameters) ==TRUE)
Melanie Schneider's avatar
Melanie Schneider committed
67 68 69 70
  echo("The minimum overlap length is changed using -O.")
if (grepl("-N", parameters) ==TRUE)
  echo("Wildcard characters in the adapter are enabled by -N.")

Melanie Schneider's avatar
Melanie Schneider committed
71
#' #### For more detailed information on cutadapt go to https://cutadapt.readthedocs.org/en/latest/index.html
Melanie Schneider's avatar
Melanie Schneider committed
72 73


74 75
tidySamples  <- function(sample) str_replace(sample, ".*__ca__", "")

76
#' ## Trimming Overview
77
readSummaryStats <- function(logFile){
Melanie Schneider's avatar
Melanie Schneider committed
78
  data.frame(
79
    Run=sub("^([^.]*).*", "\\1", basename(logFile)) %>% tidySamples,
Melanie Schneider's avatar
Melanie Schneider committed
80 81 82
    No_of_processed_Reads=(paste("grep -F 'Processed reads' ", logFile )  %>% pipe() %>% readLines() %>% strsplit( "[^0-9]+") %>% unlist() %>% as.numeric())[2],
    No_of_processed_Bases=(paste("grep -F 'Processed bases' ", logFile )  %>% pipe() %>% readLines() %>% strsplit( "[^0-9]+") %>% unlist() %>% as.numeric())[2],
    Trimmed_Reads=(paste("grep -F 'Trimmed reads' ", logFile )  %>% pipe() %>% readLines() %>% strsplit( "[^0-9\\.]+") %>% unlist() %>% as.numeric())[3],
Melanie Schneider's avatar
Melanie Schneider committed
83
    Quality_trimmed=(paste("grep -F 'Quality-trimmed' ", logFile )  %>% pipe() %>% readLines() %>% strsplit( "[^0-9\\.]+") %>% unlist() %>% as.numeric())[4],
Melanie Schneider's avatar
Melanie Schneider committed
84 85
    Trimmed_Bases=(paste("grep -F 'Trimmed bases' ", logFile )  %>% pipe() %>% readLines() %>% strsplit( "[^0-9\\.]+") %>% unlist() %>% as.numeric())[4],
    Too_short_Reads=(paste("grep -F 'Too short reads' ", logFile )  %>% pipe() %>% readLines() %>% strsplit( "[^0-9\\.]+") %>% unlist() %>% as.numeric())[3]
Melanie Schneider's avatar
Melanie Schneider committed
86 87
  )
}
88
trimmingStats <- logDataFiles %>% map_df(readSummaryStats)
89 90 91 92

#+ results = 'asis'
trimmingStats %>% head() %>% kable()

93
write_tsv(trimmingStats, file="cutadapt_summary.trimmingStats.txt")
94
#'  [Trimming Statistics](cutadapt_summary.trimmingStats.txt)
Melanie Schneider's avatar
Melanie Schneider committed
95

96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123
myfun <- function(x) x/100
trimmingStats <- mutate_each(trimmingStats, funs(myfun), Trimmed_Reads, Quality_trimmed, Trimmed_Bases, Too_short_Reads)

#+ fig.height=nrow(trimmingStats)/3, fig.width=12
require(scales)
ggplot(trimmingStats, aes(Run, No_of_processed_Reads)) + geom_bar(stat='identity') + coord_flip() + scale_y_continuous(labels=comma)
#ggplot(trimmingStats, aes(Run, No_of_processed_Bases)) + geom_bar(stat='identity') + coord_flip() + scale_y_continuous(labels=comma)

ggplot(trimmingStats, aes(Run, Trimmed_Reads)) +
    geom_bar(stat='identity') +
    coord_flip() +
    scale_y_continuous(labels=percent) +
    ## other layer with quality trimmed ones
    geom_bar(aes(Run, Quality_trimmed), stat='identity', color='red', alpha=0.4) +
    ylab("Trimmed reads proportion (quality-related colored in in red)")

#ggplot(trimmingStats, aes(Run, Quality_trimmed)) + geom_bar(stat='identity') + coord_flip() + scale_y_continuous(labels=percent)


#ggplot(trimmingStats, aes(Run, Trimmed_Bases)) + geom_bar(stat='identity') + coord_flip() + scale_y_continuous(labels=percent)
ggplot(trimmingStats, aes(Run, Too_short_Reads)) +
    geom_bar(stat='identity') +
    coord_flip() +
    scale_y_continuous(labels=percent) +
    ggtitle("Discarded Reads Proportion")



124
#' ## Adapter trimming statistics
125
readAdapterStats <- function(logFile){
Melanie Schneider's avatar
Melanie Schneider committed
126 127
  #browser()
  data.frame(
128
    Run=sub("^([^.]*).*", "\\1", basename(logFile)) %>% tidySamples,
Melanie Schneider's avatar
Melanie Schneider committed
129 130 131 132
    Adapter=(paste("grep -F '=== Adapter ' ", logFile )  %>% pipe() %>% readLines() %>% str_split_fixed("'", 3))[,2],
    Trimmed=(paste("grep -F '; Trimmed: ' ", logFile )  %>% pipe() %>% readLines() %>% str_split_fixed( "[^0-9]+", 6))[,5] %>% as.numeric(),
    Overlapped_at_5prime=(paste("grep -F 'overlapped the 5' ", logFile )  %>% pipe() %>% readLines() %>% str_split_fixed( "[^0-9]+", 2) )[,1] %>% as.numeric(),
    Overlapped_at_3prime=(paste("grep -F 'overlapped the 3' ", logFile )  %>% pipe() %>% readLines() %>% str_split_fixed( "[^0-9]+", 2) )[,1] %>% as.numeric()
Melanie Schneider's avatar
Melanie Schneider committed
133 134
  )
}
135
adapterTrimmingStats <- logDataFiles %>% map_df(readAdapterStats)
136 137 138 139 140

#+ results = 'asis'
adapterTrimmingStats %>% head() %>% kable()

#+
141
write_tsv(adapterTrimmingStats, file="cutadapt_summary.adapterTrimmingStats.txt")
142
#'  [Adapter Statistics](cutadapt_summary.adapterTrimmingStats.txt)
Melanie Schneider's avatar
Melanie Schneider committed
143

144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
#with(adapterTrimmingStats, Trimmed==Overlapped_at_3prime+Overlapped_at_5prime)
#ggplot(adapterTrimmingStats, aes(Run, Trimmed)) + geom_bar(stat='identity') + facet_wrap(~Adapter) + coord_flip() + scale_y_continuous(labels=comma)
#ggplot(adapterTrimmingStats, aes(Run, Overlapped_at_5prime)) + geom_bar(stat='identity') + facet_wrap(~Adapter) + coord_flip() + scale_y_continuous(labels=comma)
#ggplot(adapterTrimmingStats, aes(Run, Overlapped_at_3prime)) + geom_bar(stat='identity') + facet_wrap(~Adapter) + coord_flip() + scale_y_continuous(labels=comma)

#+ fig.height=nrow(trimmingStats), fig.width=12
adapterTrimmingStats %>% select(-Trimmed) %>% melt() %>%
    mutate(overlap_at=str_replace(variable, "Overlapped_at_", "")) %>%
    ggplot(aes(Run, value, fill=overlap_at)) +
    geom_bar(stat='identity') +
    facet_wrap(~Adapter) +
    coord_flip() +
    scale_y_continuous(labels=comma) +
    ylab("# num reads") +
    ggtitle("Adapter trimming by overlap")