fastqc_summary.R 7.88 KB
Newer Older
Holger Brandl's avatar
Holger Brandl committed
1
#!/usr/bin/env Rscript
2
#' # Short Read Quality Report
3
#'
4 5 6
#' Working directory was `r getwd()`
#'
#' ## Introduction: FASTQ format
7 8 9
#'
#' A FASTQ file normally uses four lines per sequence. A FASTQ file containing a single sequence might look like this:
#'
10
#' ```
11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
#' @SEQ_ID
#' GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT
#' +
#' !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65
#' ```
#'
#' - Line 1 begins with a '@' character and is followed by a sequence identifier and an optional description    (like a FASTA title line).
#' - Line 2 is the raw sequence letters.
#' - Line 3 begins with a '+' character and is optionally followed by the same sequence identifier (and any description) again.
#' - Line 4 encodes the quality values for the sequence in Line 2, and must contain the same number of symbols as letters in the sequence.
#'
#' ### Quality scores for base calling
#'
#' Phred quality scores Q are defined as a property which is logarithmically related to the base-calling error probabilities P.
#'
#' For example, if Phred assigns a quality score of 30 to a base, the chances that this base is called incorrectly are 1 in 1000.
#'
#' ### FastQC
#'
#' [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) aims to provide a simple way to do some quality control checks on raw sequence data coming from high throughput sequencing pipelines. It provides a modular set of analyses which you can use to give a quick impression of whether your data has any problems of which you should be aware before doing any further analysis.

Holger Brandl's avatar
Holger Brandl committed
32
##+ echo=FALSE, message=FALSE
33 34 35

## Note This script is supposed to be knitr::spin'ed

36 37
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.45/R/core_commons.R")
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.45/R/ggplot_commons.R")
38

Holger Brandl's avatar
Holger Brandl committed
39 40
## can we access variables from the parent spin.R process?
#echo("rscript is ", r_script)
Holger Brandl's avatar
Holger Brandl committed
41

42
argv = commandArgs(TRUE)
43
# argv = c("fastqc")
Holger Brandl's avatar
Holger Brandl committed
44
#echo("argv is ", argv)
Holger Brandl's avatar
Holger Brandl committed
45

46
#if(str_detect(argv[1], "fastqc_summary")) argv = argv[-1]
Holger Brandl's avatar
Holger Brandl committed
47

48 49 50 51
if(length(argv) != 1){
    stop("Usage: fastqc_summmary.R <directory with fastqc results>")
}

Holger Brandl's avatar
Holger Brandl committed
52
baseDir=argv[1]
Holger Brandl's avatar
Holger Brandl committed
53
#baseDir=normalizePath(".")
Holger Brandl's avatar
Holger Brandl committed
54 55 56


if(is.na(file.info(baseDir)$isdir)){
Holger Brandl's avatar
Holger Brandl committed
57
    stop(paste("directory '", baseDir,"'does not exist"))
Holger Brandl's avatar
Holger Brandl committed
58
}
Holger Brandl's avatar
Holger Brandl committed
59

60 61 62
#baseDir="fastqc"
#baseDir="/Volumes/projects/bioinfo/holger/projects/helin/mouse/fastqc"

Holger Brandl's avatar
Holger Brandl committed
63 64


65
fastqDataFiles = list.files(path=baseDir, pattern="*fastqc.zip", full.names=TRUE, recursive=T)
66

Holger Brandl's avatar
Holger Brandl committed
67
#echo("files are", fastqDataFiles)
68
get_zip_pipe = function(zipFile, fileName){
69 70
    paste0("unzip -p ",zipFile, " ", trim_ext(basename(zipFile), ".zip"), "/", fileName)
}
71 72 73

#' ## Number of reads per run

74
#statsFile=fastqDataFiles[1]
75
readCount = function(statsFile){
76
    data.frame(
77
        run=trim_ext(basename(statsFile), "_fastqc.zip"),
78
        num_reads=as.numeric(readLines(pipe(paste(get_zip_pipe(statsFile, "fastqc_data.txt"), "| grep -F 'Total Sequences' | cut -f2 -d'\t'"))))
79 80 81
    )
}

82
readCounts = map_df(fastqDataFiles, readCount)
83

Holger Brandl's avatar
Holger Brandl committed
84

Holger Brandl's avatar
Holger Brandl committed
85
#+ fig.width=12, fig.height=round(nrow(readCounts)/2)
86
ggplot(readCounts, aes(run, num_reads)) + geom_bar(stat="identity") + coord_flip() + scale_y_continuous(labels=comma) + ggtitle("read counts")
87

88 89
# #+ results='asis'
# gg %>%  ggsave2(width=10, height = round(nrow(readCounts)/4), limitsize=FALSE) %>% paste0("<img src='", ., "'><br>") %>% cat()
90 91 92 93


### Create a faill/pass matrix

94
readSummary = function(statsFile){
95 96 97
    # echo("reading", statsFile)
    # statsFile="./fastqc/mouse_big_cyst_rep1_fastqc/summary.txt"

98
    fileMapStats = read.delim(pipe(get_zip_pipe(statsFile, "summary.txt")), header=F) %>%
99
        set_names(c("flag", "score", "run")) %>%
100
        mutate(run=trim_ext(run, ".fastq.gz", "fastq"))
101 102 103 104

    fileMapStats
}

105
qcSummary = map_df(fastqDataFiles, readSummary)
106

107
#' # Base Quality Distribution Summary
Holger Brandl's avatar
Holger Brandl committed
108
#+ fig.height=2+round(nrow(readCounts)/2), fig.width=12
109
qcSummary %>% ggplot(aes(score, run, fill=tolower(flag))) +
110 111
    geom_tile() +
    rotXlab() +
112 113 114
    scale_fill_manual(values = c(fail="darkred", pass="darkgreen", warn="orange")) +
    ggtitle("fastqc passcodes")

115

116 117 118
#' # Read duplication & Library Complexity


119
dupLevels = map_df(fastqDataFiles, function(statsFile){
120 121
   data.frame(
       run=trim_ext(basename(statsFile), c("_fastqc.zip")),
domingue's avatar
domingue committed
122
       dedup_proportion=as.numeric(readLines(pipe(paste(get_zip_pipe(statsFile, "fastqc_data.txt"), "| grep -F 'Total Deduplicated Percentage' | cut -f2"))))/100
123 124 125
   )
})

Holger Brandl's avatar
Holger Brandl committed
126
#+ fig.width=12, fig.height=round(nrow(dupLevels)/2)
127 128 129 130 131
ggplot(dupLevels, aes(run, dedup_proportion)) +
    geom_bar(stat = "identity") +
    coord_flip() +
    scale_y_continuous(labels = percent, limits=c(0,1)) +
    ggtitle("unique_reads/total_reads")
132 133 134



Holger Brandl's avatar
Holger Brandl committed
135

136 137 138
#' # Median qualities per position

# Allows to compare positional patterns and differences between the runs
139

140
readBaseQualDist = function(statsFile){
141 142 143
    # statsFile="./fastqc/mouse_big_cyst_rep2_fastqc/fastqc_data.txt"
    # statsFile="./fastqc/mouse_liver_polar_stage3_rep2_fastqc/fastqc_data.txt"
    # grep -A30 -F '>>Per base sequence quality' /Volumes/projects/bioinfo/holger/projects/helin/mouse/fastqc/mouse_big_cyst_rep1_fastqc/fastqc_data.txt | grep -B100 -F '>>END_M' | head -n-1 | tail -n+2 | tr '#' ' '
Holger Brandl's avatar
Holger Brandl committed
144
#    echo("reading", statsFile)
145

146
    baseStats = read.delim(pipe(
147
        #http://stackoverflow.com/questions/1946363/how-do-i-display-data-from-the-beginning-of-a-file-until-the-first-occurence-of/1947950#1947950
domingue's avatar
domingue committed
148
        paste(get_zip_pipe(statsFile, "fastqc_data.txt"), " | grep -A200 -F '>>Per base sequence quality' 2>/dev/null |  perl -pe 'last if />>END_MODULE/' | head -n-2 | tail -n+2 | tr '#' ' '")
149
        )) %>% mutate(
150
            run=trim_ext(basename(statsFile), ".zip")
151 152 153 154 155
        )

    baseStats %>% mutate(base_order=1:n())
}

156
baseQualities = map_df(fastqDataFiles, readBaseQualDist) %>% tbl_df
Holger Brandl's avatar
Holger Brandl committed
157
#statsFile="fastqc/L8038_Track-21511_R1.trim_fastqc.zip"
158 159
#with(baseQualities, as.data.frame(table(run)))

160 161
baseQualities %<>% mutate(first_base=map_int(Base, ~ as.integer(str_split(.x, fixed("-"))[[1]][1])))

162

163
#+ fig.widthh=20
164 165 166 167 168 169 170
seqQualPlot = baseQualities %>% ggplot(aes(first_base, Mean, group = run, color = run)) +
    geom_line() +
    scale_y_continuous(limits = c(2, 40)) +
    xlab("base position") +
    ylab("y mean base quality") +
    guides(color=F) +
    ggtitle("Track Base Qualities")
171

Holger Brandl's avatar
Holger Brandl committed
172
## just show color legend if not too many samples
173
if(unlen(baseQualities$run) > 20) seqQualPlot = seqQualPlot + guides(color=F)
174

Holger Brandl's avatar
Holger Brandl committed
175
seqQualPlot
176

177
#' # Qualities per run including variance
178

179
runs = with(baseQualities, as.data.frame(table(run)))
Holger Brandl's avatar
Holger Brandl committed
180

Holger Brandl's avatar
Holger Brandl committed
181
#+ warning=FALSE, fig.width=15, fig.height=3*ceiling(nrow(runs)/2)
Holger Brandl's avatar
Holger Brandl committed
182

183
## http://stackoverflow.com/questions/12518387/can-i-create-an-empty-ggplot2-plot-in-r
184 185 186
# options(width=100)
# summary(baseQualities)

187
baseQualities %>%
Holger Brandl's avatar
Holger Brandl committed
188
#    head(30) %>%
189
    ggplot() +
190
    geom_blank(mapping=aes(first_base)) +
191
    xlab("base position") +
192
    ylab("y mean base quality") +
193 194 195 196
    geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=-Inf, ymax=20), data=runs, alpha=0.05, fill="red") +
    geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=20, ymax=28), data=runs, alpha=0.05, fill=colors()[654]) +
    geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=28, ymax=Inf), data=runs, alpha=0.05, fill="green") +
    geom_boxplot(
domingue's avatar
domingue committed
197
            mapping=aes(x=first_base, ymin = X10th.Percentile, lower = Lower.Quartile , middle = Median, upper = Upper.Quartile , ymax =  X90th.Percentile, group = first_base),
198 199
             stat = "identity"
     ) +
200
    facet_wrap(~run, ncol=3) +
201
    theme(axis.text.x=element_blank()) +
202
    scale_y_continuous(limits=c(2, 42))
203 204


205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221

#' Export QC stats as table

medRunQual = baseQualities %>%
    mutate(run = str_replace_all(run, "_fastqc$", "")) %>%
    group_by(run) %>%
    summarize(median_base_quality = median(Median))

readCounts %>%
    left_join(dupLevels) %>%
    left_join(medRunQual) %>%
    left_join(qcSummary %>% spread(score, flag)) %>%
    write_tsv(file.path(baseDir, "fastqc_stat_summary.txt"))



#'  [fastqc_stat_summary.txt](`r file.path(baseDir, "fastqc_stat_summary.txt")`)