featcounts_deseq_mf.R 53.1 KB
Newer Older
Holger Brandl's avatar
Holger Brandl committed
1
#!/usr/bin/env Rscript
Holger Brandl's avatar
Holger Brandl committed
2

Holger Brandl's avatar
Holger Brandl committed
3
4
5
#' # Differential Expression Analysis
#'
#' Contents
Holger Brandl's avatar
Holger Brandl committed
6
#'
Holger Brandl's avatar
Holger Brandl committed
7
8
9
10
11
12
13
14
15
#' - [Data Preparation](#data-preparation)
#' - [Quality Control](#quality-control)
#'     - [Size Factors](#size-factors)
#'     - [Global Dispersion Model](#global-dispersion-model)
#' - [PCA and Clustering](#pca-and-clustering)
#' - [Differential Expression Test](#differential-expression-test)
#'     - [MA and Volcano plots](#ma-and-volcano-plots)
#'     - [Count Outlier Analysis](#count-outlier-analysis)
#' - [Hits Summary](#hits-summary)
Holger Brandl's avatar
Holger Brandl committed
16
#' - [Exported Data](#exported-data)
Holger Brandl's avatar
Holger Brandl committed
17
#'
Holger Brandl's avatar
Holger Brandl committed
18

Holger Brandl's avatar
Holger Brandl committed
19
# to rebuild toc do knitr::spin("featcounts_deseq_mf.R", FALSE)
Holger Brandl's avatar
Holger Brandl committed
20

Holger Brandl's avatar
Holger Brandl committed
21
22
23
#' Created by: `r system("whoami", intern=T)`
#'
#' Created at: `r format(Sys.Date(), format="%B %d %Y")`
Holger Brandl's avatar
Holger Brandl committed
24

Holger Brandl's avatar
Holger Brandl committed
25

Holger Brandl's avatar
Holger Brandl committed
26
#+ include=FALSE
Holger Brandl's avatar
Holger Brandl committed
27
28
suppressMessages(require(docopt))

29
doc = '
Holger Brandl's avatar
Holger Brandl committed
30
Perform a differential gene expression analysis using deseq2
Holger Brandl's avatar
Holger Brandl committed
31
32
33
Usage: featcounts_deseq_mf.R [options] <count_matrix> <design_matrix>

Options:
Holger Brandl's avatar
Holger Brandl committed
34
35
36
--contrasts=<tab_delim_table>   Table with sample pairs for which dge analysis should be performed
--qcutoff <qcutoff>             Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01]
--pcutoff <pcutoff>             Override q-value filter and filter by p-value instead
37
--fc_thresh_degs <n>            DEGs must have a fold change larger than or equal n or smaller than or equal 1/n; n=0 deactivates this condition [default: 0]
Holger Brandl's avatar
Holger Brandl committed
38
--min_count <min_count>         Minimal expression in any of the sample to be included in the final result list [default: 10]
39
--out <name_prefix>             Name to prefix all generated result files
domingue's avatar
domingue committed
40
--design <formula>              Design formula for DeSeq with contrast attribute at the end [default: condition]
domingue's avatar
domingue committed
41
--lfc <lfc_cutoff>              Provide thresholds for constructing Wald tests of significance. Sets the lfcThreshold argument in the results function. [default: 1.0]
42
--ensembl_db <ensembl_db>       Ensebmbl db to be used. If not specified inferred from data. TODO implement NONE here!!
43
--gene_info <info_file>         Gene information file downloaded from the ensembl website if bioMart version is not present
44
--betaprior <boolean>           Apply betaprior when running DESeq2 [default: TRUE]
45
--sub_data <boolean>            Subset de_results by contrasts and store the data in a subfolder [default: FALSE]
46
--gtf <gtf_file>                Path to a GTF file, preferably the same used for read counting.
47
--genic_counts <genic_file>     Path to a the genic counts file. this should the file *.complete_table.txt generated with dge_workflow/genic_counts.R, but it can be any file that contains three columns named: _ensembl_gene_id_, _sample_, and _nonexonic_ratio_. Sample names should match those of the input count table.
Holger Brandl's avatar
Holger Brandl committed
48
49
'

50
#commandArgs <- function(x) c("--design", "'batch+condition'", "--contrasts", "../contrasts.txt star_counts_matrix_reduced_sampleset.txt", "basic_design_reduced_sampleset.txt")
51
#commandArgs <- function(x) c("--design", "'condition'", "--contrasts", "../contrasts.txt", "--sub_data", "TRUE", "../alignments/star_counts_matrix.txt", "../basic_design.txt")
52
# genic_file <- "rev_stranded.complete_table.txt"
53
opts = docopt(doc, commandArgs(TRUE))
Holger Brandl's avatar
Holger Brandl committed
54
55


Holger Brandl's avatar
Holger Brandl committed
56
57
## load some packages first because of name-space order
library(knitr)
58
59
#source("https://bioconductor.org/biocLite.R")
#biocLite("DESeq2")
Holger Brandl's avatar
Holger Brandl committed
60
library(DESeq2)
61
#load_pack(readr)
62
library(IHW)
Holger Brandl's avatar
Holger Brandl committed
63
64


65
66
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.45/R/core_commons.R")
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.45/R/ggplot_commons.R")
67
68
69
load_pack(ggpubr)
load_pack(gridExtra)

70

Holger Brandl's avatar
Holger Brandl committed
71
72
73
## also load common helper for expression data analysis
# source(interp_from_env("${NGS_TOOLS}/dge_workflow/diffex_commons.R"))
devtools::source_url("https://git.mpi-cbg.de/bioinfo/ngs_tools/raw/v6/dge_workflow/diffex_commons.R")
Holger Brandl's avatar
Holger Brandl committed
74

Holger Brandl's avatar
Holger Brandl committed
75
76

## process input arguments
77
78
79
count_matrix_file = opts$count_matrix
design_matrix_file = opts$design_matrix
contrasts_file = opts$contrasts
80
use_betaPrior = as.logical(opts$betaprior)
81
sub_data = as.logical(opts$sub_data)
82
gene_info_file = opts$gene_info
83
gtf_file = opts$gtf
84
genic_file = opts$genic_counts
85
assert(is.null(gene_info_file) || file.exists(gene_info_file), "invalid gene_info_file")
86
assert(is.null(gtf_file) || file.exists(gtf_file), "invalid gene_info_file")
Holger Brandl's avatar
Holger Brandl committed
87

88
resultsBase = if (!is.null(opts$out)) paste0(opts$out, ".") else ""
Holger Brandl's avatar
Holger Brandl committed
89

90
fc_thresh_degs=if (is.null(opts$fc_thresh_degs)) 0 else as.numeric(opts$fc_thresh_degs)
91
92
93
94
95
pcutoff = if (is.null(opts$pcutoff))NULL else as.numeric(opts$pcutoff)
qcutoff = if (is.numeric(pcutoff))NULL else as.numeric(opts$qcutoff)
if (is.numeric(pcutoff))opts$qcutoff = NULL

lfc_cutoff = if (is.null(opts$lfc))0 else as.numeric(opts$lfc)
Holger Brandl's avatar
Holger Brandl committed
96

domingue's avatar
domingue committed
97
98
99
100
101
102
103
# set alpha for the results filtering
if (! is.null(qcutoff)) {
    alphathres <- qcutoff
}else {
    alphathres <- 0.1
}

Holger Brandl's avatar
Holger Brandl committed
104
## extract the sample for the design
105
designFormula = opts$design
Holger Brandl's avatar
Holger Brandl committed
106
## consider last element of design formula as sample attribute
107
108
109
contrastAttribute = str_split(designFormula, "[+]") %>%
    unlist() %>%
    tail(1)
110
111
# assert(designFormula == "condition" || ! is.null(design_matrix_file), "custom designs are not supported without a design matrix")
assert(str_detect(designFormula, ".*condition$"))
Holger Brandl's avatar
Holger Brandl committed
112

Holger Brandl's avatar
Holger Brandl committed
113

Holger Brandl's avatar
Holger Brandl committed
114
115

#' Run configuration was
116
117
118
vec_as_df(unlist(opts)) %>%
    filter(! str_detect(name, "^[<-]")) %>%
    kable()
Holger Brandl's avatar
Holger Brandl committed
119

120
vec_as_df(unlist(opts)) %>%
121
122
    filter(! str_detect(name, "^[<-]")) %>%
    write_tsv(".run_configuration.txt")
123

Holger Brandl's avatar
Holger Brandl committed
124
125
126
127
128
129
130
131
132
########################################################################################################################
#' ## Data Preparation

#' The detection of differentially expressed genes is performed using the R package DEseq2 (http://bioconductor.org/packages/release/bioc/html/DESeq2.html), which requires a count matrix as input.
#' The count data are presented as a table which reports, for each sample, the number of sequence fragments that have been assigned to each gene.
#' An important analysis question is the quantification and statistical inference of systematic changes between conditions, as compared to within-condition variability.

#' Working Directory: `r getwd()`

133
countData = read_tsv(count_matrix_file)
Holger Brandl's avatar
Holger Brandl committed
134
135

## save a backup into the current working directory as a reference
Holger Brandl's avatar
Holger Brandl committed
136
countData %>% write_tsv(paste0(resultsBase, "input_counts.txt"))
Holger Brandl's avatar
Holger Brandl committed
137
138

#' Structure of input matrix was
139
glimpse(countData, width = 60)
Holger Brandl's avatar
Holger Brandl committed
140

141
142
143
countMatrix = countData %>%
    column2rownames("ensembl_gene_id") %>%
    as.matrix()
Holger Brandl's avatar
Holger Brandl committed
144
145

# Expression Filtering
146
genesBefore = nrow(countMatrix)
Lena Hersemann's avatar
Lena Hersemann committed
147
countMatrix %<>% filterByExpression(as.integer(opts$min_count))
148
genesAfter = nrow(countMatrix)
Lena Hersemann's avatar
Lena Hersemann committed
149

Holger Brandl's avatar
Holger Brandl committed
150
151
#' Counts were filtered to only retain genes which had more that `r opts$min_count` alignments in at least one replicate. This reduced the number of genes from `r genesBefore` to `r genesAfter`.

Holger Brandl's avatar
Holger Brandl committed
152
## todo this is required now , so why all the checking
153
if (! is.null(design_matrix_file)) {
154
    replicates2samples = read_tsv(design_matrix_file) #%>% select(replicate, sample)
155
}else {
herseman's avatar
herseman committed
156
    warning("DEPRECATED extracting conditions from replicates is bad practise and a design file should be provided instead")
157
    get_sample_from_replicate = function(repName) str_match(repName, "(.*)_([R]|rep)?[0-9]{1,2}$")[, 2]
Holger Brandl's avatar
Holger Brandl committed
158

herseman's avatar
herseman committed
159
    replicates2samples = data_frame(replicate = colnames(countMatrix)) %>% mutate(condition = get_sample_from_replicate(replicate)) # %>% distinct_all()
Holger Brandl's avatar
Holger Brandl committed
160
}
Holger Brandl's avatar
Holger Brandl committed
161

162
163
write_tsv(replicates2samples, "basic_design.txt")

Holger Brandl's avatar
Holger Brandl committed
164
# Define or load a contrasts matrix
165
if (! is.null(contrasts_file)) {
166
    contrasts = read_tsv(contrasts_file)
167
168
}else {
    assert(! is.null(replicates2samples), "Can not auto-create contrasts without design matrix") ## cant work yet because sample column with have random name
herseman's avatar
herseman committed
169
170
    contrasts = distinct_all(replicates2samples, condition) %>%
        select(condition) %>%
171
        merge(., ., suffixes = c("_1", "_2"), by = NULL) %>%
herseman's avatar
herseman committed
172
        filter(ac(condition_1) > ac(condition_2)) %>%
173
    #        filter(ac(condition_1)!=ac(condition_2)) %>%
Holger Brandl's avatar
Holger Brandl committed
174
        fac2char()
Holger Brandl's avatar
Holger Brandl committed
175

176
    write_tsv(contrasts, paste0(resultsBase, "contrasts.txt"))
Holger Brandl's avatar
Holger Brandl committed
177
178
}

Holger Brandl's avatar
Holger Brandl committed
179
180

#' The used design matrix is
Holger Brandl's avatar
Holger Brandl committed
181
datatable(replicates2samples)
Holger Brandl's avatar
Holger Brandl committed
182

Holger Brandl's avatar
Holger Brandl committed
183
184
185
186
187
#' The used design formula is.
designFormula

#' For more information about designs see section 1.6 "multi-factor designs" in [deseq-manual](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf)

Holger Brandl's avatar
Holger Brandl committed
188
#' The contrasts of interest are
Holger Brandl's avatar
Holger Brandl committed
189
190
191
192
193
194
contrasts %>% kable()

## gene specific factors
#normalizationFactors(dds)

## sizeFactor R help:sigEnrResults
195
196
#dds = makeExampleDESeqDataSet()
#dds = estimateSizeFactors( dds )
Holger Brandl's avatar
Holger Brandl committed
197
198
199
#sizeFactors(dds)

## see "3.11 Sample-/gene-dependent normalization factors" in the DEseq2 manual for details
Holger Brandl's avatar
Holger Brandl committed
200
201

## old non-mf approach
202
#exDesign = data.frame(condition=colnames(countMatrix) %>% get_sample_from_replicate)
Holger Brandl's avatar
Holger Brandl committed
203
204

## new mf-approach
205
206
exDesign = data_frame(replicate = colnames(countMatrix)) %>%
    mutate(col_index = row_number()) %>%
207
    right_join(replicates2samples, by = "replicate") %>%
208
    arrange(col_index) #%>% transmute(condition=condition_2ndwt) %>% fac2char
Holger Brandl's avatar
Holger Brandl committed
209

210
assert(! any(is.na(exDesign$col_index)), "assay design file is not compatible to count matrix column names")
211

212
213
214
215
216
exDesign %>% ggplot(aes(condition)) +
    geom_bar() +
    ylab("# samples") +
    coord_flip() +
    ggtitle("samples per condition")
217

Holger Brandl's avatar
Holger Brandl committed
218
## infer the sample to be used from the formula
219
#designFormula = "condition_2ndwt + batch"
Holger Brandl's avatar
Holger Brandl committed
220
221


222
#exDesign %<>% rename(sample, condition_2ndwt)
Holger Brandl's avatar
Holger Brandl committed
223

224
#exDesign = replicates2samples %>%  arrange(colnames(countMatrix))
Holger Brandl's avatar
Holger Brandl committed
225

226
# validate that contrasts model is compatible with data
227
228
assert(setequal(names(contrasts), c("condition_1", "condition_2")), "contrasts matrix has incorrect column headers. Expected: condition_1, condition_2")

229
230
231
232
233
234
designSamples = as_df(exDesign)[, contrastAttribute] %>%
    unique %>%
    sort
contrastSamples = contrasts %>% gather %$% value %>%
    unique %>%
    sort
235

236
237
238
if (! all(contrastSamples %in% designSamples)) {
    warning("design   : ", paste(designSamples, collapse = ", "))
    warning("contrasts: ", paste(contrastSamples, collapse = ", "))
Holger Brandl's avatar
Holger Brandl committed
239
    stop("experiment design is not consistent with contrasts")
Holger Brandl's avatar
Holger Brandl committed
240
241
}
#stopifnot(all((contrasts %>% gather %$% value %>% unique) %in% exDesign$condition))
Holger Brandl's avatar
Holger Brandl committed
242

Holger Brandl's avatar
Holger Brandl committed
243
#http://www.cookbook-r.com/Formulas/Creating_a_formula_from_a_string/
244
245
246
dds = DESeqDataSetFromMatrix(countMatrix, exDesign, formula(as.formula(paste("~", designFormula))))
#dds = estimateSizeFactors(dds)
#dds = estimateDispersions(dds)
247
248
249
# dds = DESeq(dds, parallel = T)

dds = DESeq(dds, parallel = T, betaPrior = use_betaPrior)
Holger Brandl's avatar
Holger Brandl committed
250

251

Holger Brandl's avatar
Holger Brandl committed
252
253


Holger Brandl's avatar
Holger Brandl committed
254
255
256
########################################################################################################################
#' ## Quality Control

Holger Brandl's avatar
Holger Brandl committed
257
258
#' ### Size Factors

Holger Brandl's avatar
Holger Brandl committed
259
260
261
262
#' For details about size factor normalziation and calculation see https://www.biostars.org/p/79978/
#'
#' From the DESeq docs about how size factors are used: The sizeFactors vector assigns to each column of the count matrix a value, the size factor, such that count values in the columns can be brought to a common scale by dividing by the corresponding size factor. Counts are divied by size factors.

Holger Brandl's avatar
Holger Brandl committed
263
sizeFactors(dds) %>%
Holger Brandl's avatar
Holger Brandl committed
264
265
#    set_names(colnames(countMatrix)) %>%
    vec_as_df("replicate", "size_factor") %>%
266
267
    ggplot(aes(replicate, size_factor)) +
    geom_bar(stat = "identity") +
Holger Brandl's avatar
Holger Brandl committed
268
    ggtitle("Size Factors estimated by DESeq") +
269
    coord_flip()
Holger Brandl's avatar
Holger Brandl committed
270
271
272
273
274

# From DESeq manual: The regularized log transformation is preferable to the variance stabilizing transformation if the size factors vary widely.



Holger Brandl's avatar
Holger Brandl committed
275
#' ### Global Dispersion Model
Holger Brandl's avatar
Holger Brandl committed
276
277
278
279
280
281

#' Plot of the per-gene dispersion estimates together with the fitted mean-dispersion relationship.
#' The dispersion plot shows how the estimates are shrunk from the gene wise values (black dots) toward the fitted estimates, with the final values used in testing being the blue dots.
#' The dispersion can be understood as the square of the coefficient of biological variation.  So, if a gene's expression typically differs from replicate to replicate sample by 20%, this gene's dispersion is: .20^2 = .04.
## The function estimateDispersions performs three steps. First, it estimates, for each gene and each (replicated) condition, a dispersion value, then, it fits, for each condition, a curve through the estimates. Finally, it assigns to each gene a dispersion value, using either the estimated or the fitted value.
#' The estimation of dispersions performs three steps. First, it estimates, for each gene and each (replicated) condition, a dispersion value, then, it fits, for each condition, a curve through the estimates. Finally, it assigns to each gene a dispersion value, using either the estimated or the fitted value.
Holger Brandl's avatar
Holger Brandl committed
282
plotDispEsts(dds, main = "Dispersion Model")
283

Holger Brandl's avatar
Holger Brandl committed
284
285


Holger Brandl's avatar
Holger Brandl committed
286
########################################################################################################################
Holger Brandl's avatar
Holger Brandl committed
287
#' ## PCA and Clustering
Holger Brandl's avatar
Holger Brandl committed
288
289

#' In order to assess overall similarity between samples two common statistical methods are used - Principal component analysis (PCA) and clustering.
herseman's avatar
herseman committed
290
#' This should provide answers to the questions: Which samples are similar to each other, which are different? Does this fit to the expectation from the experiment's design?
Holger Brandl's avatar
Holger Brandl committed
291
292
293
294
295
296
297
#' Using a regularized log transformation of the raw counts provides the advantage that it stabilizes the variance across the mean.


#' Principal component analysis (PCA) is a technique used to emphasize variation and bring out strong patterns in a dataset.
#' Principal components are the underlying structure in the data. They are the directions where there is the most variance, the directions where the data is most spread out. The data points/samples are projected onto the 2D plane such that they spread out in the two directions that explain most of the differences. The x-axis is the direction that separates the data points the most and the y-axis is a direction that separates the data the second most.

# Regularized log transformation for clustering/heatmaps, etc
298
rld = rlogTransformation(dds)
299
# vst = vst(dds) ## vst is supposed to be much 1k x faster
Holger Brandl's avatar
Holger Brandl committed
300

Holger Brandl's avatar
Holger Brandl committed
301
302
# extract rlogTransformation normalized count data
norm_counts <- assay(rld)
Holger Brandl's avatar
Holger Brandl committed
303
304


305
306
307
load_pack(limma)
load_pack(stringi)
load_pack(htmltools)
308
309
load_pack(plotly)
# install.packages("htmlwidgets")
310
311
312
313
314
315
316
317

#for (i in {designFormula %>% str_split("[+]") %>% unlist}) {
#    condition = pull(exDesign, i)
#    names(condition) = exDesign$replicate
#    # suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print))
#    makePcaPlot(t(norm_counts), color_by = condition, scale = T, title = "DESeq normalized count data WITHOUT batch correction") %>% ggplotly() %>% print
#}

Lena Hersemann's avatar
Lena Hersemann committed
318
effects = designFormula %>% str_split("[+]") %>% unlist
319
320
321
322
323
324
325
326
327


htmltools::tagList(lapply(effects, function(batchFactor) {
    # batchFactor=effects[1]
    condition = pull(exDesign, batchFactor)
    names(condition) = exDesign$replicate
    # suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print))
    {
        makePcaPlot(t(norm_counts), color_by = condition, scale = T, title = "DESeq normalized count data WITHOUT batch correction") +
Lena Hersemann's avatar
Lena Hersemann committed
328
        guides(color = guide_legend(title = batchFactor))
329
330
331
332
333
    } %>% ggplotly()
}))


# extract batch effect variables from the design formula and the corresponding data from the exDesign data.frame
Lena Hersemann's avatar
Lena Hersemann committed
334
335
336
337
338
adjvar <- designFormula %>%
    str_split("[+]") %>%
    unlist %>%
    stri_subset_regex("condition", negate = T)
adjvar_data <- data.frame(exDesign[, which(names(exDesign) %in% head(adjvar))])
339
340
341



342
343
treatDesign = exDesign %$% model.matrix(~ condition)

Lena Hersemann's avatar
Lena Hersemann committed
344
if (length(adjvar_data) == 1) {
345
    corr_data <- limma::removeBatchEffect(norm_counts, batch = adjvar_data[, 1], design = treatDesign)
Lena Hersemann's avatar
Lena Hersemann committed
346
}else if (length(adjvar_data) == 2) {
347
    corr_data <- removeBatchEffect(norm_counts, batch = adjvar_data[, 1], batch2 = adjvar_data[, 2], design = treatDesign)
Lena Hersemann's avatar
Lena Hersemann committed
348
}else if (length(adjvar_data) > 2) {
349
    print("Please note, the number of adjustment variables exceeds the maximal number of removable batch effects.")
Lena Hersemann's avatar
Lena Hersemann committed
350
}else {
351
    print("The design formula does not contain an adjustment variable for batch effect correction")
352
353
354
355
356
357
358
359
360
361
362
}


# for (i in colnames(adjvar_data)) {
#for (i in {designFormula %>% str_split("[+]") %>% unlist}) {
#    condition = pull(exDesign, i)
#    names(condition) = exDesign$replicate
#    # suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print))
#    makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print
#}

Lena Hersemann's avatar
Lena Hersemann committed
363
364
365
if (exists("corr_data")) {
    htmltools::tagList(lapply(effects, function(batchFactor) {
        # batchFactor=effects[1]
366
        # batchFactor="condition"
Lena Hersemann's avatar
Lena Hersemann committed
367
368
369
370
        condition = pull(exDesign, batchFactor)
        names(condition) = exDesign$replicate
        # suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print))
        {
371
            makePcaPlot(t(corr_data), color_by = condition, scale = F, title = "DESeq normalized count data WITH batch correction") +
Lena Hersemann's avatar
Lena Hersemann committed
372
373
374
375
376
377
            guides(color = guide_legend(title = batchFactor))
        } %>% ggplotly()
    }))
}else {
    print("No batch correction was done, as the design formula does not contain an adjustment variable for batch effect correction")
}
378
379


Holger Brandl's avatar
Holger Brandl committed
380
381
382
383
#' The Euclidean distance between samples are calculated after performing the regularized log transformation.
#' Using the calculated distance matrix, the samples are projected on a two-dimensional graph such that the distance between samples approximately corresponds to the biological coefficient of variation between those samples.
#' More information can be found here: https://en.wikipedia.org/wiki/Principal_component_analysis

384
385
386
387
388
#distsRL = dist(t(assay(rld)))
#hc = hclust(distsRL)
#distMatrix = as.matrix(distsRL)
distMatrix = as.matrix(dist(t(assay(rld))))
#rownames(distMatrix) = colnames(distMatrix) = with(exDesign(dds), paste(condition, sep=" : "))
Holger Brandl's avatar
Holger Brandl committed
389

Holger Brandl's avatar
Holger Brandl committed
390
countData %>% distinct(ensembl_gene_id)
391
labelcntData = countData %>%
392
393
394
395
396
    distinct_all(ensembl_gene_id) %>%
    column2rownames("ensembl_gene_id") %>%
    data.matrix() %>%
    round() %>%
    data.frame()
Holger Brandl's avatar
Holger Brandl committed
397

398
399
400
# load_pack(gplots)
# heatmap.2(distMatrix, labRow = colnames(labelcntData), labCol = colnames(labelcntData),
# symm = TRUE, trace = "none",
401
#key=F, col=colorpanel(100, "black", "white"),
402
# margin = c(8, 8), main = "Sample Distance Matrix")
Holger Brandl's avatar
Holger Brandl committed
403

404
405
rownames(distMatrix) = colnames(labelcntData)
colnames(distMatrix) = colnames(labelcntData)
Holger Brandl's avatar
Holger Brandl committed
406
407
408
409

# load_pack(d3heatmap)
load_pack(heatmaply)

Holger Brandl's avatar
Holger Brandl committed
410
411
## todo hide column dendrogram but keep cluster-order
#distMatrix %>% d3heatmap(xaxis_height=1, Colv = T, dendrogram="row")
Lena Hersemann's avatar
Lena Hersemann committed
412
print("DESeq normalized count data WITHOUT batch correction")
Holger Brandl's avatar
Holger Brandl committed
413
414
# distMatrix %>% d3heatmap(xaxis_height = 1)
distMatrix %>% heatmaply(colors=c("red","lightblue"),showticklabels=c(F,T))
Holger Brandl's avatar
Holger Brandl committed
415

Holger Brandl's avatar
Holger Brandl committed
416

Lena Hersemann's avatar
Lena Hersemann committed
417
418
419
if (exists("corr_data")) {
    print("DESeq normalized count data WITH batch correction")
    corr_distMatrix = as.matrix(dist(t(corr_data)))
Holger Brandl's avatar
Holger Brandl committed
420
421
    # corr_distMatrix %>% d3heatmap(xaxis_height = 1)
    corr_distMatrix %>% heatmaply(colors=c("red","lightblue"),showticklabels=c(F,T))
Lena Hersemann's avatar
Lena Hersemann committed
422
423
424
}else {
    # print("No batch correction was done, as the design formula does not contain an adjustment variable for batch effect correction")
}
Holger Brandl's avatar
Holger Brandl committed
425

426
427
## see http://www.bioconductor.org/help/workflows/rnaseqGene/
## DEBUG-START
428
# if (F) {
Holger Brandl's avatar
Holger Brandl committed
429
# assay(rld) %>% head
430
431
load_pack("genefilter")
load_pack("pheatmap")
432
433
mostSIg <- head(order(rowVars(assay(rld)), decreasing = TRUE), 20)
mat <- assay(rld)[mostSIg,]
434
435
436
437
438
439
440
mat <- mat - rowMeans(mat)

# df <- as.data.frame(colData(rld)[,c("cell","dex")])
#pheatmap(mat, annotation_col=df)
# pheatmap(mat)
pheatmap(mat, annotation_col = column2rownames(exDesign, "replicate") %>% select(- col_index))
# }
441
442
## DEBUG-END

Holger Brandl's avatar
Holger Brandl committed
443

Holger Brandl's avatar
Holger Brandl committed
444
########################################################################################################################
Holger Brandl's avatar
Holger Brandl committed
445
#' ## Differential Expression Test
Holger Brandl's avatar
Holger Brandl committed
446
447
448
449
450
451
452
453
454
455
456
457
#'
#' DESeq is an R package to analyse count data from high-throughput sequencing assays such as RNA-Seq and test for differential expression. The latest version, DESeq2, is used.
#' DESeq uses a model based on the negative binomial distribution and offers the following features:
#' Count data is discrete and skewed and is hence not well approximated by a normal distribution. Thus, a test based on the negative binomial distribution, which can reflect these properties, has much higher power to detect differential expression.
#' Tests for differential expression between experimental conditions should take into account both technical and biological variability. Recently, several authors have claimed that the Poisson distribution can be used for this purpose. However, tests based on the Poisson assumption (this includes the binomial test and the chi-squared test) ignore the biological sampling variance, leading to incorrectly optimistic p values. The negative binomial distribution is a generalisation of the Poisson model that allows to model biological variance correctly.
#' DESeq estimate the variance in a local fashion, using different coefficients of variation for different expression strengths. This removes potential selection biases in the hit list of differentially expressed genes, and gives a more balanced and accurate result.
#'
#' See deseq reference [docs](http://master.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf) for details.
#'
#' To understand fold-change shrinking and estimation check out
#' [Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2](http://genomebiology.com/2014/15/12/550/abstract)

Holger Brandl's avatar
Holger Brandl committed
458
459


Holger Brandl's avatar
Holger Brandl committed
460
# Run Deseq Test
461
462
463
#dds = DESeq(dds, fitType='local', betaPrior=FALSE)
#dds = DESeq(dds, fitType='local')
#dds = DESeq(dds)
Holger Brandl's avatar
Holger Brandl committed
464

465
#res = results(dds)
Holger Brandl's avatar
Holger Brandl committed
466
467
468
#resultsNames(dds)

#' Model Overview:
Holger Brandl's avatar
Holger Brandl committed
469

Holger Brandl's avatar
Holger Brandl committed
470
471
472
473
summary(results(dds))


## extract all de-sets according to our contrasts model
474
475
# install_package("plyr")
# load_pack(IHW) --> todo reenable once installation on macos is fixed (see https://support.bioconductor.org/p/92731/)
476

Holger Brandl's avatar
Holger Brandl committed
477
478
479
480
# low count filtering https://support.bioconductor.org/p/65256/
# independent filtering occurs within results() to save you from multiple test correction on genes with no power
# (see ?results and the vignette section about independent filtering, or the paper). T

481
#deResults = alply(contrasts, 1,  splat(function(condition_1, condition_2){
herseman's avatar
herseman committed
482
deResults = plyr::alply(contrasts, 1, plyr::splat(function(condition_1, condition_2){
Lena Hersemann's avatar
Lena Hersemann committed
483
    #DEBUG condition_1=as_df(contrasts)[1,1]; condition_2=as_df(contrasts)[1,2]
484
    # results(dds, contrast=c(contrastAttribute, condition_1, condition_2)) %>%
domingue's avatar
domingue committed
485
    results(dds, contrast = c(contrastAttribute, condition_1, condition_2), lfcThreshold = lfc_cutoff, filterFun = ihw, alpha = alphathres) %>%
486
    # results(dds, contrast = c(contrastAttribute, condition_1, condition_2), lfcThreshold = lfc_cutoff) %>%
487
488
489
490
491
        as.data.frame() %>%
        mutate(ensembl_gene_id = rownames(.)) %>%
        push_left("ensembl_gene_id") %>%
        # as_tibble() %>%
        # tibble::rownames_to_column("ensembl_gene_id") %>%
Holger Brandl's avatar
Holger Brandl committed
492
    ## see http://rpackages.ianhowson.com/bioc/DESeq2/man/results.html when using contrasts argument
493
        rename(c1_over_c2_logfc = log2FoldChange) %>%
herseman's avatar
herseman committed
494
        mutate(condition_1 = condition_1, condition_2 = condition_2) #%>% head %>% print_head
495
})) %>% bind_rows
Holger Brandl's avatar
Holger Brandl committed
496

Holger Brandl's avatar
Holger Brandl committed
497

498
499
# deResults = adply(contrasts, 1,  splat(function(condition_1, condition_2){
#  echo('samples', condition_1, condition_2)
Holger Brandl's avatar
Holger Brandl committed
500
501
502
#   return(data.frame())
# }))

503
deResults %>% ggplot(aes(c1_over_c2_logfc)) +
504
    geom_histogram(binwidth = 0.1) +
herseman's avatar
herseman committed
505
    facet_grid(condition_1 ~ condition_2) +
506
507
    geom_vline(xintercept = 0, color = "blue") +
    xlim(- 3, 3) +
508
    ggtitle("condition_1 over condition_2 logFC ")
Holger Brandl's avatar
Holger Brandl committed
509
510
511
512
513


#deResults %>% ggplot(aes(pvalue)) +
#    geom_histogram() +
#    xlim(0,1) +
514
#    facet_grid(condition_1 ~ condition_2) + geom_vline(xintercept=0.01, slope=1, color="blue") +
Holger Brandl's avatar
Holger Brandl committed
515
516
517
518
519
#    ggtitle("p-value distributions") #+ scale_x_log10()
#
#deResults %>% ggplot(aes(padj)) +
#    geom_histogram() +
##    xlim(0,1) +
520
#    facet_grid(condition_1 ~ condition_2) + geom_vline(xintercept=0.01, slope=1, color="blue") +
Holger Brandl's avatar
Holger Brandl committed
521
522
523
524
525
#    ggtitle("Adjusted p-value distributions") #+ scale_x_log10()


# report hit criterion
#+ results='asis'
526
527
log2_fc_thresh_degs= -Inf
if(fc_thresh_degs!=0){log2_fc_thresh_degs=log2(fc_thresh_degs)}
528
if (! is.null(qcutoff)) {
Holger Brandl's avatar
Holger Brandl committed
529
    echo("Using q-value cutoff of", qcutoff)
530
    deResults %<>% transform(is_hit = padj <= qcutoff & !is.na(c1_over_c2_logfc) & abs(c1_over_c2_logfc)>=log2_fc_thresh_degs)
531
}else {
Holger Brandl's avatar
Holger Brandl committed
532
    echo("Using p-value cutoff of", pcutoff)
533
    deResults %<>% transform(is_hit = pvalue <= pcutoff & !is.na(c1_over_c2_logfc) & abs(c1_over_c2_logfc)>=log2_fc_thresh_degs)
Holger Brandl's avatar
Holger Brandl committed
534
535
536
}
#deResults %<>% mutate(is_hit=pvalue<0.05)

537
538
## if padj or pval can't be conficentely determine by DESeq, they become an NA
## this makes is_na always be TRUE or FALSE
539
deResults %<>% mutate(is_hit = ifelse(is.na(is_hit), FALSE, is_hit))
Holger Brandl's avatar
Holger Brandl committed
540

541
# filter(deResults, ensembl_gene_id=="ENSMUSG00000034957")
Holger Brandl's avatar
Holger Brandl committed
542

543
deResults %<>% mutate(c1_overex = c1_over_c2_logfc > 0)
544
545
546


normCounts = counts(dds, normalized = TRUE) %>%
Holger Brandl's avatar
Holger Brandl committed
547
#    set_names(exDesign(dds)$condition) %>% rownames2column("ensembl_gene_id")
548
549
550
    set_names(colnames(countMatrix)) %>%
    rownames_to_column("ensembl_gene_id") %>%
    tbl_df
Holger Brandl's avatar
Holger Brandl committed
551
552


553
normCounts %>% write_tsv(paste0(resultsBase, "sizefac_normalized_counts_by_replicate.txt"))
Holger Brandl's avatar
Holger Brandl committed
554
555
556
557
558

## .. should be same as input
#filter(countData, ensembl_gene_id=="ENSDARG00000000001")

########################################################################################################################
Holger Brandl's avatar
Holger Brandl committed
559
#' ### MA and Volcano plots
Holger Brandl's avatar
Holger Brandl committed
560

herseman's avatar
herseman committed
561
#' MA-plot: The log2 fold change for a particular comparison is plotted on the y-axis and the average of the counts normalized by log2 is shown on the x-axis ("M" for minus, because a log ratio is equal to log minus log, and "A" for average). Each gene is represented with a dot. Genes with an adjusted p value below a certain threshold are shown in cyan (True).
562
#' This plot demonstrates that only genes with a large average normalized count contain sufficient information to yield a significant call. The [ma_plots.pdf](ma_plots.pdf) file contains MA plots colored according to up- and down-regulated genes.
Holger Brandl's avatar
Holger Brandl committed
563
564
565
566
567

# deseq approach
# plotMA(deResults, main="DESeq2", ylim=c(-2,2))

# Calculation of the base means per condition
Holger Brandl's avatar
Holger Brandl committed
568
569
570
#https://stat.ethz.ch/R-manual/R-devel/library/base/html/attr.html


571
meanNormCounts = counts(dds, normalized = TRUE) %>%
572
573
    as_df %>%
    rownames_to_column("ensembl_gene_id") %>%
574
575
576
577
    tbl_df %>%
    gather(replicate, norm_count, - ensembl_gene_id) %>%
    inner_join(exDesign) %>%
    group_by(ensembl_gene_id, condition) %>%
578
579
    summarize(mean_norm_count = mean(norm_count)) %>%
    ungroup()
580

Holger Brandl's avatar
Holger Brandl committed
581

herseman's avatar
herseman committed
582
## add base means to diff??ex summary
583
#numResultsBeforeMerge = nrow(deResults)
584
585
deResults = meanNormCounts %>%
    # gather(condition, norm_count, - ensembl_gene_id) %>%
586
    inner_join(., ., by = "ensembl_gene_id", suffix = c("_1", "_2")) %>%
587
#    filter(ac(condition_1)<ac(condition_2)) %>%
588
# add diffex status
589
    inner_join(deResults)
Holger Brandl's avatar
Holger Brandl committed
590
591

#+ fig.width=16, fig.height=14
592
deResults %>% ggplot(aes(0.5 * log2(mean_norm_count_1 * mean_norm_count_2), log2(mean_norm_count_2 / mean_norm_count_1), color = is_hit)) +
593
594
    geom_point(alpha = 0.1) +
    geom_hline(yintercept = 0, color = "red") +
herseman's avatar
herseman committed
595
    facet_grid(condition_1 ~ condition_2)
Holger Brandl's avatar
Holger Brandl committed
596
597


598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
## MA plot using ggmapplot() from the ggpubr package (https://rpkgs.datanovia.com/ggpubr/reference/ggmaplot.html)
maPlots <- lapply(unique(paste0(deResults$condition_1, "_vs_", deResults$condition_2)), function(x){
    plotData <- deResults %>% filter(paste0(condition_1, "_vs_", condition_2) == x) %>%
        rename(log2FoldChange = c1_over_c2_logfc)

    plotData %>% ggmaplot(main = paste0(unique(plotData$condition_1), " -> ", unique(plotData$condition_2)),
        fdr = ifelse(! is.null(qcutoff), qcutoff, pcutoff),
        fc = 2^lfc_cutoff,
        size = 0.4,
        #palette = c("#B31B21", "#1465AC", "darkgray"),
        genenames = as.vector(plotData$ensembl_gene_id),
        legend = "top", top = 20,
        font.label = c("bold", 11), label.rectangle = TRUE,
        font.legend = "bold",
        font.main = "bold",
        ggtheme = ggplot2::theme_minimal())
})

pdf("ma_plots.pdf", onefile = TRUE)
for (i in seq(length(maPlots))) {
    print(maPlots[[i]])
}
dev.off()


Holger Brandl's avatar
Holger Brandl committed
623
624
625
626
627
628
629
630
631
#deResults %$% pvalue %>%  log10() %>% quantile(0.05, na.rm=T)

#' A volcano plot displays unstandardized signal (e.g. log-fold-change) against noise-adjusted/standardized signal (e.g. t-statistic or -log(10)(p-value) from the t-test).
#' This scatter plot is used to quickly identify changes in large datasets composed of replicate data.
#' Here we have log2-fold-change on the X-axis and -log10 of the p-value on the y-axis.
#' This results in datapoints with low p-values (highly significant) appearing toward the top of the plot. For the x-axis the log of the fold-change is used so that changes in both directions (left and right) appear equidistant from the center.
#' Therefore interesting points are those, which are found toward the top of the plot that are far to either the left- or the right-hand side. These represent values that display large magnitude fold changes (hence being left- or right- of center) as well as high statistical significance (hence being toward the top).
#' see: https://en.wikipedia.org/wiki/Volcano_plot_%28statistics%29)

632
maxX = quantile(deResults$c1_over_c2_logfc, c(0.005, 0.99), na.rm = TRUE) %>%
633
634
635
    abs %>%
    max

Holger Brandl's avatar
Holger Brandl committed
636
##Volcano plots
637
hitCounts = filter(deResults, is_hit) %>%
638
    count(condition_1, condition_2, c1_overex) %>%
639
    rename(hits = n) %>%
640
641
642
643
## complement hit counts with directed contrasts without any diffex genes
    right_join(merge(mutate(contrasts), data_frame(c1_overex = c(T, F)), by = NULL)) %>%
    mutate(hits = if_else(is.na(hits), as.integer(0), hits)) %>%
    merge(data.frame(c1_overex = c(T, F), x_pos = c(maxX - maxX * 0.1, - maxX + maxX * 0.1)))
Holger Brandl's avatar
Holger Brandl committed
644

645
hitCounts %<>% mutate(y_pos = quantile(- log10(deResults$pvalue), 0.987, na.rm = TRUE))
Holger Brandl's avatar
Holger Brandl committed
646

Holger Brandl's avatar
Holger Brandl committed
647
#+ fig.width=16, fig.height=14
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663


## todo remove; part of commons v1.41
trim_outliers <- function(values, probs=c(0.05, 0.95)){
    values = deResults$pvalue
    stopifnot(length(probs) == 2)
    quantiles = quantile(values, probs, na.rm = TRUE)

    pmax(quantiles[1], pmin(quantiles[2], values))
}



deResults %>%
    mutate(p_trimmed = trim_outliers(pvalue, c(0.01, 1))) %>%
    filter(! is.na(is_hit)) %>%
664
    ggplot(aes(c1_over_c2_logfc, - log10(p_trimmed), color = is_hit)) +
665
# geom_jitter(alpha = 0.1, position = position_jitter(height = 0.2)) +
Holger Brandl's avatar
Holger Brandl committed
666
    geom_point(alpha = 0.1) +
Holger Brandl's avatar
Holger Brandl committed
667
#    theme_bw() +
668
669
    xlim(- maxX, maxX) +
    scale_color_manual(values = c("TRUE" = "red", "FALSE" = "black")) +
Holger Brandl's avatar
Holger Brandl committed
670
#    ggtitle("Insm1/Ctrl") +
671
672
673
## http://stackoverflow.com/questions/19764968/remove-point-transparency-in-ggplot2-legend
    guides(colour = guide_legend(override.aes = list(alpha = 1))) +
## tweak axis labels
Holger Brandl's avatar
Holger Brandl committed
674
    xlab(expression(log[2]("x/y"))) +
675
    ylab(expression(- log[10]("p value"))) +
Holger Brandl's avatar
Holger Brandl committed
676
677
#  ylim(0,50) +

678
## add hit couts
679
    geom_text(aes(label = hits, x = x_pos, y = y_pos), color = "red", size = 8, data = hitCounts) +
herseman's avatar
herseman committed
680
    facet_grid(condition_1 ~ condition_2)
Holger Brandl's avatar
Holger Brandl committed
681
682


Holger Brandl's avatar
Holger Brandl committed
683
########################################################################################################################
684
# Count Table Exports
685
686
687
688
689
#install_package(c("rtracklayer", "GenomicFeatures"))
load_pack("rtracklayer")
load_pack("GenomicFeatures")
## horrible hack to avoid masking of dplyr::sel
select <- dplyr::select
Holger Brandl's avatar
Holger Brandl committed
690

691
692
## import gtf
# if(!file.exists(gene.model)) stop(paste("GTF File:", gene.model, " does NOT exist. Run with: \n", runstr))
Holger Brandl's avatar
Holger Brandl committed
693

694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
if(!is.null(gtf_file)){
    gtf <- import.gff(
        gtf_file,
        format = "gtf",
        feature.type = "exon"
    )

    txdb <- makeTxDbFromGRanges(gtf)
    exons <- exonsBy(
        txdb,
        by = "gene"
    )

    genes <- genes(txdb) ## genomic coordinates of genes

    ## after filtering only a subset of genes remains
    ## genes in the GRangesList must match the number of genes in dds
    exons <- exons[names(exons) %in% rownames(assay(dds))]
    rowRanges(dds) <- exons
}
Holger Brandl's avatar
Holger Brandl committed
714

715

Holger Brandl's avatar
Holger Brandl committed
716
# Load transcriptome annotations needed for results annotation
717
718
719
attr <- c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position")

if (exists("gtf")) {
720
721
722
723
724
725
726
727
728
729
    geneDescs <- gtf %>% 
        as.data.frame() %>% 
        rename_all(recode,
            gene_id = "ensembl_gene_id",
            gene_name = "external_gene_name",
            gene_description = "description"
        ) %>% 
        select(one_of(c("ensembl_gene_id", "external_gene_name", "description"))) %>%
        distinct()

730
731
732
733
734
735
736
737
738
    geneInfo <- genes %>% 
        as.data.frame() %>% 
        rename_all(recode,
            gene_id = "ensembl_gene_id",
            gene_name = "external_gene_name",
            seqnames = "chromosome_name",
            start = "start_position",
            end = "end_position"
        ) %>% 
739
740
        select(one_of(c("ensembl_gene_id", "chromosome_name", "start_position", "end_position"))) %>%
        left_join(geneDescs)
741
742
743
744

} else if (is.null(gene_info_file)) {
    ## optionally install bioconducor package
    install_package("biomaRt")
745

746
    ensembl_dataset = if (! is.null(opts$ensembl_db)) paste0(opts$ensembl_db, "_gene_ensembl")  else guess_mart(countData$ensembl_gene_id)
747

748
749
750
751
752
753
754
755
    geneInfo = quote({
        ## mart = biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl"))??
        #  mart = biomaRt::useDataset(guess_mart(countData$ensembl_gene_id), mart = biomaRt::useMart("ensembl"))
        ## todo fix this https://support.bioconductor.org/p/74322/
        # mart = biomaRt::useDataset(guess_mart(countData$ensembl_gene_id), mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", host="www.ensembl.org"))
        # mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = "mmusculus_gene_ensembl", host = "dec2016.archive.ensembl.org", path = "/biomart/martservice", archive = FALSE)
        mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = ensembl_dataset, host = "aug2017.archive.ensembl.org", path = "/biomart/martservice", archive = FALSE)

756
        attr %>%
757
758
759
760
            biomaRt::getBM(mart = mart) %>%
            tbl_df
    }) %>% cache_it("geneInfo")
} else {
761
    geneInfo = read_tsv(gene_info_file)
762
763
764
    colnames(geneInfo)[1] = "ensembl_gene_id"
}

765
766
767
768
769
770
771
## fix missing fields from GTF
# https://stackoverflow.com/a/49280837/1274242
missing_cols <- attr[!attr %in% colnames(geneInfo)]
if (length(missing_cols) > 0){
    geneInfo <- geneInfo %>% 
        `is.na<-`(missing_cols)
}
Holger Brandl's avatar
Holger Brandl committed
772

773
geneDescs = transmute(geneInfo, ensembl_gene_id, gene_name = external_gene_name, gene_description = description)
herseman's avatar
herseman committed
774

Holger Brandl's avatar
Holger Brandl committed
775
## save as reference for downstream analysis
776
write_tsv(geneInfo, path = paste0(resultsBase, "geneInfo.txt"))
Holger Brandl's avatar
Holger Brandl committed
777
778


779
780
781
782
783
784
785
786
if (exists("gtf")) {
    fpkm(dds, robust = TRUE) %>%
        round(digits = 3) %>%
        as.data.frame() %>%
        rownames_to_column("ensembl_gene_id") %>% 
        left_join(geneDescs) %>% 
        push_left(c("gene_name", "gene_description")) %>%
        write_tsv(paste0(resultsBase, "fpkms_by_replicate.txt"))
Holger Brandl's avatar
Holger Brandl committed
787

788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
} else {
    warning("RPKM calculation uses gene genomic coordinates to define gene length, and not exonic lengths")
    # Export counts per replicate as double-normalized FPKMs=(read_count * 10^9)/(gene_length * library_size)
    geneLengths = transmute(geneInfo, ensembl_gene_id, gene_length = end_position - start_position)

    fpkms = countMatrix %>%
        as.data.frame %>%
        rownames_to_column("ensembl_gene_id") %>% 
        gather(replicate, num_tags, - ensembl_gene_id) %>%
        tbl_df %>%
        left_join(geneLengths, by = "ensembl_gene_id") %>%
    ## calculate normalized counts
        group_by(replicate) %>%
        mutate(lib_size = sum(num_tags), fpkm = ((1E9 / lib_size) * (num_tags / gene_length)) %>% round(digits = 3)) %>%
        ungroup()

    #fpkms %>% ggplot(aes(fpkm)) + geom_histogram() + scale_x_log10(labels=comma)

    fpkms %>%
        transmute(ensembl_gene_id, replicate, fpkm) %>%
        spread(replicate, fpkm) %>% 
        left_join(geneDescs) %>% 
        push_left(c("gene_name", "gene_description")) %>% 
        write_tsv(paste0(resultsBase, "fpkms_by_replicate.txt"))

    # Aggregate FPKMs by condition and export into tabele
    fpkms %>%
        inner_join(exDesign, by = "replicate") %>%
    # group_by_(.dots = c(contrastAttribute, "ensembl_gene_id")) %>%
    #     group_by(!!contrastAttribute, ensembl_gene_id) %>%
        group_by(condition, ensembl_gene_id) %>%
        summarize(mean_fpkm = mean(fpkm)) %>%
        ungroup() %>%
        spread(condition, mean_fpkm) %>%
        left_join(geneDescs) %>%
        push_left(c("gene_name", "gene_description")) %>%
        write_tsv(paste0(resultsBase, "fpkms_by_condition.txt"))

    # [FPKMs as Matrix](`r paste0(resultsBase, "fpkms_by_condition.txt")`)
}
828
# Export counts per replicate as TPMs (transcript per million)
829
# calculation of tpms and generation of a wide table including gene description and the external gene name
Holger Brandl's avatar
Holger Brandl committed
830

831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
if (exists("gtf")) {
    ## calculate effective gene length, adapted from the source code of DESeq2::fpkm
    effectiveGeneLengths <- rowRanges(dds) %>%
        GenomicRanges::reduce() %>%
        width() %>%
        sum() %>%
        as.data.frame() %>% 
        rename(gene_length = 1) %>%
        rownames_to_column("ensembl_gene_id")

    tpms = countMatrix %>%
        as.data.frame %>%
        rownames_to_column("ensembl_gene_id") %>%
        gather(replicate, num_tags, - ensembl_gene_id) %>%
        tbl_df %>%
        left_join(effectiveGeneLengths) %>%
        mutate(length_norm_count = num_tags / gene_length) %>%
        group_by(replicate) %>%
        mutate(length_norm_sum = sum(length_norm_count, na.rm = T), tpm = (length_norm_count * (1E6 / length_norm_sum)) %>% round(digits = 3)) %>%
        ungroup()

    tpms %>%
        select(ensembl_gene_id, replicate, tpm) %>%
        spread(replicate, tpm) %>%
        left_join(geneDescs) %>%
        push_left(c("gene_name", "gene_description")) %>% 
        write_tsv(paste0(resultsBase, "tpms_by_replicate.txt"))
herseman's avatar
herseman committed
858

859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
} else {
    tpms = countMatrix %>%
        as.data.frame %>%
        rownames_to_column("ensembl_gene_id") %>%
        gather(replicate, num_tags, - ensembl_gene_id) %>%
        tbl_df %>%
        left_join(geneLengths) %>%
        mutate(length_norm_count = num_tags / gene_length) %>%
        group_by(replicate) %>%
        mutate(length_norm_sum = sum(length_norm_count, na.rm = T), tpm = (length_norm_count * (1E6 / length_norm_sum)) %>% round(digits = 3)) %>%
        ungroup()

    tpms %>%
        select(ensembl_gene_id, replicate, tpm) %>%
        spread(replicate, tpm) %>%
        left_join(geneDescs) %>%
        push_left(c("gene_name", "gene_description")) %>%
        write_tsv(paste0(resultsBase, "tpms_by_replicate.txt"))

    # Also average TPMs per condition and export
    tpms %>%
        inner_join(exDesign, by = "replicate") %>%
        group_by(condition, ensembl_gene_id) %>%
    # group_by(condition, ensembl_gene_id) %>%
        summarize(mean_tpm = mean(tpm)) %>%
    # select(ensembl_gene_id, replicate, gene_name, gene_description, mean_tpm) %>%
        spread(condition, mean_tpm) %>%
        left_join(geneDescs) %>%
        push_left(c("gene_name", "gene_description")) %>%
        write_tsv(paste0(resultsBase, "tpms_by_condition.txt"))
}
herseman's avatar
herseman committed
890
891
892
893
#-----------------------------



Holger Brandl's avatar
Holger Brandl committed
894
# Export the complete dataset for later analysis
Holger Brandl's avatar
Holger Brandl committed
895
deAnnot = deResults %>% left_join(geneInfo)
896
897
898
899
#inner_join(normCounts) %>%
#merge(., contrasts) %>%
#merge(.,., suffixes=c("_1", "_2"), by=NULL) %>%
#inner_join(baseMeanPerLvl, id='ensembl_gene_id') %>%
Holger Brandl's avatar
Holger Brandl committed
900

901
# Add non-exonic ratio if present
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
if (!is.null(opts$genic_counts)) {
    non_genic_counts <- read_tsv(genic_file)

    ## do we have the same replicates on both tables?
    assert(
        setequal(
            exDesign$replicate,
            unique(non_genic_counts$sample)
        ),
        "Samples in genic_file do not match replicate names in design_file."
    )

    ### Prepare non-genic table
    ## Non-genic counts per condition have to average
    non_genic_counts_wide <- non_genic_counts %>% 
        select(ensembl_gene_id, replicate = sample, nonexonic_ratio) %>%
        left_join(replicates2samples, by = "replicate") %>%
        group_by(ensembl_gene_id, condition) %>%
        summarize(nonexonic_ratio = round(mean(nonexonic_ratio, na.rm = TRUE), 3)) %>%
        inner_join(., ., by = "ensembl_gene_id", suffix = c("_1", "_2"))

    deAnnot <- inner_join(deAnnot, non_genic_counts_wide)
}
Holger Brandl's avatar
Holger Brandl committed
925

926
write_tsv(deAnnot, path = paste0(resultsBase, "de_results.txt"))
927
#deAnnot = read_tsv(paste0(resultsBase, "de_results.txt"))
928
#  [de_results.txt](`r paste0(resultsBase, "de_results.txt")`)
Holger Brandl's avatar
Holger Brandl committed
929
930

## also export results which we'll always need for pathways overlays
931
932
933
deAnnot %>%
    transmute(
    ensembl_gene_id,
herseman's avatar
herseman committed
934
    contrast = paste(condition_1, "vs", condition_2),
935
    plot_score = - log10(pvalue) * ifelse(c1_overex, 1, - 1)
936
937
    ) %>%
    spread(contrast, plot_score) %>%
Holger Brandl's avatar
Holger Brandl committed
938
    write_tsv(paste0(resultsBase, "plot_score_matrix.txt"))
Holger Brandl's avatar
Holger Brandl committed
939
940
941
942

#deAnnot %>% filter(ensembl_gene_id=="FBgn0000015")


943
# subset data according to contrasts
Holger Brandl's avatar
Holger Brandl committed
944
if (sub_data) {
945
946
947
    dir.create("de_results")
    comp <- apply(contrasts, 1, paste0, collapse = "_vs_")
    lapply(comp, function(x) {
948
949
950
951
952
        deAnnot %>%
            mutate(contrast = paste(condition_1, condition_2, sep = "_vs_")) %>%
            filter(grepl(x, contrast)) %>%
            select(- contrast) %>%
            write_tsv(paste("de_results/", x, ".txt", sep = ""))
953
    })
954
}
955
956


957
## todo understand purpose and effect of indpendentFiltering (see https://support.bioconductor.org/p/57128/)
Holger Brandl's avatar
Holger Brandl committed
958
959
960
961
962
#deResults %>% count(is_hit)
#deAnnot %>% count(is_hit)
#deAnnot %>% count(is.na(padj))


Holger Brandl's avatar
Holger Brandl committed
963

Lena Hersemann's avatar
Lena Hersemann committed
964
########################################################################################################################
Holger Brandl's avatar
Holger Brandl committed
965
#' ### Count Outlier Analysis
Lena Hersemann's avatar
Lena Hersemann committed
966
967

#' For how many genes did Deseq did not assign a p-value (due to outliers; see section 1.5.3 os [DEseq manual](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#count-outlier-detection))
Holger Brandl's avatar
Holger Brandl committed
968
969


Holger Brandl's avatar
Holger Brandl committed
970
971
972
973
974
975
976
977
978
979

# summary(results(dds, cooksCutoff=FALSE))

#' See "3.6 Approach to count outliers" in the [DESEQ2 manual](https://bioc.ism.ac.jp/packages/3.4/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf) for details about outlier handling
#'
#' Def: Cook's distance is a measure of how much a single sample is influencing the fitted coefficients for a gene, and a large value of Cook's distance is intended to indicate an outlier count
# par(mar=c(8,5,2,2))
# boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2)


Holger Brandl's avatar
Holger Brandl committed
980
981
deAnnot %>%
    filter(is.na(pvalue)) %>%
herseman's avatar
herseman committed
982
    mutate(contrast = paste(condition_1, "vs", condition_2)) %>%
983
    count(contrast) %>%
Holger Brandl's avatar
Holger Brandl committed
984
    n_as("genes_with_NA_pvalue") %>%
985
    kable(caption = "Failed comparsions by sample")
Holger Brandl's avatar
Holger Brandl committed
986
987


Lena Hersemann's avatar
Lena Hersemann committed
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
if ((deAnnot %>% filter(is.na(pvalue)) %>% nrow) > 0) {

    na_counts = norm_counts %>%
        as.data.frame() %>%
        rownames_to_column("ensembl_gene_id") %>%
        tbl_df %>%
        gather(replicate, norm_count, - ensembl_gene_id) %>%
        right_join(deAnnot %>%
            filter(is.na(pvalue)) %>%
            select(ensembl_gene_id, condition_1, condition_2)) %>%
    # glimpse
    # filter(ensembl_gene_id=="ENSMUSG00000000120")
        left_join(select(exDesign, replicate, condition)) %>%
        filter(condition_1 == condition | condition_2 == condition)

    na_counts %>%
        filter(ensembl_gene_id %in% unique(ensembl_gene_id)[1 : 6]) %>%
        ggplot(aes(condition, norm_count, label = replicate)) +
        geom_point() +
        geom_text() +
        facet_wrap(~ ensembl_gene_id, scales = "free_y") +
        ggtitle("exmple gene counts by replicate")

    na_counts %>%
        group_by(ensembl_gene_id) %>%
        filter(norm_count == max(norm_count)) %>%
        ggplot(aes(replicate)) +
        geom_bar() +