featcounts_deseq_mf.R 43.5 KB
Newer Older
Holger Brandl's avatar
Holger Brandl committed
1
#!/usr/bin/env Rscript
Holger Brandl's avatar
Holger Brandl committed
2

Holger Brandl's avatar
Holger Brandl committed
3
4
5
#' # Differential Expression Analysis
#'
#' Contents
Holger Brandl's avatar
Holger Brandl committed
6
#'
Holger Brandl's avatar
Holger Brandl committed
7
8
9
10
11
12
13
14
15
#' - [Data Preparation](#data-preparation)
#' - [Quality Control](#quality-control)
#'     - [Size Factors](#size-factors)
#'     - [Global Dispersion Model](#global-dispersion-model)
#' - [PCA and Clustering](#pca-and-clustering)
#' - [Differential Expression Test](#differential-expression-test)
#'     - [MA and Volcano plots](#ma-and-volcano-plots)
#'     - [Count Outlier Analysis](#count-outlier-analysis)
#' - [Hits Summary](#hits-summary)
Holger Brandl's avatar
Holger Brandl committed
16
#' - [Exported Data](#exported-data)
Holger Brandl's avatar
Holger Brandl committed
17
#'
Holger Brandl's avatar
Holger Brandl committed
18

Holger Brandl's avatar
Holger Brandl committed
19
# to rebuild toc do knitr::spin("featcounts_deseq_mf.R", FALSE)
Holger Brandl's avatar
Holger Brandl committed
20

Holger Brandl's avatar
Holger Brandl committed
21
22
23
#' Created by: `r system("whoami", intern=T)`
#'
#' Created at: `r format(Sys.Date(), format="%B %d %Y")`
Holger Brandl's avatar
Holger Brandl committed
24

Holger Brandl's avatar
Holger Brandl committed
25

Holger Brandl's avatar
Holger Brandl committed
26
#+ include=FALSE
Holger Brandl's avatar
Holger Brandl committed
27
28
suppressMessages(require(docopt))

29
doc = '
Holger Brandl's avatar
Holger Brandl committed
30
Perform a differential gene expression analysis using deseq2
Holger Brandl's avatar
Holger Brandl committed
31
32
33
Usage: featcounts_deseq_mf.R [options] <count_matrix> <design_matrix>

Options:
Holger Brandl's avatar
Holger Brandl committed
34
35
36
37
38
--contrasts=<tab_delim_table>   Table with sample pairs for which dge analysis should be performed
--qcutoff <qcutoff>             Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01]
--pcutoff <pcutoff>             Override q-value filter and filter by p-value instead
--min_count <min_count>         Minimal expression in any of the sample to be included in the final result list [default: 10]
--out <name_prefix>             Name to prefix all generated result files [default: ]
herseman's avatar
herseman committed
39
--design <formula>              Design fomula for DeSeq with contrast attribute at the end [default: condition]
40
--lfc <lfc_cutoff>              Just report genes with abs(lfc) > lfc_cutoff as hits [default: 1.0]
41
--ensembl_db <ensembl_db>       Ensebmbl db to be used. If not specified inferred from data. TODO implement NONE here!!
42
--gene_info <info_file>         Gene information file downloaded from the ensembl website if bioMart version is not present
43
--betaprior <boolean>           Apply betaprior when running DESeq2 [default: TRUE]
44
--sub_data <boolean>            Subset de_results by contrasts and store the data in a subfolder [default: FALSE]
Holger Brandl's avatar
Holger Brandl committed
45
46
'

47
#commandArgs <- function(x) c("--design", "'batch+condition'", "--contrasts", "../contrasts.txt star_counts_matrix_reduced_sampleset.txt", "basic_design_reduced_sampleset.txt")
48
#commandArgs <- function(x) c("--design", "'condition'", "--contrasts", "../contrasts.txt", "--sub_data", "TRUE", "../alignments/star_counts_matrix.txt", "../basic_design.txt")
herseman's avatar
herseman committed
49

50
opts = docopt(doc, commandArgs(TRUE))
Holger Brandl's avatar
Holger Brandl committed
51
52


Holger Brandl's avatar
Holger Brandl committed
53
54
## load some packages first because of name-space order
library(knitr)
55
56
#source("https://bioconductor.org/biocLite.R")
#biocLite("DESeq2")
Holger Brandl's avatar
Holger Brandl committed
57
library(DESeq2)
58
#load_pack(readr)
Holger Brandl's avatar
Holger Brandl committed
59
60


Holger Brandl's avatar
Holger Brandl committed
61
62
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.45/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.45/R/ggplot_commons.R")
63

Holger Brandl's avatar
Holger Brandl committed
64
65
66
## also load common helper for expression data analysis
# source(interp_from_env("${NGS_TOOLS}/dge_workflow/diffex_commons.R"))
devtools::source_url("https://git.mpi-cbg.de/bioinfo/ngs_tools/raw/v6/dge_workflow/diffex_commons.R")
Holger Brandl's avatar
Holger Brandl committed
67

Holger Brandl's avatar
Holger Brandl committed
68
69

## process input arguments
70
71
72
count_matrix_file = opts$count_matrix
design_matrix_file = opts$design_matrix
contrasts_file = opts$contrasts
73
use_betaPrior = as.logical(opts$betaprior)
74
sub_data = as.logical(opts$sub_data)
75
76
gene_info_file = opts$gene_info
assert(is.null(gene_info_file) || file.exists(gene_info_file), "invalid gene_info_file")
Holger Brandl's avatar
Holger Brandl committed
77

78
resultsBase = if (str_length(opts$out) > 0) paste0(opts$out, ".") else ""
Holger Brandl's avatar
Holger Brandl committed
79

80
81
82
83
84
pcutoff = if (is.null(opts$pcutoff))NULL else as.numeric(opts$pcutoff)
qcutoff = if (is.numeric(pcutoff))NULL else as.numeric(opts$qcutoff)
if (is.numeric(pcutoff))opts$qcutoff = NULL

lfc_cutoff = if (is.null(opts$lfc))0 else as.numeric(opts$lfc)
Holger Brandl's avatar
Holger Brandl committed
85

Holger Brandl's avatar
Holger Brandl committed
86
## extract the sample for the design
87
designFormula = opts$design
Holger Brandl's avatar
Holger Brandl committed
88
## consider last element of design formula as sample attribute
89
90
91
contrastAttribute = str_split(designFormula, "[+]") %>%
    unlist() %>%
    tail(1)
92
93
# assert(designFormula == "condition" || ! is.null(design_matrix_file), "custom designs are not supported without a design matrix")
assert(str_detect(designFormula, ".*condition$"))
Holger Brandl's avatar
Holger Brandl committed
94

Holger Brandl's avatar
Holger Brandl committed
95

Holger Brandl's avatar
Holger Brandl committed
96
97

#' Run configuration was
98
99
100
vec_as_df(unlist(opts)) %>%
    filter(! str_detect(name, "^[<-]")) %>%
    kable()
Holger Brandl's avatar
Holger Brandl committed
101

102
vec_as_df(unlist(opts)) %>%
103
104
    filter(! str_detect(name, "^[<-]")) %>%
    write_tsv(".run_configuration.txt")
105

Holger Brandl's avatar
Holger Brandl committed
106
107
108
109
110
111
112
113
114
########################################################################################################################
#' ## Data Preparation

#' The detection of differentially expressed genes is performed using the R package DEseq2 (http://bioconductor.org/packages/release/bioc/html/DESeq2.html), which requires a count matrix as input.
#' The count data are presented as a table which reports, for each sample, the number of sequence fragments that have been assigned to each gene.
#' An important analysis question is the quantification and statistical inference of systematic changes between conditions, as compared to within-condition variability.

#' Working Directory: `r getwd()`

115
countData = read_tsv(count_matrix_file)
Holger Brandl's avatar
Holger Brandl committed
116
117

## save a backup into the current working directory as a reference
Holger Brandl's avatar
Holger Brandl committed
118
countData %>% write_tsv(paste0(resultsBase, "input_counts.txt"))
Holger Brandl's avatar
Holger Brandl committed
119
120

#' Structure of input matrix was
121
glimpse(countData, width = 60)
Holger Brandl's avatar
Holger Brandl committed
122

123
124
125
countMatrix = countData %>%
    column2rownames("ensembl_gene_id") %>%
    as.matrix()
Holger Brandl's avatar
Holger Brandl committed
126
127

# Expression Filtering
128
genesBefore = nrow(countMatrix)
Lena Hersemann's avatar
Lena Hersemann committed
129
countMatrix %<>% filterByExpression(as.integer(opts$min_count))
130
genesAfter = nrow(countMatrix)
Lena Hersemann's avatar
Lena Hersemann committed
131

Holger Brandl's avatar
Holger Brandl committed
132
133
#' Counts were filtered to only retain genes which had more that `r opts$min_count` alignments in at least one replicate. This reduced the number of genes from `r genesBefore` to `r genesAfter`.

Holger Brandl's avatar
Holger Brandl committed
134
## todo this is required now , so why all the checking
135
if (! is.null(design_matrix_file)) {
136
    replicates2samples = read_tsv(design_matrix_file) #%>% select(replicate, sample)
137
}else {
herseman's avatar
herseman committed
138
    warning("DEPRECATED extracting conditions from replicates is bad practise and a design file should be provided instead")
139
    get_sample_from_replicate = function(repName) str_match(repName, "(.*)_([R]|rep)?[0-9]{1,2}$")[, 2]
Holger Brandl's avatar
Holger Brandl committed
140

herseman's avatar
herseman committed
141
    replicates2samples = data_frame(replicate = colnames(countMatrix)) %>% mutate(condition = get_sample_from_replicate(replicate)) # %>% distinct_all()
Holger Brandl's avatar
Holger Brandl committed
142
}
Holger Brandl's avatar
Holger Brandl committed
143

144
145
write_tsv(replicates2samples, "basic_design.txt")

Holger Brandl's avatar
Holger Brandl committed
146
# Define or load a contrasts matrix
147
if (! is.null(contrasts_file)) {
148
    contrasts = read_tsv(contrasts_file)
149
150
}else {
    assert(! is.null(replicates2samples), "Can not auto-create contrasts without design matrix") ## cant work yet because sample column with have random name
herseman's avatar
herseman committed
151
152
    contrasts = distinct_all(replicates2samples, condition) %>%
        select(condition) %>%
153
        merge(., ., suffixes = c("_1", "_2"), by = NULL) %>%
herseman's avatar
herseman committed
154
        filter(ac(condition_1) > ac(condition_2)) %>%
155
    #        filter(ac(condition_1)!=ac(condition_2)) %>%
Holger Brandl's avatar
Holger Brandl committed
156
        fac2char()
Holger Brandl's avatar
Holger Brandl committed
157
158

    write_tsv(contrasts, paste(resultsBase, "contrasts.txt"))
Holger Brandl's avatar
Holger Brandl committed
159
160
}

Holger Brandl's avatar
Holger Brandl committed
161
162

#' The used design matrix is
Holger Brandl's avatar
Holger Brandl committed
163
datatable(replicates2samples)
Holger Brandl's avatar
Holger Brandl committed
164

Holger Brandl's avatar
Holger Brandl committed
165
166
167
168
169
#' The used design formula is.
designFormula

#' For more information about designs see section 1.6 "multi-factor designs" in [deseq-manual](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf)

Holger Brandl's avatar
Holger Brandl committed
170
#' The contrasts of interest are
Holger Brandl's avatar
Holger Brandl committed
171
172
173
174
175
176
contrasts %>% kable()

## gene specific factors
#normalizationFactors(dds)

## sizeFactor R help:sigEnrResults
177
178
#dds = makeExampleDESeqDataSet()
#dds = estimateSizeFactors( dds )
Holger Brandl's avatar
Holger Brandl committed
179
180
181
#sizeFactors(dds)

## see "3.11 Sample-/gene-dependent normalization factors" in the DEseq2 manual for details
Holger Brandl's avatar
Holger Brandl committed
182
183

## old non-mf approach
184
#exDesign = data.frame(condition=colnames(countMatrix) %>% get_sample_from_replicate)
Holger Brandl's avatar
Holger Brandl committed
185
186

## new mf-approach
187
188
exDesign = data_frame(replicate = colnames(countMatrix)) %>%
    mutate(col_index = row_number()) %>%
189
    right_join(replicates2samples, by = "replicate") %>%
190
    arrange(col_index) #%>% transmute(condition=condition_2ndwt) %>% fac2char
Holger Brandl's avatar
Holger Brandl committed
191

192
assert(! any(is.na(exDesign$col_index)), "assay design file is not compatible to count matrix column names")
193

194
195
196
197
198
exDesign %>% ggplot(aes(condition)) +
    geom_bar() +
    ylab("# samples") +
    coord_flip() +
    ggtitle("samples per condition")
199

Holger Brandl's avatar
Holger Brandl committed
200
## infer the sample to be used from the formula
201
#designFormula = "condition_2ndwt + batch"
Holger Brandl's avatar
Holger Brandl committed
202
203


204
#exDesign %<>% rename(sample, condition_2ndwt)
Holger Brandl's avatar
Holger Brandl committed
205

206
#exDesign = replicates2samples %>%  arrange(colnames(countMatrix))
Holger Brandl's avatar
Holger Brandl committed
207
208

# valiadate that contrasts model is compatible with data
209
210
assert(setequal(names(contrasts), c("condition_1", "condition_2")), "contrasts matrix has incorrect column headers. Expected: condition_1, condition_2")

211
212
213
214
215
216
designSamples = as_df(exDesign)[, contrastAttribute] %>%
    unique %>%
    sort
contrastSamples = contrasts %>% gather %$% value %>%
    unique %>%
    sort
217

218
219
220
if (! all(contrastSamples %in% designSamples)) {
    warning("design   : ", paste(designSamples, collapse = ", "))
    warning("contrasts: ", paste(contrastSamples, collapse = ", "))
Holger Brandl's avatar
Holger Brandl committed
221
    stop("experiment design is not consistent with contrasts")
Holger Brandl's avatar
Holger Brandl committed
222
223
}
#stopifnot(all((contrasts %>% gather %$% value %>% unique) %in% exDesign$condition))
Holger Brandl's avatar
Holger Brandl committed
224

Holger Brandl's avatar
Holger Brandl committed
225
#http://www.cookbook-r.com/Formulas/Creating_a_formula_from_a_string/
226
227
228
dds = DESeqDataSetFromMatrix(countMatrix, exDesign, formula(as.formula(paste("~", designFormula))))
#dds = estimateSizeFactors(dds)
#dds = estimateDispersions(dds)
229
230
231
# dds = DESeq(dds, parallel = T)

dds = DESeq(dds, parallel = T, betaPrior = use_betaPrior)
Holger Brandl's avatar
Holger Brandl committed
232

233

Holger Brandl's avatar
Holger Brandl committed
234
235


Holger Brandl's avatar
Holger Brandl committed
236
237
238
########################################################################################################################
#' ## Quality Control

Holger Brandl's avatar
Holger Brandl committed
239
240
#' ### Size Factors

Holger Brandl's avatar
Holger Brandl committed
241
242
243
244
#' For details about size factor normalziation and calculation see https://www.biostars.org/p/79978/
#'
#' From the DESeq docs about how size factors are used: The sizeFactors vector assigns to each column of the count matrix a value, the size factor, such that count values in the columns can be brought to a common scale by dividing by the corresponding size factor. Counts are divied by size factors.

Holger Brandl's avatar
Holger Brandl committed
245
sizeFactors(dds) %>%
Holger Brandl's avatar
Holger Brandl committed
246
247
#    set_names(colnames(countMatrix)) %>%
    vec_as_df("replicate", "size_factor") %>%
248
249
    ggplot(aes(replicate, size_factor)) +
    geom_bar(stat = "identity") +
Holger Brandl's avatar
Holger Brandl committed
250
    ggtitle("Size Factors estimated by DESeq") +
251
    coord_flip()
Holger Brandl's avatar
Holger Brandl committed
252
253
254
255
256

# From DESeq manual: The regularized log transformation is preferable to the variance stabilizing transformation if the size factors vary widely.



Holger Brandl's avatar
Holger Brandl committed
257
#' ### Global Dispersion Model
Holger Brandl's avatar
Holger Brandl committed
258
259
260
261
262
263

#' Plot of the per-gene dispersion estimates together with the fitted mean-dispersion relationship.
#' The dispersion plot shows how the estimates are shrunk from the gene wise values (black dots) toward the fitted estimates, with the final values used in testing being the blue dots.
#' The dispersion can be understood as the square of the coefficient of biological variation.  So, if a gene's expression typically differs from replicate to replicate sample by 20%, this gene's dispersion is: .20^2 = .04.
## The function estimateDispersions performs three steps. First, it estimates, for each gene and each (replicated) condition, a dispersion value, then, it fits, for each condition, a curve through the estimates. Finally, it assigns to each gene a dispersion value, using either the estimated or the fitted value.
#' The estimation of dispersions performs three steps. First, it estimates, for each gene and each (replicated) condition, a dispersion value, then, it fits, for each condition, a curve through the estimates. Finally, it assigns to each gene a dispersion value, using either the estimated or the fitted value.
Holger Brandl's avatar
Holger Brandl committed
264
plotDispEsts(dds, main = "Dispersion Model")
265

Holger Brandl's avatar
Holger Brandl committed
266
267


Holger Brandl's avatar
Holger Brandl committed
268
########################################################################################################################
Holger Brandl's avatar
Holger Brandl committed
269
#' ## PCA and Clustering
Holger Brandl's avatar
Holger Brandl committed
270
271

#' In order to assess overall similarity between samples two common statistical methods are used - Principal component analysis (PCA) and clustering.
herseman's avatar
herseman committed
272
#' This should provide answers to the questions: Which samples are similar to each other, which are different? Does this fit to the expectation from the experiment's design?
Holger Brandl's avatar
Holger Brandl committed
273
274
275
276
277
278
279
#' Using a regularized log transformation of the raw counts provides the advantage that it stabilizes the variance across the mean.


#' Principal component analysis (PCA) is a technique used to emphasize variation and bring out strong patterns in a dataset.
#' Principal components are the underlying structure in the data. They are the directions where there is the most variance, the directions where the data is most spread out. The data points/samples are projected onto the 2D plane such that they spread out in the two directions that explain most of the differences. The x-axis is the direction that separates the data points the most and the y-axis is a direction that separates the data the second most.

# Regularized log transformation for clustering/heatmaps, etc
280
rld = rlogTransformation(dds)
281
# vst = vst(dds) ## vst is supposed to be much 1k x faster
Holger Brandl's avatar
Holger Brandl committed
282

Holger Brandl's avatar
Holger Brandl committed
283
284
# extract rlogTransformation normalized count data
norm_counts <- assay(rld)
Holger Brandl's avatar
Holger Brandl committed
285
286


287
288
289
load_pack(limma)
load_pack(stringi)
load_pack(htmltools)
290
291
load_pack(plotly)
# install.packages("htmlwidgets")
292
293
294
295
296
297
298
299

#for (i in {designFormula %>% str_split("[+]") %>% unlist}) {
#    condition = pull(exDesign, i)
#    names(condition) = exDesign$replicate
#    # suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print))
#    makePcaPlot(t(norm_counts), color_by = condition, scale = T, title = "DESeq normalized count data WITHOUT batch correction") %>% ggplotly() %>% print
#}

Lena Hersemann's avatar
Lena Hersemann committed
300
effects = designFormula %>% str_split("[+]") %>% unlist
301
302
303
304
305
306
307
308
309


htmltools::tagList(lapply(effects, function(batchFactor) {
    # batchFactor=effects[1]
    condition = pull(exDesign, batchFactor)
    names(condition) = exDesign$replicate
    # suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print))
    {
        makePcaPlot(t(norm_counts), color_by = condition, scale = T, title = "DESeq normalized count data WITHOUT batch correction") +
Lena Hersemann's avatar
Lena Hersemann committed
310
        guides(color = guide_legend(title = batchFactor))
311
312
313
314
315
    } %>% ggplotly()
}))


# extract batch effect variables from the design formula and the corresponding data from the exDesign data.frame
Lena Hersemann's avatar
Lena Hersemann committed
316
317
318
319
320
adjvar <- designFormula %>%
    str_split("[+]") %>%
    unlist %>%
    stri_subset_regex("condition", negate = T)
adjvar_data <- data.frame(exDesign[, which(names(exDesign) %in% head(adjvar))])
321
322
323



324
325
treatDesign = exDesign %$% model.matrix(~ condition)

Lena Hersemann's avatar
Lena Hersemann committed
326
if (length(adjvar_data) == 1) {
327
    corr_data <- limma::removeBatchEffect(norm_counts, batch = adjvar_data[, 1], design = treatDesign)
Lena Hersemann's avatar
Lena Hersemann committed
328
}else if (length(adjvar_data) == 2) {
329
    corr_data <- removeBatchEffect(norm_counts, batch = adjvar_data[, 1], batch2 = adjvar_data[, 2], design = treatDesign)
Lena Hersemann's avatar
Lena Hersemann committed
330
}else if (length(adjvar_data) > 2) {
331
    print("Please note, the number of adjustment variables exceeds the maximal number of removable batch effects.")
Lena Hersemann's avatar
Lena Hersemann committed
332
}else {
333
    print("The design formula does not contain an adjustment variable for batch effect correction")
334
335
336
337
338
339
340
341
342
343
344
}


# for (i in colnames(adjvar_data)) {
#for (i in {designFormula %>% str_split("[+]") %>% unlist}) {
#    condition = pull(exDesign, i)
#    names(condition) = exDesign$replicate
#    # suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print))
#    makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print
#}

Lena Hersemann's avatar
Lena Hersemann committed
345
346
347
if (exists("corr_data")) {
    htmltools::tagList(lapply(effects, function(batchFactor) {
        # batchFactor=effects[1]
348
        # batchFactor="condition"
Lena Hersemann's avatar
Lena Hersemann committed
349
350
351
352
        condition = pull(exDesign, batchFactor)
        names(condition) = exDesign$replicate
        # suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print))
        {
353
            makePcaPlot(t(corr_data), color_by = condition, scale = F, title = "DESeq normalized count data WITH batch correction") +
Lena Hersemann's avatar
Lena Hersemann committed
354
355
356
357
358
359
            guides(color = guide_legend(title = batchFactor))
        } %>% ggplotly()
    }))
}else {
    print("No batch correction was done, as the design formula does not contain an adjustment variable for batch effect correction")
}
360
361


Holger Brandl's avatar
Holger Brandl committed
362
363
364
365
#' The Euclidean distance between samples are calculated after performing the regularized log transformation.
#' Using the calculated distance matrix, the samples are projected on a two-dimensional graph such that the distance between samples approximately corresponds to the biological coefficient of variation between those samples.
#' More information can be found here: https://en.wikipedia.org/wiki/Principal_component_analysis

366
367
368
369
370
#distsRL = dist(t(assay(rld)))
#hc = hclust(distsRL)
#distMatrix = as.matrix(distsRL)
distMatrix = as.matrix(dist(t(assay(rld))))
#rownames(distMatrix) = colnames(distMatrix) = with(exDesign(dds), paste(condition, sep=" : "))
Holger Brandl's avatar
Holger Brandl committed
371

Holger Brandl's avatar
Holger Brandl committed
372
countData %>% distinct(ensembl_gene_id)
373
labelcntData = countData %>%
374
375
376
377
378
    distinct_all(ensembl_gene_id) %>%
    column2rownames("ensembl_gene_id") %>%
    data.matrix() %>%
    round() %>%
    data.frame()
Holger Brandl's avatar
Holger Brandl committed
379

380
381
382
# load_pack(gplots)
# heatmap.2(distMatrix, labRow = colnames(labelcntData), labCol = colnames(labelcntData),
# symm = TRUE, trace = "none",
383
#key=F, col=colorpanel(100, "black", "white"),
384
# margin = c(8, 8), main = "Sample Distance Matrix")
Holger Brandl's avatar
Holger Brandl committed
385

386
387
rownames(distMatrix) = colnames(labelcntData)
colnames(distMatrix) = colnames(labelcntData)
Holger Brandl's avatar
Holger Brandl committed
388
389
390
391

# load_pack(d3heatmap)
load_pack(heatmaply)

Holger Brandl's avatar
Holger Brandl committed
392
393
## todo hide column dendrogram but keep cluster-order
#distMatrix %>% d3heatmap(xaxis_height=1, Colv = T, dendrogram="row")
Lena Hersemann's avatar
Lena Hersemann committed
394
print("DESeq normalized count data WITHOUT batch correction")
Holger Brandl's avatar
Holger Brandl committed
395
396
# distMatrix %>% d3heatmap(xaxis_height = 1)
distMatrix %>% heatmaply(colors=c("red","lightblue"),showticklabels=c(F,T))
Holger Brandl's avatar
Holger Brandl committed
397

Holger Brandl's avatar
Holger Brandl committed
398

Lena Hersemann's avatar
Lena Hersemann committed
399
400
401
if (exists("corr_data")) {
    print("DESeq normalized count data WITH batch correction")
    corr_distMatrix = as.matrix(dist(t(corr_data)))
Holger Brandl's avatar
Holger Brandl committed
402
403
    # corr_distMatrix %>% d3heatmap(xaxis_height = 1)
    corr_distMatrix %>% heatmaply(colors=c("red","lightblue"),showticklabels=c(F,T))
Lena Hersemann's avatar
Lena Hersemann committed
404
405
406
}else {
    # print("No batch correction was done, as the design formula does not contain an adjustment variable for batch effect correction")
}
Holger Brandl's avatar
Holger Brandl committed
407

408
409
## see http://www.bioconductor.org/help/workflows/rnaseqGene/
## DEBUG-START
410
# if (F) {
Holger Brandl's avatar
Holger Brandl committed
411
# assay(rld) %>% head
412
413
load_pack("genefilter")
load_pack("pheatmap")
414
415
mostSIg <- head(order(rowVars(assay(rld)), decreasing = TRUE), 20)
mat <- assay(rld)[mostSIg,]
416
417
418
419
420
421
422
mat <- mat - rowMeans(mat)

# df <- as.data.frame(colData(rld)[,c("cell","dex")])
#pheatmap(mat, annotation_col=df)
# pheatmap(mat)
pheatmap(mat, annotation_col = column2rownames(exDesign, "replicate") %>% select(- col_index))
# }
423
424
## DEBUG-END

Holger Brandl's avatar
Holger Brandl committed
425

Holger Brandl's avatar
Holger Brandl committed
426
########################################################################################################################
Holger Brandl's avatar
Holger Brandl committed
427
#' ## Differential Expression Test
Holger Brandl's avatar
Holger Brandl committed
428
429
430
431
432
433
434
435
436
437
438
439
#'
#' DESeq is an R package to analyse count data from high-throughput sequencing assays such as RNA-Seq and test for differential expression. The latest version, DESeq2, is used.
#' DESeq uses a model based on the negative binomial distribution and offers the following features:
#' Count data is discrete and skewed and is hence not well approximated by a normal distribution. Thus, a test based on the negative binomial distribution, which can reflect these properties, has much higher power to detect differential expression.
#' Tests for differential expression between experimental conditions should take into account both technical and biological variability. Recently, several authors have claimed that the Poisson distribution can be used for this purpose. However, tests based on the Poisson assumption (this includes the binomial test and the chi-squared test) ignore the biological sampling variance, leading to incorrectly optimistic p values. The negative binomial distribution is a generalisation of the Poisson model that allows to model biological variance correctly.
#' DESeq estimate the variance in a local fashion, using different coefficients of variation for different expression strengths. This removes potential selection biases in the hit list of differentially expressed genes, and gives a more balanced and accurate result.
#'
#' See deseq reference [docs](http://master.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf) for details.
#'
#' To understand fold-change shrinking and estimation check out
#' [Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2](http://genomebiology.com/2014/15/12/550/abstract)

Holger Brandl's avatar
Holger Brandl committed
440
441


Holger Brandl's avatar
Holger Brandl committed
442
# Run Deseq Test
443
444
445
#dds = DESeq(dds, fitType='local', betaPrior=FALSE)
#dds = DESeq(dds, fitType='local')
#dds = DESeq(dds)
Holger Brandl's avatar
Holger Brandl committed
446

447
#res = results(dds)
Holger Brandl's avatar
Holger Brandl committed
448
449
450
#resultsNames(dds)

#' Model Overview:
Holger Brandl's avatar
Holger Brandl committed
451

Holger Brandl's avatar
Holger Brandl committed
452
453
454
455
summary(results(dds))


## extract all de-sets according to our contrasts model
456
457
# install_package("plyr")
# load_pack(IHW) --> todo reenable once installation on macos is fixed (see https://support.bioconductor.org/p/92731/)
458

Holger Brandl's avatar
Holger Brandl committed
459
460
461
462
# low count filtering https://support.bioconductor.org/p/65256/
# independent filtering occurs within results() to save you from multiple test correction on genes with no power
# (see ?results and the vignette section about independent filtering, or the paper). T

463
#deResults = alply(contrasts, 1,  splat(function(condition_1, condition_2){
herseman's avatar
herseman committed
464
deResults = plyr::alply(contrasts, 1, plyr::splat(function(condition_1, condition_2){
Lena Hersemann's avatar
Lena Hersemann committed
465
    #DEBUG condition_1=as_df(contrasts)[1,1]; condition_2=as_df(contrasts)[1,2]
466
    # results(dds, contrast=c(contrastAttribute, condition_1, condition_2)) %>%
467
468
    # results(dds, contrast = c(contrastAttribute, condition_1, condition_2), lfcThreshold = lfc_cutoff, filterFun = ihw) %>%
    results(dds, contrast = c(contrastAttribute, condition_1, condition_2), lfcThreshold = lfc_cutoff) %>%
469
470
        as_tibble() %>%
        tibble::rownames_to_column("ensembl_gene_id") %>%
Holger Brandl's avatar
Holger Brandl committed
471
    ## see http://rpackages.ianhowson.com/bioc/DESeq2/man/results.html when using contrasts argument
472
        rename(c1_over_c2_logfc = log2FoldChange) %>%
herseman's avatar
herseman committed
473
        mutate(condition_1 = condition_1, condition_2 = condition_2) #%>% head %>% print_head
474
})) %>% bind_rows
Holger Brandl's avatar
Holger Brandl committed
475

Holger Brandl's avatar
Holger Brandl committed
476

477
478
# deResults = adply(contrasts, 1,  splat(function(condition_1, condition_2){
#  echo('samples', condition_1, condition_2)
Holger Brandl's avatar
Holger Brandl committed
479
480
481
#   return(data.frame())
# }))

482
deResults %>% ggplot(aes(c1_over_c2_logfc)) +
483
    geom_histogram(binwidth = 0.1) +
herseman's avatar
herseman committed
484
    facet_grid(condition_1 ~ condition_2) +
485
486
    geom_vline(xintercept = 0, color = "blue") +
    xlim(- 3, 3) +
487
    ggtitle("condition_1 over condition_2 logFC ")
Holger Brandl's avatar
Holger Brandl committed
488
489
490
491
492


#deResults %>% ggplot(aes(pvalue)) +
#    geom_histogram() +
#    xlim(0,1) +
493
#    facet_grid(condition_1 ~ condition_2) + geom_vline(xintercept=0.01, slope=1, color="blue") +
Holger Brandl's avatar
Holger Brandl committed
494
495
496
497
498
#    ggtitle("p-value distributions") #+ scale_x_log10()
#
#deResults %>% ggplot(aes(padj)) +
#    geom_histogram() +
##    xlim(0,1) +
499
#    facet_grid(condition_1 ~ condition_2) + geom_vline(xintercept=0.01, slope=1, color="blue") +
Holger Brandl's avatar
Holger Brandl committed
500
501
502
503
504
#    ggtitle("Adjusted p-value distributions") #+ scale_x_log10()


# report hit criterion
#+ results='asis'
505
if (! is.null(qcutoff)) {
Holger Brandl's avatar
Holger Brandl committed
506
    echo("Using q-value cutoff of", qcutoff)
507
508
    deResults %<>% transform(is_hit = padj <= qcutoff)
}else {
Holger Brandl's avatar
Holger Brandl committed
509
    echo("Using p-value cutoff of", pcutoff)
510
    deResults %<>% transform(is_hit = pvalue <= pcutoff)
Holger Brandl's avatar
Holger Brandl committed
511
512
513
514
}
#deResults %<>% mutate(is_hit=pvalue<0.05)


515
# filter(deResults, ensembl_gene_id=="ENSMUSG00000034957")
Holger Brandl's avatar
Holger Brandl committed
516

517
deResults %<>% mutate(c1_overex = c1_over_c2_logfc > 0)
518
519
520


normCounts = counts(dds, normalized = TRUE) %>%
Holger Brandl's avatar
Holger Brandl committed
521
#    set_names(exDesign(dds)$condition) %>% rownames2column("ensembl_gene_id")
522
523
524
    set_names(colnames(countMatrix)) %>%
    rownames_to_column("ensembl_gene_id") %>%
    tbl_df
Holger Brandl's avatar
Holger Brandl committed
525
526


527
normCounts %>% write_tsv(paste0(resultsBase, "sizefac_normalized_counts_by_replicate.txt"))
Holger Brandl's avatar
Holger Brandl committed
528
529
530
531
532

## .. should be same as input
#filter(countData, ensembl_gene_id=="ENSDARG00000000001")

########################################################################################################################
Holger Brandl's avatar
Holger Brandl committed
533
#' ### MA and Volcano plots
Holger Brandl's avatar
Holger Brandl committed
534

herseman's avatar
herseman committed
535
#' MA-plot: The log2 fold change for a particular comparison is plotted on the y-axis and the average of the counts normalized by log2 is shown on the x-axis ("M" for minus, because a log ratio is equal to log minus log, and "A" for average). Each gene is represented with a dot. Genes with an adjusted p value below a certain threshold are shown in cyan (True).
Holger Brandl's avatar
Holger Brandl committed
536
537
538
539
540
541
#' This plot demonstrates that only genes with a large average normalized count contain sufficient information to yield a significant call.

# deseq approach
# plotMA(deResults, main="DESeq2", ylim=c(-2,2))

# Calculation of the base means per condition
Holger Brandl's avatar
Holger Brandl committed
542
543
544
#https://stat.ethz.ch/R-manual/R-devel/library/base/html/attr.html


545
meanNormCounts = counts(dds, normalized = TRUE) %>%
546
547
    as_df %>%
    rownames_to_column("ensembl_gene_id") %>%
548
549
550
551
    tbl_df %>%
    gather(replicate, norm_count, - ensembl_gene_id) %>%
    inner_join(exDesign) %>%
    group_by(ensembl_gene_id, condition) %>%
552
553
    summarize(mean_norm_count = mean(norm_count)) %>%
    ungroup()
554

Holger Brandl's avatar
Holger Brandl committed
555

herseman's avatar
herseman committed
556
## add base means to diff??ex summary
557
#numResultsBeforeMerge = nrow(deResults)
558
559
deResults = meanNormCounts %>%
    # gather(condition, norm_count, - ensembl_gene_id) %>%
560
    inner_join(., ., by = "ensembl_gene_id", suffix = c("_1", "_2")) %>%
561
#    filter(ac(condition_1)<ac(condition_2)) %>%
562
# add diffex status
563
    inner_join(deResults)
Holger Brandl's avatar
Holger Brandl committed
564
565

#+ fig.width=16, fig.height=14
566
deResults %>% ggplot(aes(0.5 * log2(mean_norm_count_1 * mean_norm_count_2), log2(mean_norm_count_2 / mean_norm_count_1), color = pvalue < 0.05)) +
567
568
    geom_point(alpha = 0.1) +
    geom_hline(yintercept = 0, color = "red") +
herseman's avatar
herseman committed
569
    facet_grid(condition_1 ~ condition_2)
Holger Brandl's avatar
Holger Brandl committed
570
571
572
573
574
575
576
577
578
579
580


#deResults %$% pvalue %>%  log10() %>% quantile(0.05, na.rm=T)

#' A volcano plot displays unstandardized signal (e.g. log-fold-change) against noise-adjusted/standardized signal (e.g. t-statistic or -log(10)(p-value) from the t-test).
#' This scatter plot is used to quickly identify changes in large datasets composed of replicate data.
#' Here we have log2-fold-change on the X-axis and -log10 of the p-value on the y-axis.
#' This results in datapoints with low p-values (highly significant) appearing toward the top of the plot. For the x-axis the log of the fold-change is used so that changes in both directions (left and right) appear equidistant from the center.
#' Therefore interesting points are those, which are found toward the top of the plot that are far to either the left- or the right-hand side. These represent values that display large magnitude fold changes (hence being left- or right- of center) as well as high statistical significance (hence being toward the top).
#' see: https://en.wikipedia.org/wiki/Volcano_plot_%28statistics%29)

581
maxX = quantile(deResults$c1_over_c2_logfc, c(0.005, 0.99), na.rm = TRUE) %>%
582
583
584
    abs %>%
    max

Holger Brandl's avatar
Holger Brandl committed
585
##Volcano plots
586
hitCounts = filter(deResults, is_hit) %>%
587
    count(condition_1, condition_2, c1_overex) %>%
588
    rename(hits = n) %>%
589
590
591
592
## complement hit counts with directed contrasts without any diffex genes
    right_join(merge(mutate(contrasts), data_frame(c1_overex = c(T, F)), by = NULL)) %>%
    mutate(hits = if_else(is.na(hits), as.integer(0), hits)) %>%
    merge(data.frame(c1_overex = c(T, F), x_pos = c(maxX - maxX * 0.1, - maxX + maxX * 0.1)))
Holger Brandl's avatar
Holger Brandl committed
593

594
hitCounts %<>% mutate(y_pos = quantile(- log10(deResults$pvalue), 0.987, na.rm = TRUE))
Holger Brandl's avatar
Holger Brandl committed
595

Holger Brandl's avatar
Holger Brandl committed
596
#+ fig.width=16, fig.height=14
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612


## todo remove; part of commons v1.41
trim_outliers <- function(values, probs=c(0.05, 0.95)){
    values = deResults$pvalue
    stopifnot(length(probs) == 2)
    quantiles = quantile(values, probs, na.rm = TRUE)

    pmax(quantiles[1], pmin(quantiles[2], values))
}



deResults %>%
    mutate(p_trimmed = trim_outliers(pvalue, c(0.01, 1))) %>%
    filter(! is.na(is_hit)) %>%
613
    ggplot(aes(c1_over_c2_logfc, - log10(p_trimmed), color = is_hit)) +
614
# geom_jitter(alpha = 0.1, position = position_jitter(height = 0.2)) +
Holger Brandl's avatar
Holger Brandl committed
615
    geom_point(alpha = 0.1) +
Holger Brandl's avatar
Holger Brandl committed
616
#    theme_bw() +
617
618
    xlim(- maxX, maxX) +
    scale_color_manual(values = c("TRUE" = "red", "FALSE" = "black")) +
Holger Brandl's avatar
Holger Brandl committed
619
#    ggtitle("Insm1/Ctrl") +
620
621
622
## http://stackoverflow.com/questions/19764968/remove-point-transparency-in-ggplot2-legend
    guides(colour = guide_legend(override.aes = list(alpha = 1))) +
## tweak axis labels
Holger Brandl's avatar
Holger Brandl committed
623
    xlab(expression(log[2]("x/y"))) +
624
    ylab(expression(- log[10]("p value"))) +
Holger Brandl's avatar
Holger Brandl committed
625
626
#  ylim(0,50) +

627
## add hit couts
628
    geom_text(aes(label = hits, x = x_pos, y = y_pos), color = "red", size = 8, data = hitCounts) +
herseman's avatar
herseman committed
629
    facet_grid(condition_1 ~ condition_2)
Holger Brandl's avatar
Holger Brandl committed
630
631


Holger Brandl's avatar
Holger Brandl committed
632
########################################################################################################################
633
# Count Table Exports
Holger Brandl's avatar
Holger Brandl committed
634
635


Holger Brandl's avatar
Holger Brandl committed
636
## optionally install bioconducor package
Holger Brandl's avatar
Holger Brandl committed
637
install_package("biomaRt")
Holger Brandl's avatar
Holger Brandl committed
638

Holger Brandl's avatar
Holger Brandl committed
639

640

Holger Brandl's avatar
Holger Brandl committed
641
# Load transcriptome annotations needed for results annotation
642

643
644
if (is.null(gene_info_file)) {
    ensembl_dataset = if (! is.null(opts$ensembl_db)) paste0(opts$ensembl_db, "_gene_ensembl")  else guess_mart(countData$ensembl_gene_id)
645

646
647
648
649
650
651
652
653
654
655
656
657
658
659
    geneInfo = quote({
        ## mart = biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl"))??
        #  mart = biomaRt::useDataset(guess_mart(countData$ensembl_gene_id), mart = biomaRt::useMart("ensembl"))
        ## todo fix this https://support.bioconductor.org/p/74322/
        # mart = biomaRt::useDataset(guess_mart(countData$ensembl_gene_id), mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", host="www.ensembl.org"))
        # mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = "mmusculus_gene_ensembl", host = "dec2016.archive.ensembl.org", path = "/biomart/martservice", archive = FALSE)
        mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = ensembl_dataset, host = "aug2017.archive.ensembl.org", path = "/biomart/martservice", archive = FALSE)


        c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>%
            biomaRt::getBM(mart = mart) %>%
            tbl_df
    }) %>% cache_it("geneInfo")
} else {
660
    geneInfo = read_tsv(gene_info_file)
661
662
663
    colnames(geneInfo)[1] = "ensembl_gene_id"
}

Holger Brandl's avatar
Holger Brandl committed
664

665
666
geneLengths = transmute(geneInfo, ensembl_gene_id, gene_length = end_position - start_position)
geneDescs = transmute(geneInfo, ensembl_gene_id, gene_name = external_gene_name, gene_description = description)
herseman's avatar
herseman committed
667
668


Holger Brandl's avatar
Holger Brandl committed
669
## save as reference for downstream analysis
670
write_tsv(geneInfo, path = paste0(resultsBase, "geneInfo.txt"))
Holger Brandl's avatar
Holger Brandl committed
671
672


673
# Export counts per replicate as double-normalized FPKMs=(read_count * 10^9)/(gene_length * library_size)
674
675
fpkms = countMatrix %>%
    as.data.frame %>%
Holger Brandl's avatar
Holger Brandl committed
676
    rownames_to_column("ensembl_gene_id") %>%
677
678
    gather(replicate, num_tags, - ensembl_gene_id) %>%
    tbl_df %>%
679
    left_join(geneLengths, by = "ensembl_gene_id") %>%
Holger Brandl's avatar
Holger Brandl committed
680
681
## calculate normalized counts
    group_by(replicate) %>%
682
    mutate(lib_size = sum(num_tags), fpkm = ((1E9 / lib_size) * (num_tags / gene_length)) %>% round(digits = 3)) %>%
Holger Brandl's avatar
Holger Brandl committed
683
684
685
686
687
688
    ungroup()

#fpkms %>% ggplot(aes(fpkm)) + geom_histogram() + scale_x_log10(labels=comma)

fpkms %>%
    transmute(ensembl_gene_id, replicate, fpkm) %>%
Holger Brandl's avatar
Holger Brandl committed
689
    spread(replicate, fpkm) %>%
690
691
692
    left_join(geneDescs) %>%
    push_left(c("gene_name", "gene_description")) %>%
    write_tsv(paste0(resultsBase, "fpkms_by_replicate.txt"))
Holger Brandl's avatar
Holger Brandl committed
693

694
# Aggregate FPKMs by condition and export into tabele
695
fpkms %>%
696
    inner_join(exDesign, by = "replicate") %>%
697
# group_by_(.dots = c(contrastAttribute, "ensembl_gene_id")) %>%
698
699
#     group_by(!!contrastAttribute, ensembl_gene_id) %>%
    group_by(condition, ensembl_gene_id) %>%
700
701
702
703
704
705
    summarize(mean_fpkm = mean(fpkm)) %>%
    ungroup() %>%
    spread(condition, mean_fpkm) %>%
    left_join(geneDescs) %>%
    push_left(c("gene_name", "gene_description")) %>%
    write_tsv(paste0(resultsBase, "fpkms_by_condition.txt"))
Holger Brandl's avatar
Holger Brandl committed
706

707
# [FPKMs as Matrix](`r paste0(resultsBase, "fpkms_by_condition.txt")`)
Holger Brandl's avatar
Holger Brandl committed
708

Holger Brandl's avatar
Holger Brandl committed
709

710
# Export counts per replicate as TPMs (transcript per million)
711
# calculation of tpms and generation of a wide table including gene description and the external gene name
Holger Brandl's avatar
Holger Brandl committed
712

713
714
715
716
717
718
719
720
721
722
tpms = countMatrix %>%
    as.data.frame %>%
    rownames_to_column("ensembl_gene_id") %>%
    gather(replicate, num_tags, - ensembl_gene_id) %>%
    tbl_df %>%
    left_join(geneLengths) %>%
    mutate(length_norm_count = num_tags / gene_length) %>%
    group_by(replicate) %>%
    mutate(length_norm_sum = sum(length_norm_count, na.rm = T), tpm = (length_norm_count * (1E6 / length_norm_sum)) %>% round(digits = 3)) %>%
    ungroup()
Holger Brandl's avatar
Holger Brandl committed
723

724
725
726
727
728
729
tpms %>%
    select(ensembl_gene_id, replicate, tpm) %>%
    spread(replicate, tpm) %>%
    left_join(geneDescs) %>%
    push_left(c("gene_name", "gene_description")) %>%
    write_tsv(paste0(resultsBase, "tpms_by_replicate.txt"))
herseman's avatar
herseman committed
730

731
# Also average TPMs per condition and export
732
733
734
735
736
737
738
739
740
tpms %>%
    inner_join(exDesign, by = "replicate") %>%
    group_by(condition, ensembl_gene_id) %>%
# group_by(condition, ensembl_gene_id) %>%
    summarize(mean_tpm = mean(tpm)) %>%
# select(ensembl_gene_id, replicate, gene_name, gene_description, mean_tpm) %>%
    spread(condition, mean_tpm) %>%
    left_join(geneDescs) %>%
    push_left(c("gene_name", "gene_description")) %>%
Holger Brandl's avatar
Holger Brandl committed
741
    write_tsv(paste0(resultsBase, "tpms_by_condition.txt"))
herseman's avatar
herseman committed
742
743
744
745
746
747


#-----------------------------



Holger Brandl's avatar
Holger Brandl committed
748
# Export the complete dataset for later analysis
Holger Brandl's avatar
Holger Brandl committed
749
deAnnot = deResults %>% left_join(geneInfo)
750
751
752
753
#inner_join(normCounts) %>%
#merge(., contrasts) %>%
#merge(.,., suffixes=c("_1", "_2"), by=NULL) %>%
#inner_join(baseMeanPerLvl, id='ensembl_gene_id') %>%
Holger Brandl's avatar
Holger Brandl committed
754

Holger Brandl's avatar
Holger Brandl committed
755

756
write_tsv(deAnnot, path = paste0(resultsBase, "de_results.txt"))
757
#deAnnot = read_tsv(paste0(resultsBase, "de_results.txt"))
758
#  [de_results.txt](`r paste0(resultsBase, "de_results.txt")`)
Holger Brandl's avatar
Holger Brandl committed
759
760

## also export results which we'll always need for pathways overlays
761
762
763
deAnnot %>%
    transmute(
    ensembl_gene_id,
herseman's avatar
herseman committed
764
    contrast = paste(condition_1, "vs", condition_2),
765
    plot_score = - log10(pvalue) * ifelse(c1_overex, 1, - 1)
766
767
    ) %>%
    spread(contrast, plot_score) %>%
Holger Brandl's avatar
Holger Brandl committed
768
    write_tsv(paste0(resultsBase, "plot_score_matrix.txt"))
Holger Brandl's avatar
Holger Brandl committed
769
770
771
772

#deAnnot %>% filter(ensembl_gene_id=="FBgn0000015")


773
# subset data according to contrasts
Holger Brandl's avatar
Holger Brandl committed
774
if (sub_data) {
775
776
777
    dir.create("de_results")
    comp <- apply(contrasts, 1, paste0, collapse = "_vs_")
    lapply(comp, function(x) {
778
779
780
781
782
        deAnnot %>%
            mutate(contrast = paste(condition_1, condition_2, sep = "_vs_")) %>%
            filter(grepl(x, contrast)) %>%
            select(- contrast) %>%
            write_tsv(paste("de_results/", x, ".txt", sep = ""))
783
    })
784
}
785
786


Holger Brandl's avatar
Holger Brandl committed
787
788
789
790
791
792
## todo understand purpose and effeect of indpendentFiltering (see https://support.bioconductor.org/p/57128/)
#deResults %>% count(is_hit)
#deAnnot %>% count(is_hit)
#deAnnot %>% count(is.na(padj))


Holger Brandl's avatar
Holger Brandl committed
793

Lena Hersemann's avatar
Lena Hersemann committed
794
########################################################################################################################
Holger Brandl's avatar
Holger Brandl committed
795
#' ### Count Outlier Analysis
Lena Hersemann's avatar
Lena Hersemann committed
796
797

#' For how many genes did Deseq did not assign a p-value (due to outliers; see section 1.5.3 os [DEseq manual](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#count-outlier-detection))
Holger Brandl's avatar
Holger Brandl committed
798
799


Holger Brandl's avatar
Holger Brandl committed
800
801
802
803
804
805
806
807
808
809

# summary(results(dds, cooksCutoff=FALSE))

#' See "3.6 Approach to count outliers" in the [DESEQ2 manual](https://bioc.ism.ac.jp/packages/3.4/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf) for details about outlier handling
#'
#' Def: Cook's distance is a measure of how much a single sample is influencing the fitted coefficients for a gene, and a large value of Cook's distance is intended to indicate an outlier count
# par(mar=c(8,5,2,2))
# boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2)


Holger Brandl's avatar
Holger Brandl committed
810
811
deAnnot %>%
    filter(is.na(pvalue)) %>%
herseman's avatar
herseman committed
812
    mutate(contrast = paste(condition_1, "vs", condition_2)) %>%
813
    count(contrast) %>%
Holger Brandl's avatar
Holger Brandl committed
814
    n_as("genes_with_NA_pvalue") %>%
815
    kable(caption = "Failed comparsions by sample")
Holger Brandl's avatar
Holger Brandl committed
816
817


Lena Hersemann's avatar
Lena Hersemann committed
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
if ((deAnnot %>% filter(is.na(pvalue)) %>% nrow) > 0) {

    na_counts = norm_counts %>%
        as.data.frame() %>%
        rownames_to_column("ensembl_gene_id") %>%
        tbl_df %>%
        gather(replicate, norm_count, - ensembl_gene_id) %>%
        right_join(deAnnot %>%
            filter(is.na(pvalue)) %>%
            select(ensembl_gene_id, condition_1, condition_2)) %>%
    # glimpse
    # filter(ensembl_gene_id=="ENSMUSG00000000120")
        left_join(select(exDesign, replicate, condition)) %>%
        filter(condition_1 == condition | condition_2 == condition)

    na_counts %>%
        filter(ensembl_gene_id %in% unique(ensembl_gene_id)[1 : 6]) %>%
        ggplot(aes(condition, norm_count, label = replicate)) +
        geom_point() +
        geom_text() +
        facet_wrap(~ ensembl_gene_id, scales = "free_y") +
        ggtitle("exmple gene counts by replicate")

    na_counts %>%
        group_by(ensembl_gene_id) %>%
        filter(norm_count == max(norm_count)) %>%
        ggplot(aes(replicate)) +
        geom_bar() +
        coord_flip() +
        facet_grid(condition ~ ., scales = "free_y") +
        ylab("Genes without p-value in which sample has largest count") +
        ggtitle("Which replicates are causing most of the NAs")
}

########################################################################################################################
#' ## Hits Summary

Holger Brandl's avatar
Holger Brandl committed
855
# Extract hits from deseq results
Lena Hersemann's avatar
Lena Hersemann committed
856
857
degs = deAnnot %>% filter(is_hit)

858
# ggplot(degs, aes(paste(condition_1, "vs",  condition_2))) + geom_bar() +xlab(NULL) + ylab("# DBGs") + ggtitle("DBG count summary") + coord_flip()
Lena Hersemann's avatar
Lena Hersemann committed
859

Holger Brandl's avatar
Holger Brandl committed
860
# degs %>%
861
#     ggplot(aes(paste(condition_1, "vs",  condition_2), fill=c1_overex)) +
Holger Brandl's avatar
Holger Brandl committed
862
863
864
865
866
867
868
#     geom_bar(position="dodge") +
#     xlab(NULL) + ylab("# DBGs") +
#     ggtitle("DBG count summary by direction of expression") +
#     coord_flip()

# Export DBA genes
## disabled because we just subset the annotated data now to define degs
869
#degsAnnot = degs %>%
Holger Brandl's avatar
Holger Brandl committed
870
871
872
873
#    inner_join(normCounts) %>%
#    left_join(geneInfo) %>%
#    left_join(bindCats)
degs %>% write_tsv(paste0(resultsBase, "diffex_genes.txt"))
874
#degs = read_tsv(paste0(resultsBase, "diffex_genes.txt"))
Holger Brandl's avatar
Holger Brandl committed
875
876
877
878
879
880
#' [Differentially Expressed Genes](`r paste0(resultsBase, "diffbind_genes.txt")`)


#unloadNamespace("dplyr"); require(dplyr)

## export slim hit list for downstream enrichment analysis
881
degs %>%
herseman's avatar
herseman committed
882
    transmute(ensembl_gene_id, contrast = paste(condition_1, "vs", condition_2)) %>%
883
    write_tsv(paste0(resultsBase, "degs_by_contrast.txt"))
Holger Brandl's avatar
Holger Brandl committed
884

885
degs %>%
886
    transmute(ensembl_gene_id, contrast = paste(condition_1, if_else(c1_overex, ">", "<"), condition_2)) %>%
887
888
    write_tsv(paste0(resultsBase, "degs_by_contrast_directed.txt"))

Holger Brandl's avatar
Holger Brandl committed
889
## also export gsea inputs
890
891
892
893
894
895
#deAnnot %>%
#    mutate(contrast = paste(condition_1, "vs", condition_2)) %>%
#    filter(! is.na(pvalue)) %>%
#    transmute(contrast, ensembl_gene_id, rank_score = - log10(pvalue)) %>% #count(is.na(rank_score))
##    arrange(contrast, rank_score) %>%
#    write_tsv(paste0(resultsBase, "gsea__genes_by_contrast__undirected.txt"))
Holger Brandl's avatar
Holger Brandl committed
896

897
898
899
#deAnnot %>%
#    mutate(contrast = paste(condition_1, "vs", condition_2)) %>%
#    filter(! is.na(pvalue)) %>%
900
#    transmute(contrast, ensembl_gene_id, rank_score = - log10(pvalue) * ifelse(c1_overex, 1, - 1)) %>%
901
902
##    arrange(contrast, rank_score) %>%
#    write_tsv(paste0(resultsBase, "gsea__genes_by_contrast__directed.txt"))
Holger Brandl's avatar
Holger Brandl committed
903
904
905
906
907
908
909
910
911


# ## render interactive hit browser
# #+results='asis'
# degs %>%
#     left_join(geneInfo) %>%
#     mutate(
#         igv=paste0("<a href='http://localhost:60151/goto?locus=", chromosome_name,":", start_position, "-", end_position, "'>IGV</a>")
#     ) %>%
912
#     select(c1_over_c2_logfc, pvalue, ensembl_gene_id, condition_1, condition_2, external_gene_name, description, igv) %>%
Holger Brandl's avatar
Holger Brandl committed
913
914
915
# #    kable("html", table.attr = "class='dtable'", escape=F)
#     datatable(escape=F)

Lena Hersemann's avatar
Lena Hersemann committed
916
#+ fig.height=nrow(contrasts)/2+2, eval=nrow(degs)>0
917
918
#ggplot(degs, aes(paste(condition_1, "vs",  condition_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DEGs by contrast") + coord_flip()
ggplot(degs, aes(paste(condition_1, "vs", condition_2), fill = (c1_overex))) +
919
920
921
922
923
    geom_bar() +
    xlab(NULL) +
    ylab("# DGEs") +
    ggtitle("DEGs by contrast") +
    coord_flip()
Holger Brandl's avatar
Holger Brandl committed
924

925
#with(degs, as.data.frame(table(condition_1, condition_2, c1_overex))) %>% filter(Freq >0) %>% kable()
926
degs %>%
927
    count(condition_1, condition_2, c1_overex) %>%
Holger Brandl's avatar
Holger Brandl committed
928
    mutate(c1_overex = ifelse(c1_overex, ("Up in condition_1"), "Down in condition_1")) %>%
929
    spread(c1_overex, n) %>%
930
    kable()
Holger Brandl's avatar
Holger Brandl committed
931
932
933
934
935
936
937


#'  DEGs (differentially expressed genes) are contained in the following table


#' DEGs can be browsed in Excel using the exported table or via the embedded table browser. To enable the IGV links, you need to set the port in the IGV preferences to 3334.
#+ results='asis'
Holger Brandl's avatar
Holger Brandl committed
938
939
# if (nrow(degs) < 3000) {
degs %>%
940
941
#inner_join(geneLoci) %>%
    mutate(
942
943
    ensembl = paste0("<a href='http://www.ensembl.org/Multi/Search/Results?y=0;site=ensembl_all;x=0;page=1;facet_feature_type=Gene;q=", ensembl_gene_id, "'>", ensembl_gene_id, "</a>"),
    igv = paste0("<a href='http://localhost:60151/goto?locus=", chromosome_name, ":", start_position, "-", end_position, "'>IGV</a>")
944
    ) %>%
945
    select(condition_1, condition_2, c1_over_c2_logfc, pvalue, padj, ensembl_gene_id, external_gene_name, description) %>%