featcounts_deseq_mf.R 43.5 KB
Newer Older
Holger Brandl's avatar
Holger Brandl committed
1
#!/usr/bin/env Rscript
Holger Brandl's avatar
Holger Brandl committed
2

Holger Brandl's avatar
Holger Brandl committed
3
4
5
#' # Differential Expression Analysis
#'
#' Contents
Holger Brandl's avatar
Holger Brandl committed
6
#'
Holger Brandl's avatar
Holger Brandl committed
7
8
9
10
11
12
13
14
15
#' - [Data Preparation](#data-preparation)
#' - [Quality Control](#quality-control)
#'     - [Size Factors](#size-factors)
#'     - [Global Dispersion Model](#global-dispersion-model)
#' - [PCA and Clustering](#pca-and-clustering)
#' - [Differential Expression Test](#differential-expression-test)
#'     - [MA and Volcano plots](#ma-and-volcano-plots)
#'     - [Count Outlier Analysis](#count-outlier-analysis)
#' - [Hits Summary](#hits-summary)
Holger Brandl's avatar
Holger Brandl committed
16
#' - [Exported Data](#exported-data)
Holger Brandl's avatar
Holger Brandl committed
17
#'
Holger Brandl's avatar
Holger Brandl committed
18

Holger Brandl's avatar
Holger Brandl committed
19
# to rebuild toc do knitr::spin("featcounts_deseq_mf.R", FALSE)
Holger Brandl's avatar
Holger Brandl committed
20

Holger Brandl's avatar
Holger Brandl committed
21
22
23
#' Created by: `r system("whoami", intern=T)`
#'
#' Created at: `r format(Sys.Date(), format="%B %d %Y")`
Holger Brandl's avatar
Holger Brandl committed
24

Holger Brandl's avatar
Holger Brandl committed
25

Holger Brandl's avatar
Holger Brandl committed
26
#+ include=FALSE
Holger Brandl's avatar
Holger Brandl committed
27
28
suppressMessages(require(docopt))

29
doc = '
Holger Brandl's avatar
Holger Brandl committed
30
Perform a differential gene expression analysis using deseq2
Holger Brandl's avatar
Holger Brandl committed
31
32
33
Usage: featcounts_deseq_mf.R [options] <count_matrix> <design_matrix>

Options:
Holger Brandl's avatar
Holger Brandl committed
34
35
36
37
38
--contrasts=<tab_delim_table>   Table with sample pairs for which dge analysis should be performed
--qcutoff <qcutoff>             Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01]
--pcutoff <pcutoff>             Override q-value filter and filter by p-value instead
--min_count <min_count>         Minimal expression in any of the sample to be included in the final result list [default: 10]
--out <name_prefix>             Name to prefix all generated result files [default: ]
herseman's avatar
herseman committed
39
--design <formula>              Design fomula for DeSeq with contrast attribute at the end [default: condition]
40
--lfc <lfc_cutoff>              Just report genes with abs(lfc) > lfc_cutoff as hits [default: 1.0]
41
--ensembl_db <ensembl_db>       Ensebmbl db to be used. If not specified inferred from data. TODO implement NONE here!!
42
--gene_info <info_file>         Gene information file downloaded from the ensembl website if bioMart version is not present
43
--betaprior <boolean>           Apply betaprior when running DESeq2 [default: TRUE]
44
--sub_data <boolean>            Subset de_results by contrasts and store the data in a subfolder [default: FALSE]
Holger Brandl's avatar
Holger Brandl committed
45
46
'

47
#commandArgs <- function(x) c("--design", "'batch+condition'", "--contrasts", "../contrasts.txt star_counts_matrix_reduced_sampleset.txt", "basic_design_reduced_sampleset.txt")
48
#commandArgs <- function(x) c("--design", "'condition'", "--contrasts", "../contrasts.txt", "--sub_data", "TRUE", "../alignments/star_counts_matrix.txt", "../basic_design.txt")
herseman's avatar
herseman committed
49

50
opts = docopt(doc, commandArgs(TRUE))
Holger Brandl's avatar
Holger Brandl committed
51
52


Holger Brandl's avatar
Holger Brandl committed
53
54
## load some packages first because of name-space order
library(knitr)
55
56
#source("https://bioconductor.org/biocLite.R")
#biocLite("DESeq2")
Holger Brandl's avatar
Holger Brandl committed
57
library(DESeq2)
58
#load_pack(readr)
Holger Brandl's avatar
Holger Brandl committed
59
60


Holger Brandl's avatar
Holger Brandl committed
61
62
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.45/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.45/R/ggplot_commons.R")
63

Holger Brandl's avatar
Holger Brandl committed
64
65
66
## also load common helper for expression data analysis
# source(interp_from_env("${NGS_TOOLS}/dge_workflow/diffex_commons.R"))
devtools::source_url("https://git.mpi-cbg.de/bioinfo/ngs_tools/raw/v6/dge_workflow/diffex_commons.R")
Holger Brandl's avatar
Holger Brandl committed
67

Holger Brandl's avatar
Holger Brandl committed
68
69

## process input arguments
70
71
72
count_matrix_file = opts$count_matrix
design_matrix_file = opts$design_matrix
contrasts_file = opts$contrasts
73
use_betaPrior = as.logical(opts$betaprior)
74
sub_data = as.logical(opts$sub_data)
75
76
gene_info_file = opts$gene_info
assert(is.null(gene_info_file) || file.exists(gene_info_file), "invalid gene_info_file")
Holger Brandl's avatar
Holger Brandl committed
77

78
resultsBase = if (str_length(opts$out) > 0) paste0(opts$out, ".") else ""
Holger Brandl's avatar
Holger Brandl committed
79

80
81
82
83
84
pcutoff = if (is.null(opts$pcutoff))NULL else as.numeric(opts$pcutoff)
qcutoff = if (is.numeric(pcutoff))NULL else as.numeric(opts$qcutoff)
if (is.numeric(pcutoff))opts$qcutoff = NULL

lfc_cutoff = if (is.null(opts$lfc))0 else as.numeric(opts$lfc)
Holger Brandl's avatar
Holger Brandl committed
85

Holger Brandl's avatar
Holger Brandl committed
86
## extract the sample for the design
87
designFormula = opts$design
Holger Brandl's avatar
Holger Brandl committed
88
## consider last element of design formula as sample attribute
89
90
91
contrastAttribute = str_split(designFormula, "[+]") %>%
    unlist() %>%
    tail(1)
92
93
# assert(designFormula == "condition" || ! is.null(design_matrix_file), "custom designs are not supported without a design matrix")
assert(str_detect(designFormula, ".*condition$"))
Holger Brandl's avatar
Holger Brandl committed
94

Holger Brandl's avatar
Holger Brandl committed
95

Holger Brandl's avatar
Holger Brandl committed
96
97

#' Run configuration was
98
99
100
vec_as_df(unlist(opts)) %>%
    filter(! str_detect(name, "^[<-]")) %>%
    kable()
Holger Brandl's avatar
Holger Brandl committed
101

102
vec_as_df(unlist(opts)) %>%
103
104
    filter(! str_detect(name, "^[<-]")) %>%
    write_tsv(".run_configuration.txt")
105

Holger Brandl's avatar
Holger Brandl committed
106
107
108
109
110
111
112
113
114
########################################################################################################################
#' ## Data Preparation

#' The detection of differentially expressed genes is performed using the R package DEseq2 (http://bioconductor.org/packages/release/bioc/html/DESeq2.html), which requires a count matrix as input.
#' The count data are presented as a table which reports, for each sample, the number of sequence fragments that have been assigned to each gene.
#' An important analysis question is the quantification and statistical inference of systematic changes between conditions, as compared to within-condition variability.

#' Working Directory: `r getwd()`

115
countData = read_tsv(count_matrix_file)
Holger Brandl's avatar
Holger Brandl committed
116
117

## save a backup into the current working directory as a reference
Holger Brandl's avatar
Holger Brandl committed
118
countData %>% write_tsv(paste0(resultsBase, "input_counts.txt"))
Holger Brandl's avatar
Holger Brandl committed
119
120

#' Structure of input matrix was
121
glimpse(countData, width = 60)
Holger Brandl's avatar
Holger Brandl committed
122

123
124
125
countMatrix = countData %>%
    column2rownames("ensembl_gene_id") %>%
    as.matrix()
Holger Brandl's avatar
Holger Brandl committed
126
127

# Expression Filtering
128
genesBefore = nrow(countMatrix)
Lena Hersemann's avatar
Lena Hersemann committed
129
countMatrix %<>% filterByExpression(as.integer(opts$min_count))
130
genesAfter = nrow(countMatrix)
Lena Hersemann's avatar
Lena Hersemann committed
131

Holger Brandl's avatar
Holger Brandl committed
132
133
#' Counts were filtered to only retain genes which had more that `r opts$min_count` alignments in at least one replicate. This reduced the number of genes from `r genesBefore` to `r genesAfter`.

Holger Brandl's avatar
Holger Brandl committed
134
## todo this is required now , so why all the checking
135
if (! is.null(design_matrix_file)) {
136
    replicates2samples = read_tsv(design_matrix_file) #%>% select(replicate, sample)
137
}else {
herseman's avatar
herseman committed
138
    warning("DEPRECATED extracting conditions from replicates is bad practise and a design file should be provided instead")
139
    get_sample_from_replicate = function(repName) str_match(repName, "(.*)_([R]|rep)?[0-9]{1,2}$")[, 2]
Holger Brandl's avatar
Holger Brandl committed
140

herseman's avatar
herseman committed
141
    replicates2samples = data_frame(replicate = colnames(countMatrix)) %>% mutate(condition = get_sample_from_replicate(replicate)) # %>% distinct_all()
Holger Brandl's avatar
Holger Brandl committed
142
}
Holger Brandl's avatar
Holger Brandl committed
143

144
145
write_tsv(replicates2samples, "basic_design.txt")

Holger Brandl's avatar
Holger Brandl committed
146
# Define or load a contrasts matrix
147
if (! is.null(contrasts_file)) {
148
    contrasts = read_tsv(contrasts_file)
149
150
}else {
    assert(! is.null(replicates2samples), "Can not auto-create contrasts without design matrix") ## cant work yet because sample column with have random name
herseman's avatar
herseman committed
151
152
    contrasts = distinct_all(replicates2samples, condition) %>%
        select(condition) %>%
153
        merge(., ., suffixes = c("_1", "_2"), by = NULL) %>%
herseman's avatar
herseman committed
154
        filter(ac(condition_1) > ac(condition_2)) %>%
155
    #        filter(ac(condition_1)!=ac(condition_2)) %>%
Holger Brandl's avatar
Holger Brandl committed
156
        fac2char()
Holger Brandl's avatar
Holger Brandl committed
157
158

    write_tsv(contrasts, paste(resultsBase, "contrasts.txt"))
Holger Brandl's avatar
Holger Brandl committed
159
160
}

Holger Brandl's avatar
Holger Brandl committed
161
162

#' The used design matrix is
Holger Brandl's avatar
Holger Brandl committed
163
datatable(replicates2samples)
Holger Brandl's avatar
Holger Brandl committed
164

Holger Brandl's avatar
Holger Brandl committed
165
166
167
168
169
#' The used design formula is.
designFormula

#' For more information about designs see section 1.6 "multi-factor designs" in [deseq-manual](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf)

Holger Brandl's avatar
Holger Brandl committed
170
#' The contrasts of interest are
Holger Brandl's avatar
Holger Brandl committed
171
172
173
174
175
176
contrasts %>% kable()

## gene specific factors
#normalizationFactors(dds)

## sizeFactor R help:sigEnrResults
177
178
#dds = makeExampleDESeqDataSet()
#dds = estimateSizeFactors( dds )
Holger Brandl's avatar
Holger Brandl committed
179
180
181
#sizeFactors(dds)

## see "3.11 Sample-/gene-dependent normalization factors" in the DEseq2 manual for details
Holger Brandl's avatar
Holger Brandl committed
182
183

## old non-mf approach
184
#exDesign = data.frame(condition=colnames(countMatrix) %>% get_sample_from_replicate)
Holger Brandl's avatar
Holger Brandl committed
185
186

## new mf-approach
187
188
exDesign = data_frame(replicate = colnames(countMatrix)) %>%
    mutate(col_index = row_number()) %>%
189
    right_join(replicates2samples, by = "replicate") %>%
190
    arrange(col_index) #%>% transmute(condition=condition_2ndwt) %>% fac2char
Holger Brandl's avatar
Holger Brandl committed
191

192
assert(! any(is.na(exDesign$col_index)), "assay design file is not compatible to count matrix column names")
193

194
195
196
197
198
exDesign %>% ggplot(aes(condition)) +
    geom_bar() +
    ylab("# samples") +
    coord_flip() +
    ggtitle("samples per condition")
199

Holger Brandl's avatar
Holger Brandl committed
200
## infer the sample to be used from the formula
201
#designFormula = "condition_2ndwt + batch"
Holger Brandl's avatar
Holger Brandl committed
202
203


204
#exDesign %<>% rename(sample, condition_2ndwt)
Holger Brandl's avatar
Holger Brandl committed
205

206
#exDesign = replicates2samples %>%  arrange(colnames(countMatrix))
Holger Brandl's avatar
Holger Brandl committed
207
208

# valiadate that contrasts model is compatible with data
209
210
assert(setequal(names(contrasts), c("condition_1", "condition_2")), "contrasts matrix has incorrect column headers. Expected: condition_1, condition_2")

211
212
213
214
215
216
designSamples = as_df(exDesign)[, contrastAttribute] %>%
    unique %>%
    sort
contrastSamples = contrasts %>% gather %$% value %>%
    unique %>%
    sort
217

218
219
220
if (! all(contrastSamples %in% designSamples)) {
    warning("design   : ", paste(designSamples, collapse = ", "))
    warning("contrasts: ", paste(contrastSamples, collapse = ", "))
Holger Brandl's avatar
Holger Brandl committed
221
    stop("experiment design is not consistent with contrasts")
Holger Brandl's avatar
Holger Brandl committed
222
223
}
#stopifnot(all((contrasts %>% gather %$% value %>% unique) %in% exDesign$condition))
Holger Brandl's avatar
Holger Brandl committed
224

Holger Brandl's avatar
Holger Brandl committed
225
#http://www.cookbook-r.com/Formulas/Creating_a_formula_from_a_string/
226
227
228
dds = DESeqDataSetFromMatrix(countMatrix, exDesign, formula(as.formula(paste("~", designFormula))))
#dds = estimateSizeFactors(dds)
#dds = estimateDispersions(dds)
229
230
231
# dds = DESeq(dds, parallel = T)

dds = DESeq(dds, parallel = T, betaPrior = use_betaPrior)
Holger Brandl's avatar
Holger Brandl committed
232

233

Holger Brandl's avatar
Holger Brandl committed
234
235


Holger Brandl's avatar
Holger Brandl committed
236
237
238
########################################################################################################################
#' ## Quality Control

Holger Brandl's avatar
Holger Brandl committed
239
240
#' ### Size Factors

Holger Brandl's avatar
Holger Brandl committed
241
242
243
244
#' For details about size factor normalziation and calculation see https://www.biostars.org/p/79978/
#'
#' From the DESeq docs about how size factors are used: The sizeFactors vector assigns to each column of the count matrix a value, the size factor, such that count values in the columns can be brought to a common scale by dividing by the corresponding size factor. Counts are divied by size factors.

Holger Brandl's avatar
Holger Brandl committed
245
sizeFactors(dds) %>%
Holger Brandl's avatar
Holger Brandl committed
246
247
#    set_names(colnames(countMatrix)) %>%
    vec_as_df("replicate", "size_factor") %>%
248
249
    ggplot(aes(replicate, size_factor)) +
    geom_bar(stat = "identity") +
Holger Brandl's avatar
Holger Brandl committed
250
    ggtitle("Size Factors estimated by DESeq") +
251
    coord_flip()
Holger Brandl's avatar
Holger Brandl committed
252
253
254
255
256

# From DESeq manual: The regularized log transformation is preferable to the variance stabilizing transformation if the size factors vary widely.



Holger Brandl's avatar
Holger Brandl committed
257
#' ### Global Dispersion Model
Holger Brandl's avatar
Holger Brandl committed
258
259
260
261
262
263

#' Plot of the per-gene dispersion estimates together with the fitted mean-dispersion relationship.
#' The dispersion plot shows how the estimates are shrunk from the gene wise values (black dots) toward the fitted estimates, with the final values used in testing being the blue dots.
#' The dispersion can be understood as the square of the coefficient of biological variation.  So, if a gene's expression typically differs from replicate to replicate sample by 20%, this gene's dispersion is: .20^2 = .04.
## The function estimateDispersions performs three steps. First, it estimates, for each gene and each (replicated) condition, a dispersion value, then, it fits, for each condition, a curve through the estimates. Finally, it assigns to each gene a dispersion value, using either the estimated or the fitted value.
#' The estimation of dispersions performs three steps. First, it estimates, for each gene and each (replicated) condition, a dispersion value, then, it fits, for each condition, a curve through the estimates. Finally, it assigns to each gene a dispersion value, using either the estimated or the fitted value.
Holger Brandl's avatar
Holger Brandl committed
264
plotDispEsts(dds, main = "Dispersion Model")
265

Holger Brandl's avatar
Holger Brandl committed
266
267


Holger Brandl's avatar
Holger Brandl committed
268
########################################################################################################################
Holger Brandl's avatar
Holger Brandl committed
269
#' ## PCA and Clustering
Holger Brandl's avatar
Holger Brandl committed
270
271

#' In order to assess overall similarity between samples two common statistical methods are used - Principal component analysis (PCA) and clustering.
herseman's avatar
herseman committed
272
#' This should provide answers to the questions: Which samples are similar to each other, which are different? Does this fit to the expectation from the experiment's design?
Holger Brandl's avatar
Holger Brandl committed
273
274
275
276
277
278
279
#' Using a regularized log transformation of the raw counts provides the advantage that it stabilizes the variance across the mean.


#' Principal component analysis (PCA) is a technique used to emphasize variation and bring out strong patterns in a dataset.
#' Principal components are the underlying structure in the data. They are the directions where there is the most variance, the directions where the data is most spread out. The data points/samples are projected onto the 2D plane such that they spread out in the two directions that explain most of the differences. The x-axis is the direction that separates the data points the most and the y-axis is a direction that separates the data the second most.

# Regularized log transformation for clustering/heatmaps, etc
280
rld = rlogTransformation(dds)
281
# vst = vst(dds) ## vst is supposed to be much 1k x faster
Holger Brandl's avatar
Holger Brandl committed
282

Holger Brandl's avatar
Holger Brandl committed
283
284
# extract rlogTransformation normalized count data
norm_counts <- assay(rld)
Holger Brandl's avatar
Holger Brandl committed
285
286


287
288
289
load_pack(limma)
load_pack(stringi)
load_pack(htmltools)
290
291
load_pack(plotly)
# install.packages("htmlwidgets")
292
293
294
295
296
297
298
299

#for (i in {designFormula %>% str_split("[+]") %>% unlist}) {
#    condition = pull(exDesign, i)
#    names(condition) = exDesign$replicate
#    # suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print))
#    makePcaPlot(t(norm_counts), color_by = condition, scale = T, title = "DESeq normalized count data WITHOUT batch correction") %>% ggplotly() %>% print
#}

Lena Hersemann's avatar
Lena Hersemann committed
300
effects = designFormula %>% str_split("[+]") %>% unlist
301
302
303
304
305
306
307
308
309


htmltools::tagList(lapply(effects, function(batchFactor) {
    # batchFactor=effects[1]
    condition = pull(exDesign, batchFactor)
    names(condition) = exDesign$replicate
    # suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print))
    {
        makePcaPlot(t(norm_counts), color_by = condition, scale = T, title = "DESeq normalized count data WITHOUT batch correction") +
Lena Hersemann's avatar
Lena Hersemann committed
310
        guides(color = guide_legend(title = batchFactor))
311
312
313
314
315
    } %>% ggplotly()
}))


# extract batch effect variables from the design formula and the corresponding data from the exDesign data.frame
Lena Hersemann's avatar
Lena Hersemann committed
316
317
318
319
320
adjvar <- designFormula %>%
    str_split("[+]") %>%
    unlist %>%
    stri_subset_regex("condition", negate = T)
adjvar_data <- data.frame(exDesign[, which(names(exDesign) %in% head(adjvar))])
321
322
323



324
325
treatDesign = exDesign %$% model.matrix(~ condition)

Lena Hersemann's avatar
Lena Hersemann committed
326
if (length(adjvar_data) == 1) {
327
    corr_data <- limma::removeBatchEffect(norm_counts, batch = adjvar_data[, 1], design = treatDesign)
Lena Hersemann's avatar
Lena Hersemann committed
328
}else if (length(adjvar_data) == 2) {
329
    corr_data <- removeBatchEffect(norm_counts, batch = adjvar_data[, 1], batch2 = adjvar_data[, 2], design = treatDesign)
Lena Hersemann's avatar
Lena Hersemann committed
330
}else if (length(adjvar_data) > 2) {
331
    print("Please note, the number of adjustment variables exceeds the maximal number of removable batch effects.")
Lena Hersemann's avatar
Lena Hersemann committed
332
}else {
333
    print("The design formula does not contain an adjustment variable for batch effect correction")
334
335
336
337
338
339
340
341
342
343
344
}


# for (i in colnames(adjvar_data)) {
#for (i in {designFormula %>% str_split("[+]") %>% unlist}) {
#    condition = pull(exDesign, i)
#    names(condition) = exDesign$replicate
#    # suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print))
#    makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print
#}

Lena Hersemann's avatar
Lena Hersemann committed
345
346
347
if (exists("corr_data")) {
    htmltools::tagList(lapply(effects, function(batchFactor) {
        # batchFactor=effects[1]
348
        # batchFactor="condition"
Lena Hersemann's avatar
Lena Hersemann committed
349
350
351
352
        condition = pull(exDesign, batchFactor)
        names(condition) = exDesign$replicate
        # suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print))
        {
353
            makePcaPlot(t(corr_data), color_by = condition, scale = F, title = "DESeq normalized count data WITH batch correction") +
Lena Hersemann's avatar
Lena Hersemann committed
354
355
356
357
358
359
            guides(color = guide_legend(title = batchFactor))
        } %>% ggplotly()
    }))
}else {
    print("No batch correction was done, as the design formula does not contain an adjustment variable for batch effect correction")
}
360
361


Holger Brandl's avatar
Holger Brandl committed
362
363
364
365
#' The Euclidean distance between samples are calculated after performing the regularized log transformation.
#' Using the calculated distance matrix, the samples are projected on a two-dimensional graph such that the distance between samples approximately corresponds to the biological coefficient of variation between those samples.
#' More information can be found here: https://en.wikipedia.org/wiki/Principal_component_analysis

366
367
368
369
370
#distsRL = dist(t(assay(rld)))
#hc = hclust(distsRL)
#distMatrix = as.matrix(distsRL)
distMatrix = as.matrix(dist(t(assay(rld))))
#rownames(distMatrix) = colnames(distMatrix) = with(exDesign(dds), paste(condition, sep=" : "))
Holger Brandl's avatar
Holger Brandl committed
371

Holger Brandl's avatar
Holger Brandl committed
372
countData %>% distinct(ensembl_gene_id)
373
labelcntData = countData %>%
374
375
376
377
378
    distinct_all(ensembl_gene_id) %>%
    column2rownames("ensembl_gene_id") %>%
    data.matrix() %>%
    round() %>%
    data.frame()
Holger Brandl's avatar
Holger Brandl committed
379

380
381
382
# load_pack(gplots)
# heatmap.2(distMatrix, labRow = colnames(labelcntData), labCol = colnames(labelcntData),
# symm = TRUE, trace = "none",
383
#key=F, col=colorpanel(100, "black", "white"),
384
# margin = c(8, 8), main = "Sample Distance Matrix")
Holger Brandl's avatar
Holger Brandl committed
385

386
387
rownames(distMatrix) = colnames(labelcntData)
colnames(distMatrix) = colnames(labelcntData)
Holger Brandl's avatar
Holger Brandl committed
388
389
390
391

# load_pack(d3heatmap)
load_pack(heatmaply)

Holger Brandl's avatar
Holger Brandl committed
392
393
## todo hide column dendrogram but keep cluster-order
#distMatrix %>% d3heatmap(xaxis_height=1, Colv = T, dendrogram="row")
Lena Hersemann's avatar
Lena Hersemann committed
394
print("DESeq normalized count data WITHOUT batch correction")
Holger Brandl's avatar
Holger Brandl committed
395
396
# distMatrix %>% d3heatmap(xaxis_height = 1)
distMatrix %>% heatmaply(colors=c("red","lightblue"),showticklabels=c(F,T))
Holger Brandl's avatar
Holger Brandl committed
397

Holger Brandl's avatar
Holger Brandl committed
398

Lena Hersemann's avatar
Lena Hersemann committed
399
400
401
if (exists("corr_data")) {
    print("DESeq normalized count data WITH batch correction")
    corr_distMatrix = as.matrix(dist(t(corr_data)))
Holger Brandl's avatar
Holger Brandl committed
402
403
    # corr_distMatrix %>% d3heatmap(xaxis_height = 1)
    corr_distMatrix %>% heatmaply(colors=c("red","lightblue"),showticklabels=c(F,T))
Lena Hersemann's avatar
Lena Hersemann committed
404
405
406
}else {
    # print("No batch correction was done, as the design formula does not contain an adjustment variable for batch effect correction")
}
Holger Brandl's avatar
Holger Brandl committed
407

408
409
## see http://www.bioconductor.org/help/workflows/rnaseqGene/
## DEBUG-START
410
# if (F) {
Holger Brandl's avatar
Holger Brandl committed
411
# assay(rld) %>% head
412
413
load_pack("genefilter")
load_pack("pheatmap")
414
415
mostSIg <- head(order(rowVars(assay(rld)), decreasing = TRUE), 20)
mat <- assay(rld)[mostSIg,]
416
417
418
419
420
421
422
mat <- mat - rowMeans(mat)

# df <- as.data.frame(colData(rld)[,c("cell","dex")])
#pheatmap(mat, annotation_col=df)
# pheatmap(mat)
pheatmap(mat, annotation_col = column2rownames(exDesign, "replicate") %>% select(- col_index))
# }
423
424
## DEBUG-END

Holger Brandl's avatar
Holger Brandl committed
425

Holger Brandl's avatar
Holger Brandl committed
426
########################################################################################################################
Holger Brandl's avatar
Holger Brandl committed
427
#' ## Differential Expression Test
Holger Brandl's avatar
Holger Brandl committed
428
429
430
431
432
433
434
435
436
437
438
439
#'
#' DESeq is an R package to analyse count data from high-throughput sequencing assays such as RNA-Seq and test for differential expression. The latest version, DESeq2, is used.
#' DESeq uses a model based on the negative binomial distribution and offers the following features:
#' Count data is discrete and skewed and is hence not well approximated by a normal distribution. Thus, a test based on the negative binomial distribution, which can reflect these properties, has much higher power to detect differential expression.
#' Tests for differential expression between experimental conditions should take into account both technical and biological variability. Recently, several authors have claimed that the Poisson distribution can be used for this purpose. However, tests based on the Poisson assumption (this includes the binomial test and the chi-squared test) ignore the biological sampling variance, leading to incorrectly optimistic p values. The negative binomial distribution is a generalisation of the Poisson model that allows to model biological variance correctly.
#' DESeq estimate the variance in a local fashion, using different coefficients of variation for different expression strengths. This removes potential selection biases in the hit list of differentially expressed genes, and gives a more balanced and accurate result.
#'
#' See deseq reference [docs](http://master.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf) for details.
#'
#' To understand fold-change shrinking and estimation check out
#' [Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2](http://genomebiology.com/2014/15/12/550/abstract)

Holger Brandl's avatar
Holger Brandl committed
440
441


Holger Brandl's avatar
Holger Brandl committed
442
# Run Deseq Test
443
444
445
#dds = DESeq(dds, fitType='local', betaPrior=FALSE)
#dds = DESeq(dds, fitType='local')
#dds = DESeq(dds)
Holger Brandl's avatar
Holger Brandl committed
446

447
#res = results(dds)
Holger Brandl's avatar
Holger Brandl committed
448
449
450
#resultsNames(dds)

#' Model Overview:
Holger Brandl's avatar
Holger Brandl committed
451

Holger Brandl's avatar
Holger Brandl committed
452
453
454
455
summary(results(dds))


## extract all de-sets according to our contrasts model
456
457
# install_package("plyr")
# load_pack(IHW) --> todo reenable once installation on macos is fixed (see https://support.bioconductor.org/p/92731/)
458

Holger Brandl's avatar
Holger Brandl committed
459
460
461
462
# low count filtering https://support.bioconductor.org/p/65256/
# independent filtering occurs within results() to save you from multiple test correction on genes with no power
# (see ?results and the vignette section about independent filtering, or the paper). T

463
#deResults = alply(contrasts, 1,  splat(function(condition_1, condition_2){
herseman's avatar
herseman committed
464
deResults = plyr::alply(contrasts, 1, plyr::splat(function(condition_1, condition_2){
Lena Hersemann's avatar
Lena Hersemann committed
465
    #DEBUG condition_1=as_df(contrasts)[1,1]; condition_2=as_df(contrasts)[1,2]
466
    # results(dds, contrast=c(contrastAttribute, condition_1, condition_2)) %>%
467
468
    # results(dds, contrast = c(contrastAttribute, condition_1, condition_2), lfcThreshold = lfc_cutoff, filterFun = ihw) %>%
    results(dds, contrast = c(contrastAttribute, condition_1, condition_2), lfcThreshold = lfc_cutoff) %>%
469
470
        as_tibble() %>%
        tibble::rownames_to_column("ensembl_gene_id") %>%
Holger Brandl's avatar
Holger Brandl committed
471
    ## see http://rpackages.ianhowson.com/bioc/DESeq2/man/results.html when using contrasts argument
472
        rename(c1_over_c2_logfc = log2FoldChange) %>%
herseman's avatar
herseman committed
473
        mutate(condition_1 = condition_1, condition_2 = condition_2) #%>% head %>% print_head
474
})) %>% bind_rows
Holger Brandl's avatar
Holger Brandl committed
475

Holger Brandl's avatar
Holger Brandl committed
476

477
478
# deResults = adply(contrasts, 1,  splat(function(condition_1, condition_2){
#  echo('samples', condition_1, condition_2)
Holger Brandl's avatar
Holger Brandl committed
479
480
481
#   return(data.frame())
# }))

482
deResults %>% ggplot(aes(c1_over_c2_logfc)) +
483
    geom_histogram(binwidth = 0.1) +
herseman's avatar
herseman committed
484
    facet_grid(condition_1 ~ condition_2) +
485
486
    geom_vline(xintercept = 0, color = "blue") +
    xlim(- 3, 3) +
487
    ggtitle("condition_1 over condition_2 logFC ")
Holger Brandl's avatar
Holger Brandl committed
488
489
490
491
492


#deResults %>% ggplot(aes(pvalue)) +
#    geom_histogram() +
#    xlim(0,1) +
493
#    facet_grid(condition_1 ~ condition_2) + geom_vline(xintercept=0.01, slope=1, color="blue") +
Holger Brandl's avatar
Holger Brandl committed
494
495
496
497
498
#    ggtitle("p-value distributions") #+ scale_x_log10()
#
#deResults %>% ggplot(aes(padj)) +
#    geom_histogram() +
##    xlim(0,1) +
499
#    facet_grid(condition_1 ~ condition_2) + geom_vline(xintercept=0.01, slope=1, color="blue") +
Holger Brandl's avatar
Holger Brandl committed
500
501
502
503
504
#    ggtitle("Adjusted p-value distributions") #+ scale_x_log10()


# report hit criterion
#+ results='asis'
505
if (! is.null(qcutoff)) {
Holger Brandl's avatar
Holger Brandl committed
506
    echo("Using q-value cutoff of", qcutoff)
507
508
    deResults %<>% transform(is_hit = padj <= qcutoff)
}else {
Holger Brandl's avatar
Holger Brandl committed
509
    echo("Using p-value cutoff of", pcutoff)
510
    deResults %<>% transform(is_hit = pvalue <= pcutoff)
Holger Brandl's avatar
Holger Brandl committed
511
512
513
514
}
#deResults %<>% mutate(is_hit=pvalue<0.05)


515
# filter(deResults, ensembl_gene_id=="ENSMUSG00000034957")
Holger Brandl's avatar
Holger Brandl committed
516

517
deResults %<>% mutate(c1_overex = c1_over_c2_logfc > 0)
518
519
520


normCounts = counts(dds, normalized = TRUE) %>%
Holger Brandl's avatar
Holger Brandl committed
521
#    set_names(exDesign(dds)$condition) %>% rownames2column("ensembl_gene_id")
522
523
524
    set_names(colnames(countMatrix)) %>%
    rownames_to_column("ensembl_gene_id") %>%
    tbl_df
Holger Brandl's avatar
Holger Brandl committed
525
526


527
normCounts %>% write_tsv(paste0(resultsBase, "sizefac_normalized_counts_by_replicate.txt"))
Holger Brandl's avatar
Holger Brandl committed
528
529
530
531
532

## .. should be same as input
#filter(countData, ensembl_gene_id=="ENSDARG00000000001")

########################################################################################################################
Holger Brandl's avatar
Holger Brandl committed
533
#' ### MA and Volcano plots
Holger Brandl's avatar
Holger Brandl committed
534

herseman's avatar
herseman committed
535
#' MA-plot: The log2 fold change for a particular comparison is plotted on the y-axis and the average of the counts normalized by log2 is shown on the x-axis ("M" for minus, because a log ratio is equal to log minus log, and "A" for average). Each gene is represented with a dot. Genes with an adjusted p value below a certain threshold are shown in cyan (True).
Holger Brandl's avatar
Holger Brandl committed
536
537
538
539
540
541
#' This plot demonstrates that only genes with a large average normalized count contain sufficient information to yield a significant call.

# deseq approach
# plotMA(deResults, main="DESeq2", ylim=c(-2,2))

# Calculation of the base means per condition
Holger Brandl's avatar
Holger Brandl committed
542
543
544
#https://stat.ethz.ch/R-manual/R-devel/library/base/html/attr.html


545
meanNormCounts = counts(dds, normalized = TRUE) %>%
546
547
    as_df %>%
    rownames_to_column("ensembl_gene_id") %>%
548
549
550
551
552
553
    tbl_df %>%
    gather(replicate, norm_count, - ensembl_gene_id) %>%
    inner_join(exDesign) %>%
    group_by(ensembl_gene_id, condition) %>%
    summarize(mean_norm_count = mean(norm_count))

Holger Brandl's avatar
Holger Brandl committed
554

herseman's avatar
herseman committed
555
## add base means to diff??ex summary
556
#numResultsBeforeMerge = nrow(deResults)
557
558
deResults = meanNormCounts %>%
    # gather(condition, norm_count, - ensembl_gene_id) %>%
559
    inner_join(., ., by = "ensembl_gene_id", suffix = c("_1", "_2")) %>%
560
#    filter(ac(condition_1)<ac(condition_2)) %>%
561
# add diffex status
562
    inner_join(deResults)
Holger Brandl's avatar
Holger Brandl committed
563
564

#+ fig.width=16, fig.height=14
565
deResults %>% ggplot(aes(0.5 * log2(mean_norm_count_1 * mean_norm_count_2), log2(mean_norm_count_2 / mean_norm_count_1), color = pvalue < 0.05)) +
566
567
    geom_point(alpha = 0.1) +
    geom_hline(yintercept = 0, color = "red") +
herseman's avatar
herseman committed
568
    facet_grid(condition_1 ~ condition_2)
Holger Brandl's avatar
Holger Brandl committed
569
570
571
572
573
574
575
576
577
578
579


#deResults %$% pvalue %>%  log10() %>% quantile(0.05, na.rm=T)

#' A volcano plot displays unstandardized signal (e.g. log-fold-change) against noise-adjusted/standardized signal (e.g. t-statistic or -log(10)(p-value) from the t-test).
#' This scatter plot is used to quickly identify changes in large datasets composed of replicate data.
#' Here we have log2-fold-change on the X-axis and -log10 of the p-value on the y-axis.
#' This results in datapoints with low p-values (highly significant) appearing toward the top of the plot. For the x-axis the log of the fold-change is used so that changes in both directions (left and right) appear equidistant from the center.
#' Therefore interesting points are those, which are found toward the top of the plot that are far to either the left- or the right-hand side. These represent values that display large magnitude fold changes (hence being left- or right- of center) as well as high statistical significance (hence being toward the top).
#' see: https://en.wikipedia.org/wiki/Volcano_plot_%28statistics%29)

580
maxX = quantile(deResults$c1_over_c2_logfc, c(0.005, 0.99), na.rm = TRUE) %>%
581
582
583
    abs %>%
    max

Holger Brandl's avatar
Holger Brandl committed
584
##Volcano plots
585
hitCounts = filter(deResults, is_hit) %>%
586
    count(condition_1, condition_2, c1_overex) %>%
587
    rename(hits = n) %>%
588
589
590
591
## complement hit counts with directed contrasts without any diffex genes
    right_join(merge(mutate(contrasts), data_frame(c1_overex = c(T, F)), by = NULL)) %>%
    mutate(hits = if_else(is.na(hits), as.integer(0), hits)) %>%
    merge(data.frame(c1_overex = c(T, F), x_pos = c(maxX - maxX * 0.1, - maxX + maxX * 0.1)))
Holger Brandl's avatar
Holger Brandl committed
592

593
hitCounts %<>% mutate(y_pos = quantile(- log10(deResults$pvalue), 0.987, na.rm = TRUE))
Holger Brandl's avatar
Holger Brandl committed
594

Holger Brandl's avatar
Holger Brandl committed
595
#+ fig.width=16, fig.height=14
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611


## todo remove; part of commons v1.41
trim_outliers <- function(values, probs=c(0.05, 0.95)){
    values = deResults$pvalue
    stopifnot(length(probs) == 2)
    quantiles = quantile(values, probs, na.rm = TRUE)

    pmax(quantiles[1], pmin(quantiles[2], values))
}



deResults %>%
    mutate(p_trimmed = trim_outliers(pvalue, c(0.01, 1))) %>%
    filter(! is.na(is_hit)) %>%
612
    ggplot(aes(c1_over_c2_logfc, - log10(p_trimmed), color = is_hit)) +
613
# geom_jitter(alpha = 0.1, position = position_jitter(height = 0.2)) +
Holger Brandl's avatar
Holger Brandl committed
614
    geom_point(alpha = 0.1) +
Holger Brandl's avatar
Holger Brandl committed
615
#    theme_bw() +
616
617
    xlim(- maxX, maxX) +
    scale_color_manual(values = c("TRUE" = "red", "FALSE" = "black")) +
Holger Brandl's avatar
Holger Brandl committed
618
#    ggtitle("Insm1/Ctrl") +
619
620
621
## http://stackoverflow.com/questions/19764968/remove-point-transparency-in-ggplot2-legend
    guides(colour = guide_legend(override.aes = list(alpha = 1))) +
## tweak axis labels
Holger Brandl's avatar
Holger Brandl committed
622
    xlab(expression(log[2]("x/y"))) +
623
    ylab(expression(- log[10]("p value"))) +
Holger Brandl's avatar
Holger Brandl committed
624
625
#  ylim(0,50) +

626
## add hit couts
627
    geom_text(aes(label = hits, x = x_pos, y = y_pos), color = "red", size = 8, data = hitCounts) +
herseman's avatar
herseman committed
628
    facet_grid(condition_1 ~ condition_2)
Holger Brandl's avatar
Holger Brandl committed
629
630


Holger Brandl's avatar
Holger Brandl committed
631
########################################################################################################################
632
# Count Table Exports
Holger Brandl's avatar
Holger Brandl committed
633
634


Holger Brandl's avatar
Holger Brandl committed
635
## optionally install bioconducor package
Holger Brandl's avatar
Holger Brandl committed
636
install_package("biomaRt")
Holger Brandl's avatar
Holger Brandl committed
637

Holger Brandl's avatar
Holger Brandl committed
638

639

Holger Brandl's avatar
Holger Brandl committed
640
# Load transcriptome annotations needed for results annotation
641

642
643
if (is.null(gene_info_file)) {
    ensembl_dataset = if (! is.null(opts$ensembl_db)) paste0(opts$ensembl_db, "_gene_ensembl")  else guess_mart(countData$ensembl_gene_id)
644

645
646
647
648
649
650
651
652
653
654
655
656
657
658
    geneInfo = quote({
        ## mart = biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl"))??
        #  mart = biomaRt::useDataset(guess_mart(countData$ensembl_gene_id), mart = biomaRt::useMart("ensembl"))
        ## todo fix this https://support.bioconductor.org/p/74322/
        # mart = biomaRt::useDataset(guess_mart(countData$ensembl_gene_id), mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", host="www.ensembl.org"))
        # mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = "mmusculus_gene_ensembl", host = "dec2016.archive.ensembl.org", path = "/biomart/martservice", archive = FALSE)
        mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = ensembl_dataset, host = "aug2017.archive.ensembl.org", path = "/biomart/martservice", archive = FALSE)


        c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>%
            biomaRt::getBM(mart = mart) %>%
            tbl_df
    }) %>% cache_it("geneInfo")
} else {
659
    geneInfo = read_tsv(gene_info_file)
660
661
662
    colnames(geneInfo)[1] = "ensembl_gene_id"
}

Holger Brandl's avatar
Holger Brandl committed
663

664
665
geneLengths = transmute(geneInfo, ensembl_gene_id, gene_length = end_position - start_position)
geneDescs = transmute(geneInfo, ensembl_gene_id, gene_name = external_gene_name, gene_description = description)
herseman's avatar
herseman committed
666
667


Holger Brandl's avatar
Holger Brandl committed
668
## save as reference for downstream analysis
669
write_tsv(geneInfo, path = paste0(resultsBase, "geneInfo.txt"))
Holger Brandl's avatar
Holger Brandl committed
670
671


672
# Export counts per replicate as double-normalized FPKMs=(read_count * 10^9)/(gene_length * library_size)
673
674
fpkms = countMatrix %>%
    as.data.frame %>%
Holger Brandl's avatar
Holger Brandl committed
675
    rownames_to_column("ensembl_gene_id") %>%
676
677
    gather(replicate, num_tags, - ensembl_gene_id) %>%
    tbl_df %>%
678
    left_join(geneLengths, by = "ensembl_gene_id") %>%
Holger Brandl's avatar
Holger Brandl committed
679
680
## calculate normalized counts
    group_by(replicate) %>%
681
    mutate(lib_size = sum(num_tags), fpkm = ((1E9 / lib_size) * (num_tags / gene_length)) %>% round(digits = 3)) %>%
Holger Brandl's avatar
Holger Brandl committed
682
683
684
685
686
687
    ungroup()

#fpkms %>% ggplot(aes(fpkm)) + geom_histogram() + scale_x_log10(labels=comma)

fpkms %>%
    transmute(ensembl_gene_id, replicate, fpkm) %>%
Holger Brandl's avatar
Holger Brandl committed
688
    spread(replicate, fpkm) %>%
689
690
691
    left_join(geneDescs) %>%
    push_left(c("gene_name", "gene_description")) %>%
    write_tsv(paste0(resultsBase, "fpkms_by_replicate.txt"))
Holger Brandl's avatar
Holger Brandl committed
692

693
# Aggregate FPKMs by condition and export into tabele
694
fpkms %>%
695
    inner_join(exDesign, by = "replicate") %>%
696
# group_by_(.dots = c(contrastAttribute, "ensembl_gene_id")) %>%
697
698
#     group_by(!!contrastAttribute, ensembl_gene_id) %>%
    group_by(condition, ensembl_gene_id) %>%
699
700
701
702
703
704
    summarize(mean_fpkm = mean(fpkm)) %>%
    ungroup() %>%
    spread(condition, mean_fpkm) %>%
    left_join(geneDescs) %>%
    push_left(c("gene_name", "gene_description")) %>%
    write_tsv(paste0(resultsBase, "fpkms_by_condition.txt"))
Holger Brandl's avatar
Holger Brandl committed
705

706
# [FPKMs as Matrix](`r paste0(resultsBase, "fpkms_by_condition.txt")`)
Holger Brandl's avatar
Holger Brandl committed
707

Holger Brandl's avatar
Holger Brandl committed
708

709
# Export counts per replicate as TPMs (transcript per million)
710
# calculation of tpms and generation of a wide table including gene description and the external gene name
Holger Brandl's avatar
Holger Brandl committed
711

712
713
714
715
716
717
718
719
720
721
tpms = countMatrix %>%
    as.data.frame %>%
    rownames_to_column("ensembl_gene_id") %>%
    gather(replicate, num_tags, - ensembl_gene_id) %>%
    tbl_df %>%
    left_join(geneLengths) %>%
    mutate(length_norm_count = num_tags / gene_length) %>%
    group_by(replicate) %>%
    mutate(length_norm_sum = sum(length_norm_count, na.rm = T), tpm = (length_norm_count * (1E6 / length_norm_sum)) %>% round(digits = 3)) %>%
    ungroup()
Holger Brandl's avatar
Holger Brandl committed
722

723
724
725
726
727
728
tpms %>%
    select(ensembl_gene_id, replicate, tpm) %>%
    spread(replicate, tpm) %>%
    left_join(geneDescs) %>%
    push_left(c("gene_name", "gene_description")) %>%
    write_tsv(paste0(resultsBase, "tpms_by_replicate.txt"))
herseman's avatar
herseman committed
729

730
# Also average TPMs per condition and export
731
732
733
734
735
736
737
738
739
tpms %>%
    inner_join(exDesign, by = "replicate") %>%
    group_by(condition, ensembl_gene_id) %>%
# group_by(condition, ensembl_gene_id) %>%
    summarize(mean_tpm = mean(tpm)) %>%
# select(ensembl_gene_id, replicate, gene_name, gene_description, mean_tpm) %>%
    spread(condition, mean_tpm) %>%
    left_join(geneDescs) %>%
    push_left(c("gene_name", "gene_description")) %>%
Holger Brandl's avatar
Holger Brandl committed
740
    write_tsv(paste0(resultsBase, "tpms_by_condition.txt"))
herseman's avatar
herseman committed
741
742
743
744
745
746


#-----------------------------



Holger Brandl's avatar
Holger Brandl committed
747
# Export the complete dataset for later analysis
Holger Brandl's avatar
Holger Brandl committed
748
deAnnot = deResults %>% left_join(geneInfo)
749
750
751
752
#inner_join(normCounts) %>%
#merge(., contrasts) %>%
#merge(.,., suffixes=c("_1", "_2"), by=NULL) %>%
#inner_join(baseMeanPerLvl, id='ensembl_gene_id') %>%
Holger Brandl's avatar
Holger Brandl committed
753

Holger Brandl's avatar
Holger Brandl committed
754

755
write_tsv(deAnnot, path = paste0(resultsBase, "de_results.txt"))
756
#deAnnot = read_tsv(paste0(resultsBase, "de_results.txt"))
757
#  [de_results.txt](`r paste0(resultsBase, "de_results.txt")`)
Holger Brandl's avatar
Holger Brandl committed
758
759

## also export results which we'll always need for pathways overlays
760
761
762
deAnnot %>%
    transmute(
    ensembl_gene_id,
herseman's avatar
herseman committed
763
    contrast = paste(condition_1, "vs", condition_2),
764
    plot_score = - log10(pvalue) * ifelse(c1_overex, 1, - 1)
765
766
    ) %>%
    spread(contrast, plot_score) %>%
Holger Brandl's avatar
Holger Brandl committed
767
    write_tsv(paste0(resultsBase, "plot_score_matrix.txt"))
Holger Brandl's avatar
Holger Brandl committed
768
769
770
771

#deAnnot %>% filter(ensembl_gene_id=="FBgn0000015")


772
# subset data according to contrasts
Holger Brandl's avatar
Holger Brandl committed
773
if (sub_data) {
774
775
776
    dir.create("de_results")
    comp <- apply(contrasts, 1, paste0, collapse = "_vs_")
    lapply(comp, function(x) {
777
778
779
780
781
        deAnnot %>%
            mutate(contrast = paste(condition_1, condition_2, sep = "_vs_")) %>%
            filter(grepl(x, contrast)) %>%
            select(- contrast) %>%
            write_tsv(paste("de_results/", x, ".txt", sep = ""))
782
    })
783
}
784
785


Holger Brandl's avatar
Holger Brandl committed
786
787
788
789
790
791
## todo understand purpose and effeect of indpendentFiltering (see https://support.bioconductor.org/p/57128/)
#deResults %>% count(is_hit)
#deAnnot %>% count(is_hit)
#deAnnot %>% count(is.na(padj))


Holger Brandl's avatar
Holger Brandl committed
792

Lena Hersemann's avatar
Lena Hersemann committed
793
########################################################################################################################
Holger Brandl's avatar
Holger Brandl committed
794
#' ### Count Outlier Analysis
Lena Hersemann's avatar
Lena Hersemann committed
795
796

#' For how many genes did Deseq did not assign a p-value (due to outliers; see section 1.5.3 os [DEseq manual](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#count-outlier-detection))
Holger Brandl's avatar
Holger Brandl committed
797
798


Holger Brandl's avatar
Holger Brandl committed
799
800
801
802
803
804
805
806
807
808

# summary(results(dds, cooksCutoff=FALSE))

#' See "3.6 Approach to count outliers" in the [DESEQ2 manual](https://bioc.ism.ac.jp/packages/3.4/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf) for details about outlier handling
#'
#' Def: Cook's distance is a measure of how much a single sample is influencing the fitted coefficients for a gene, and a large value of Cook's distance is intended to indicate an outlier count
# par(mar=c(8,5,2,2))
# boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2)


Holger Brandl's avatar
Holger Brandl committed
809
810
deAnnot %>%
    filter(is.na(pvalue)) %>%
herseman's avatar
herseman committed
811
    mutate(contrast = paste(condition_1, "vs", condition_2)) %>%
812
    count(contrast) %>%
Holger Brandl's avatar
Holger Brandl committed
813
    n_as("genes_with_NA_pvalue") %>%
814
    kable(caption = "Failed comparsions by sample")
Holger Brandl's avatar
Holger Brandl committed
815
816


Lena Hersemann's avatar
Lena Hersemann committed
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
if ((deAnnot %>% filter(is.na(pvalue)) %>% nrow) > 0) {

    na_counts = norm_counts %>%
        as.data.frame() %>%
        rownames_to_column("ensembl_gene_id") %>%
        tbl_df %>%
        gather(replicate, norm_count, - ensembl_gene_id) %>%
        right_join(deAnnot %>%
            filter(is.na(pvalue)) %>%
            select(ensembl_gene_id, condition_1, condition_2)) %>%
    # glimpse
    # filter(ensembl_gene_id=="ENSMUSG00000000120")
        left_join(select(exDesign, replicate, condition)) %>%
        filter(condition_1 == condition | condition_2 == condition)

    na_counts %>%
        filter(ensembl_gene_id %in% unique(ensembl_gene_id)[1 : 6]) %>%
        ggplot(aes(condition, norm_count, label = replicate)) +
        geom_point() +
        geom_text() +
        facet_wrap(~ ensembl_gene_id, scales = "free_y") +
        ggtitle("exmple gene counts by replicate")

    na_counts %>%
        group_by(ensembl_gene_id) %>%
        filter(norm_count == max(norm_count)) %>%
        ggplot(aes(replicate)) +
        geom_bar() +
        coord_flip() +
        facet_grid(condition ~ ., scales = "free_y") +
        ylab("Genes without p-value in which sample has largest count") +
        ggtitle("Which replicates are causing most of the NAs")
}

########################################################################################################################
#' ## Hits Summary

Holger Brandl's avatar
Holger Brandl committed
854
# Extract hits from deseq results
Lena Hersemann's avatar
Lena Hersemann committed
855
856
degs = deAnnot %>% filter(is_hit)

857
# ggplot(degs, aes(paste(condition_1, "vs",  condition_2))) + geom_bar() +xlab(NULL) + ylab("# DBGs") + ggtitle("DBG count summary") + coord_flip()
Lena Hersemann's avatar
Lena Hersemann committed
858

Holger Brandl's avatar
Holger Brandl committed
859
# degs %>%
860
#     ggplot(aes(paste(condition_1, "vs",  condition_2), fill=c1_overex)) +
Holger Brandl's avatar
Holger Brandl committed
861
862
863
864
865
866
867
#     geom_bar(position="dodge") +
#     xlab(NULL) + ylab("# DBGs") +
#     ggtitle("DBG count summary by direction of expression") +
#     coord_flip()

# Export DBA genes
## disabled because we just subset the annotated data now to define degs
868
#degsAnnot = degs %>%
Holger Brandl's avatar
Holger Brandl committed
869
870
871
872
#    inner_join(normCounts) %>%
#    left_join(geneInfo) %>%
#    left_join(bindCats)
degs %>% write_tsv(paste0(resultsBase, "diffex_genes.txt"))
873
#degs = read_tsv(paste0(resultsBase, "diffex_genes.txt"))
Holger Brandl's avatar
Holger Brandl committed
874
875
876
877
878
879
#' [Differentially Expressed Genes](`r paste0(resultsBase, "diffbind_genes.txt")`)


#unloadNamespace("dplyr"); require(dplyr)

## export slim hit list for downstream enrichment analysis
880
degs %>%
herseman's avatar
herseman committed
881
    transmute(ensembl_gene_id, contrast = paste(condition_1, "vs", condition_2)) %>%
882
    write_tsv(paste0(resultsBase, "degs_by_contrast.txt"))
Holger Brandl's avatar
Holger Brandl committed
883

884
degs %>%
885
    transmute(ensembl_gene_id, contrast = paste(condition_1, if_else(c1_overex, ">", "<"), condition_2)) %>%
886
887
    write_tsv(paste0(resultsBase, "degs_by_contrast_directed.txt"))

Holger Brandl's avatar
Holger Brandl committed
888
## also export gsea inputs
889
890
891
892
893
894
#deAnnot %>%
#    mutate(contrast = paste(condition_1, "vs", condition_2)) %>%
#    filter(! is.na(pvalue)) %>%
#    transmute(contrast, ensembl_gene_id, rank_score = - log10(pvalue)) %>% #count(is.na(rank_score))
##    arrange(contrast, rank_score) %>%
#    write_tsv(paste0(resultsBase, "gsea__genes_by_contrast__undirected.txt"))
Holger Brandl's avatar
Holger Brandl committed
895

896
897
898
#deAnnot %>%
#    mutate(contrast = paste(condition_1, "vs", condition_2)) %>%
#    filter(! is.na(pvalue)) %>%
899
#    transmute(contrast, ensembl_gene_id, rank_score = - log10(pvalue) * ifelse(c1_overex, 1, - 1)) %>%
900
901
##    arrange(contrast, rank_score) %>%
#    write_tsv(paste0(resultsBase, "gsea__genes_by_contrast__directed.txt"))
Holger Brandl's avatar
Holger Brandl committed
902
903
904
905
906
907
908
909
910


# ## render interactive hit browser
# #+results='asis'
# degs %>%
#     left_join(geneInfo) %>%
#     mutate(
#         igv=paste0("<a href='http://localhost:60151/goto?locus=", chromosome_name,":", start_position, "-", end_position, "'>IGV</a>")
#     ) %>%
911
#     select(c1_over_c2_logfc, pvalue, ensembl_gene_id, condition_1, condition_2, external_gene_name, description, igv) %>%
Holger Brandl's avatar
Holger Brandl committed
912
913
914
# #    kable("html", table.attr = "class='dtable'", escape=F)
#     datatable(escape=F)

Lena Hersemann's avatar
Lena Hersemann committed
915
#+ fig.height=nrow(contrasts)/2+2, eval=nrow(degs)>0
916
917
#ggplot(degs, aes(paste(condition_1, "vs",  condition_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DEGs by contrast") + coord_flip()
ggplot(degs, aes(paste(condition_1, "vs", condition_2), fill = (c1_overex))) +
918
919
920
921
922
    geom_bar() +
    xlab(NULL) +
    ylab("# DGEs") +
    ggtitle("DEGs by contrast") +
    coord_flip()
Holger Brandl's avatar
Holger Brandl committed
923

924
#with(degs, as.data.frame(table(condition_1, condition_2, c1_overex))) %>% filter(Freq >0) %>% kable()
925
degs %>%
926
    count(condition_1, condition_2, c1_overex) %>%
Holger Brandl's avatar
Holger Brandl committed
927
    mutate(c1_overex = ifelse(c1_overex, ("Up in condition_1"), "Down in condition_1")) %>%
928
    spread(c1_overex, n) %>%
929
    kable()
Holger Brandl's avatar
Holger Brandl committed
930
931
932
933
934
935
936


#'  DEGs (differentially expressed genes) are contained in the following table


#' DEGs can be browsed in Excel using the exported table or via the embedded table browser. To enable the IGV links, you need to set the port in the IGV preferences to 3334.
#+ results='asis'
Holger Brandl's avatar
Holger Brandl committed
937
938
# if (nrow(degs) < 3000) {
degs %>%
939
940
#inner_join(geneLoci) %>%
    mutate(
941
942
    ensembl = paste0("<a href='http://www.ensembl.org/Multi/Search/Results?y=0;site=ensembl_all;x=0;page=1;facet_feature_type=Gene;q=", ensembl_gene_id, "'>", ensembl_gene_id, "</a>"),
    igv = paste0("<a href='http://localhost:60151/goto?locus=", chromosome_name, ":", start_position, "-", end_position, "'>IGV</a>")
943
    ) %>%
944
    select(condition_1, condition_2, c1_over_c2_logfc, pvalue, padj, ensembl_gene_id, external_gene_name, description) %>%
Holger Brandl's avatar
Holger Brandl committed
945
#  kable("html", table.attr = "class='dtable'", escape=F)