featcounts_deseq_mf.R 43.8 KB
Newer Older
Holger Brandl's avatar
Holger Brandl committed
1
#!/usr/bin/env Rscript
Holger Brandl's avatar
Holger Brandl committed
2

Holger Brandl's avatar
Holger Brandl committed
3 4 5
#' # Differential Expression Analysis
#'
#' Contents
Holger Brandl's avatar
Holger Brandl committed
6
#'
Holger Brandl's avatar
Holger Brandl committed
7 8 9 10 11 12 13 14 15
#' - [Data Preparation](#data-preparation)
#' - [Quality Control](#quality-control)
#'     - [Size Factors](#size-factors)
#'     - [Global Dispersion Model](#global-dispersion-model)
#' - [PCA and Clustering](#pca-and-clustering)
#' - [Differential Expression Test](#differential-expression-test)
#'     - [MA and Volcano plots](#ma-and-volcano-plots)
#'     - [Count Outlier Analysis](#count-outlier-analysis)
#' - [Hits Summary](#hits-summary)
Holger Brandl's avatar
Holger Brandl committed
16
#' - [Exported Data](#exported-data)
Holger Brandl's avatar
Holger Brandl committed
17
#'
Holger Brandl's avatar
Holger Brandl committed
18

Holger Brandl's avatar
Holger Brandl committed
19
# to rebuild toc do knitr::spin("featcounts_deseq_mf.R", FALSE)
Holger Brandl's avatar
Holger Brandl committed
20

Holger Brandl's avatar
Holger Brandl committed
21 22 23
#' Created by: `r system("whoami", intern=T)`
#'
#' Created at: `r format(Sys.Date(), format="%B %d %Y")`
Holger Brandl's avatar
Holger Brandl committed
24

Holger Brandl's avatar
Holger Brandl committed
25

Holger Brandl's avatar
Holger Brandl committed
26
#+ include=FALSE
Holger Brandl's avatar
Holger Brandl committed
27 28
suppressMessages(require(docopt))

29
doc = '
Holger Brandl's avatar
Holger Brandl committed
30
Perform a differential gene expression analysis using deseq2
Holger Brandl's avatar
Holger Brandl committed
31 32 33
Usage: featcounts_deseq_mf.R [options] <count_matrix> <design_matrix>

Options:
Holger Brandl's avatar
Holger Brandl committed
34 35 36 37
--contrasts=<tab_delim_table>   Table with sample pairs for which dge analysis should be performed
--qcutoff <qcutoff>             Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01]
--pcutoff <pcutoff>             Override q-value filter and filter by p-value instead
--min_count <min_count>         Minimal expression in any of the sample to be included in the final result list [default: 10]
38
--out <name_prefix>             Name to prefix all generated result files
herseman's avatar
herseman committed
39
--design <formula>              Design fomula for DeSeq with contrast attribute at the end [default: condition]
40
--lfc <lfc_cutoff>              Just report genes with abs(lfc) > lfc_cutoff as hits [default: 1.0]
41
--ensembl_db <ensembl_db>       Ensebmbl db to be used. If not specified inferred from data. TODO implement NONE here!!
42
--gene_info <info_file>         Gene information file downloaded from the ensembl website if bioMart version is not present
43
--betaprior <boolean>           Apply betaprior when running DESeq2 [default: TRUE]
44
--sub_data <boolean>            Subset de_results by contrasts and store the data in a subfolder [default: FALSE]
Holger Brandl's avatar
Holger Brandl committed
45 46
'

47
#commandArgs <- function(x) c("--design", "'batch+condition'", "--contrasts", "../contrasts.txt star_counts_matrix_reduced_sampleset.txt", "basic_design_reduced_sampleset.txt")
48
#commandArgs <- function(x) c("--design", "'condition'", "--contrasts", "../contrasts.txt", "--sub_data", "TRUE", "../alignments/star_counts_matrix.txt", "../basic_design.txt")
herseman's avatar
herseman committed
49

50
opts = docopt(doc, commandArgs(TRUE))
Holger Brandl's avatar
Holger Brandl committed
51 52


Holger Brandl's avatar
Holger Brandl committed
53 54
## load some packages first because of name-space order
library(knitr)
55 56
#source("https://bioconductor.org/biocLite.R")
#biocLite("DESeq2")
Holger Brandl's avatar
Holger Brandl committed
57
library(DESeq2)
58
#load_pack(readr)
59
library(IHW)
Holger Brandl's avatar
Holger Brandl committed
60 61


62 63
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.45/R/core_commons.R")
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.45/R/ggplot_commons.R")
64

65 66 67
## also load common helper for expression data analysis
# source(interp_from_env("${NGS_TOOLS}/dge_workflow/diffex_commons.R"))
devtools::source_url("https://git.mpi-cbg.de/bioinfo/ngs_tools/raw/v6/dge_workflow/diffex_commons.R")
Holger Brandl's avatar
Holger Brandl committed
68

Holger Brandl's avatar
Holger Brandl committed
69 70

## process input arguments
71 72 73
count_matrix_file = opts$count_matrix
design_matrix_file = opts$design_matrix
contrasts_file = opts$contrasts
74
use_betaPrior = as.logical(opts$betaprior)
75
sub_data = as.logical(opts$sub_data)
76 77
gene_info_file = opts$gene_info
assert(is.null(gene_info_file) || file.exists(gene_info_file), "invalid gene_info_file")
Holger Brandl's avatar
Holger Brandl committed
78

79
resultsBase = if (!is.null(opts$out)) paste0(opts$out, ".") else ""
Holger Brandl's avatar
Holger Brandl committed
80

81 82 83 84 85
pcutoff = if (is.null(opts$pcutoff))NULL else as.numeric(opts$pcutoff)
qcutoff = if (is.numeric(pcutoff))NULL else as.numeric(opts$qcutoff)
if (is.numeric(pcutoff))opts$qcutoff = NULL

lfc_cutoff = if (is.null(opts$lfc))0 else as.numeric(opts$lfc)
Holger Brandl's avatar
Holger Brandl committed
86

Holger Brandl's avatar
Holger Brandl committed
87
## extract the sample for the design
88
designFormula = opts$design
Holger Brandl's avatar
Holger Brandl committed
89
## consider last element of design formula as sample attribute
90 91 92
contrastAttribute = str_split(designFormula, "[+]") %>%
    unlist() %>%
    tail(1)
93 94
# assert(designFormula == "condition" || ! is.null(design_matrix_file), "custom designs are not supported without a design matrix")
assert(str_detect(designFormula, ".*condition$"))
Holger Brandl's avatar
Holger Brandl committed
95

Holger Brandl's avatar
Holger Brandl committed
96

Holger Brandl's avatar
Holger Brandl committed
97 98

#' Run configuration was
99 100 101
vec_as_df(unlist(opts)) %>%
    filter(! str_detect(name, "^[<-]")) %>%
    kable()
Holger Brandl's avatar
Holger Brandl committed
102

103
vec_as_df(unlist(opts)) %>%
104 105
    filter(! str_detect(name, "^[<-]")) %>%
    write_tsv(".run_configuration.txt")
106

Holger Brandl's avatar
Holger Brandl committed
107 108 109 110 111 112 113 114 115
########################################################################################################################
#' ## Data Preparation

#' The detection of differentially expressed genes is performed using the R package DEseq2 (http://bioconductor.org/packages/release/bioc/html/DESeq2.html), which requires a count matrix as input.
#' The count data are presented as a table which reports, for each sample, the number of sequence fragments that have been assigned to each gene.
#' An important analysis question is the quantification and statistical inference of systematic changes between conditions, as compared to within-condition variability.

#' Working Directory: `r getwd()`

116
countData = read_tsv(count_matrix_file)
Holger Brandl's avatar
Holger Brandl committed
117 118

## save a backup into the current working directory as a reference
Holger Brandl's avatar
Holger Brandl committed
119
countData %>% write_tsv(paste0(resultsBase, "input_counts.txt"))
Holger Brandl's avatar
Holger Brandl committed
120 121

#' Structure of input matrix was
122
glimpse(countData, width = 60)
Holger Brandl's avatar
Holger Brandl committed
123

124 125 126
countMatrix = countData %>%
    column2rownames("ensembl_gene_id") %>%
    as.matrix()
Holger Brandl's avatar
Holger Brandl committed
127 128

# Expression Filtering
129
genesBefore = nrow(countMatrix)
Lena Hersemann's avatar
Lena Hersemann committed
130
countMatrix %<>% filterByExpression(as.integer(opts$min_count))
131
genesAfter = nrow(countMatrix)
Lena Hersemann's avatar
Lena Hersemann committed
132

Holger Brandl's avatar
Holger Brandl committed
133 134
#' Counts were filtered to only retain genes which had more that `r opts$min_count` alignments in at least one replicate. This reduced the number of genes from `r genesBefore` to `r genesAfter`.

135
## todo this is required now , so why all the checking
136
if (! is.null(design_matrix_file)) {
137
    replicates2samples = read_tsv(design_matrix_file) #%>% select(replicate, sample)
138
}else {
herseman's avatar
herseman committed
139
    warning("DEPRECATED extracting conditions from replicates is bad practise and a design file should be provided instead")
140
    get_sample_from_replicate = function(repName) str_match(repName, "(.*)_([R]|rep)?[0-9]{1,2}$")[, 2]
Holger Brandl's avatar
Holger Brandl committed
141

herseman's avatar
herseman committed
142
    replicates2samples = data_frame(replicate = colnames(countMatrix)) %>% mutate(condition = get_sample_from_replicate(replicate)) # %>% distinct_all()
Holger Brandl's avatar
Holger Brandl committed
143
}
Holger Brandl's avatar
Holger Brandl committed
144

145 146
write_tsv(replicates2samples, "basic_design.txt")

Holger Brandl's avatar
Holger Brandl committed
147
# Define or load a contrasts matrix
148
if (! is.null(contrasts_file)) {
149
    contrasts = read_tsv(contrasts_file)
150 151
}else {
    assert(! is.null(replicates2samples), "Can not auto-create contrasts without design matrix") ## cant work yet because sample column with have random name
herseman's avatar
herseman committed
152 153
    contrasts = distinct_all(replicates2samples, condition) %>%
        select(condition) %>%
154
        merge(., ., suffixes = c("_1", "_2"), by = NULL) %>%
herseman's avatar
herseman committed
155
        filter(ac(condition_1) > ac(condition_2)) %>%
156
    #        filter(ac(condition_1)!=ac(condition_2)) %>%
Holger Brandl's avatar
Holger Brandl committed
157
        fac2char()
Holger Brandl's avatar
Holger Brandl committed
158 159

    write_tsv(contrasts, paste(resultsBase, "contrasts.txt"))
Holger Brandl's avatar
Holger Brandl committed
160 161
}

Holger Brandl's avatar
Holger Brandl committed
162 163

#' The used design matrix is
Holger Brandl's avatar
Holger Brandl committed
164
datatable(replicates2samples)
Holger Brandl's avatar
Holger Brandl committed
165

Holger Brandl's avatar
Holger Brandl committed
166 167 168 169 170
#' The used design formula is.
designFormula

#' For more information about designs see section 1.6 "multi-factor designs" in [deseq-manual](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf)

Holger Brandl's avatar
Holger Brandl committed
171
#' The contrasts of interest are
Holger Brandl's avatar
Holger Brandl committed
172 173 174 175 176 177
contrasts %>% kable()

## gene specific factors
#normalizationFactors(dds)

## sizeFactor R help:sigEnrResults
178 179
#dds = makeExampleDESeqDataSet()
#dds = estimateSizeFactors( dds )
Holger Brandl's avatar
Holger Brandl committed
180 181 182
#sizeFactors(dds)

## see "3.11 Sample-/gene-dependent normalization factors" in the DEseq2 manual for details
Holger Brandl's avatar
Holger Brandl committed
183 184

## old non-mf approach
185
#exDesign = data.frame(condition=colnames(countMatrix) %>% get_sample_from_replicate)
Holger Brandl's avatar
Holger Brandl committed
186 187

## new mf-approach
188 189
exDesign = data_frame(replicate = colnames(countMatrix)) %>%
    mutate(col_index = row_number()) %>%
190
    right_join(replicates2samples, by = "replicate") %>%
191
    arrange(col_index) #%>% transmute(condition=condition_2ndwt) %>% fac2char
Holger Brandl's avatar
Holger Brandl committed
192

193
assert(! any(is.na(exDesign$col_index)), "assay design file is not compatible to count matrix column names")
194

195 196 197 198 199
exDesign %>% ggplot(aes(condition)) +
    geom_bar() +
    ylab("# samples") +
    coord_flip() +
    ggtitle("samples per condition")
200

Holger Brandl's avatar
Holger Brandl committed
201
## infer the sample to be used from the formula
202
#designFormula = "condition_2ndwt + batch"
Holger Brandl's avatar
Holger Brandl committed
203 204


205
#exDesign %<>% rename(sample, condition_2ndwt)
Holger Brandl's avatar
Holger Brandl committed
206

207
#exDesign = replicates2samples %>%  arrange(colnames(countMatrix))
Holger Brandl's avatar
Holger Brandl committed
208 209

# valiadate that contrasts model is compatible with data
210 211
assert(setequal(names(contrasts), c("condition_1", "condition_2")), "contrasts matrix has incorrect column headers. Expected: condition_1, condition_2")

212 213 214 215 216 217
designSamples = as_df(exDesign)[, contrastAttribute] %>%
    unique %>%
    sort
contrastSamples = contrasts %>% gather %$% value %>%
    unique %>%
    sort
218

219 220 221
if (! all(contrastSamples %in% designSamples)) {
    warning("design   : ", paste(designSamples, collapse = ", "))
    warning("contrasts: ", paste(contrastSamples, collapse = ", "))
222
    stop("experiment design is not consistent with contrasts")
Holger Brandl's avatar
Holger Brandl committed
223 224
}
#stopifnot(all((contrasts %>% gather %$% value %>% unique) %in% exDesign$condition))
Holger Brandl's avatar
Holger Brandl committed
225

Holger Brandl's avatar
Holger Brandl committed
226
#http://www.cookbook-r.com/Formulas/Creating_a_formula_from_a_string/
227 228 229
dds = DESeqDataSetFromMatrix(countMatrix, exDesign, formula(as.formula(paste("~", designFormula))))
#dds = estimateSizeFactors(dds)
#dds = estimateDispersions(dds)
230 231 232
# dds = DESeq(dds, parallel = T)

dds = DESeq(dds, parallel = T, betaPrior = use_betaPrior)
Holger Brandl's avatar
Holger Brandl committed
233

234

Holger Brandl's avatar
Holger Brandl committed
235 236


Holger Brandl's avatar
Holger Brandl committed
237 238 239
########################################################################################################################
#' ## Quality Control

240 241
#' ### Size Factors

Holger Brandl's avatar
Holger Brandl committed
242 243 244 245
#' For details about size factor normalziation and calculation see https://www.biostars.org/p/79978/
#'
#' From the DESeq docs about how size factors are used: The sizeFactors vector assigns to each column of the count matrix a value, the size factor, such that count values in the columns can be brought to a common scale by dividing by the corresponding size factor. Counts are divied by size factors.

246
sizeFactors(dds) %>%
Holger Brandl's avatar
Holger Brandl committed
247 248
#    set_names(colnames(countMatrix)) %>%
    vec_as_df("replicate", "size_factor") %>%
249 250
    ggplot(aes(replicate, size_factor)) +
    geom_bar(stat = "identity") +
Holger Brandl's avatar
Holger Brandl committed
251
    ggtitle("Size Factors estimated by DESeq") +
252
    coord_flip()
253 254 255 256 257

# From DESeq manual: The regularized log transformation is preferable to the variance stabilizing transformation if the size factors vary widely.



Holger Brandl's avatar
Holger Brandl committed
258
#' ### Global Dispersion Model
Holger Brandl's avatar
Holger Brandl committed
259 260 261 262 263 264

#' Plot of the per-gene dispersion estimates together with the fitted mean-dispersion relationship.
#' The dispersion plot shows how the estimates are shrunk from the gene wise values (black dots) toward the fitted estimates, with the final values used in testing being the blue dots.
#' The dispersion can be understood as the square of the coefficient of biological variation.  So, if a gene's expression typically differs from replicate to replicate sample by 20%, this gene's dispersion is: .20^2 = .04.
## The function estimateDispersions performs three steps. First, it estimates, for each gene and each (replicated) condition, a dispersion value, then, it fits, for each condition, a curve through the estimates. Finally, it assigns to each gene a dispersion value, using either the estimated or the fitted value.
#' The estimation of dispersions performs three steps. First, it estimates, for each gene and each (replicated) condition, a dispersion value, then, it fits, for each condition, a curve through the estimates. Finally, it assigns to each gene a dispersion value, using either the estimated or the fitted value.
Holger Brandl's avatar
Holger Brandl committed
265
plotDispEsts(dds, main = "Dispersion Model")
266

Holger Brandl's avatar
Holger Brandl committed
267 268


Holger Brandl's avatar
Holger Brandl committed
269
########################################################################################################################
Holger Brandl's avatar
Holger Brandl committed
270
#' ## PCA and Clustering
Holger Brandl's avatar
Holger Brandl committed
271 272

#' In order to assess overall similarity between samples two common statistical methods are used - Principal component analysis (PCA) and clustering.
herseman's avatar
herseman committed
273
#' This should provide answers to the questions: Which samples are similar to each other, which are different? Does this fit to the expectation from the experiment's design?
Holger Brandl's avatar
Holger Brandl committed
274 275 276 277 278 279 280
#' Using a regularized log transformation of the raw counts provides the advantage that it stabilizes the variance across the mean.


#' Principal component analysis (PCA) is a technique used to emphasize variation and bring out strong patterns in a dataset.
#' Principal components are the underlying structure in the data. They are the directions where there is the most variance, the directions where the data is most spread out. The data points/samples are projected onto the 2D plane such that they spread out in the two directions that explain most of the differences. The x-axis is the direction that separates the data points the most and the y-axis is a direction that separates the data the second most.

# Regularized log transformation for clustering/heatmaps, etc
281
rld = rlogTransformation(dds)
282
# vst = vst(dds) ## vst is supposed to be much 1k x faster
Holger Brandl's avatar
Holger Brandl committed
283

Holger Brandl's avatar
Holger Brandl committed
284 285
# extract rlogTransformation normalized count data
norm_counts <- assay(rld)
Holger Brandl's avatar
Holger Brandl committed
286 287


288 289 290
load_pack(limma)
load_pack(stringi)
load_pack(htmltools)
291 292
load_pack(plotly)
# install.packages("htmlwidgets")
293 294 295 296 297 298 299 300

#for (i in {designFormula %>% str_split("[+]") %>% unlist}) {
#    condition = pull(exDesign, i)
#    names(condition) = exDesign$replicate
#    # suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print))
#    makePcaPlot(t(norm_counts), color_by = condition, scale = T, title = "DESeq normalized count data WITHOUT batch correction") %>% ggplotly() %>% print
#}

Lena Hersemann's avatar
Lena Hersemann committed
301
effects = designFormula %>% str_split("[+]") %>% unlist
302 303 304 305 306 307 308 309 310


htmltools::tagList(lapply(effects, function(batchFactor) {
    # batchFactor=effects[1]
    condition = pull(exDesign, batchFactor)
    names(condition) = exDesign$replicate
    # suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print))
    {
        makePcaPlot(t(norm_counts), color_by = condition, scale = T, title = "DESeq normalized count data WITHOUT batch correction") +
Lena Hersemann's avatar
Lena Hersemann committed
311
        guides(color = guide_legend(title = batchFactor))
312 313 314 315 316
    } %>% ggplotly()
}))


# extract batch effect variables from the design formula and the corresponding data from the exDesign data.frame
Lena Hersemann's avatar
Lena Hersemann committed
317 318 319 320 321
adjvar <- designFormula %>%
    str_split("[+]") %>%
    unlist %>%
    stri_subset_regex("condition", negate = T)
adjvar_data <- data.frame(exDesign[, which(names(exDesign) %in% head(adjvar))])
322 323 324



325 326
treatDesign = exDesign %$% model.matrix(~ condition)

Lena Hersemann's avatar
Lena Hersemann committed
327
if (length(adjvar_data) == 1) {
328
    corr_data <- limma::removeBatchEffect(norm_counts, batch = adjvar_data[, 1], design = treatDesign)
Lena Hersemann's avatar
Lena Hersemann committed
329
}else if (length(adjvar_data) == 2) {
330
    corr_data <- removeBatchEffect(norm_counts, batch = adjvar_data[, 1], batch2 = adjvar_data[, 2], design = treatDesign)
Lena Hersemann's avatar
Lena Hersemann committed
331
}else if (length(adjvar_data) > 2) {
332
    print("Please note, the number of adjustment variables exceeds the maximal number of removable batch effects.")
Lena Hersemann's avatar
Lena Hersemann committed
333
}else {
334
    print("The design formula does not contain an adjustment variable for batch effect correction")
335 336 337 338 339 340 341 342 343 344 345
}


# for (i in colnames(adjvar_data)) {
#for (i in {designFormula %>% str_split("[+]") %>% unlist}) {
#    condition = pull(exDesign, i)
#    names(condition) = exDesign$replicate
#    # suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print))
#    makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print
#}

Lena Hersemann's avatar
Lena Hersemann committed
346 347 348
if (exists("corr_data")) {
    htmltools::tagList(lapply(effects, function(batchFactor) {
        # batchFactor=effects[1]
349
        # batchFactor="condition"
Lena Hersemann's avatar
Lena Hersemann committed
350 351 352 353
        condition = pull(exDesign, batchFactor)
        names(condition) = exDesign$replicate
        # suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print))
        {
354
            makePcaPlot(t(corr_data), color_by = condition, scale = F, title = "DESeq normalized count data WITH batch correction") +
Lena Hersemann's avatar
Lena Hersemann committed
355 356 357 358 359 360
            guides(color = guide_legend(title = batchFactor))
        } %>% ggplotly()
    }))
}else {
    print("No batch correction was done, as the design formula does not contain an adjustment variable for batch effect correction")
}
361 362


Holger Brandl's avatar
Holger Brandl committed
363 364 365 366
#' The Euclidean distance between samples are calculated after performing the regularized log transformation.
#' Using the calculated distance matrix, the samples are projected on a two-dimensional graph such that the distance between samples approximately corresponds to the biological coefficient of variation between those samples.
#' More information can be found here: https://en.wikipedia.org/wiki/Principal_component_analysis

367 368 369 370 371
#distsRL = dist(t(assay(rld)))
#hc = hclust(distsRL)
#distMatrix = as.matrix(distsRL)
distMatrix = as.matrix(dist(t(assay(rld))))
#rownames(distMatrix) = colnames(distMatrix) = with(exDesign(dds), paste(condition, sep=" : "))
Holger Brandl's avatar
Holger Brandl committed
372

Holger Brandl's avatar
Holger Brandl committed
373
countData %>% distinct(ensembl_gene_id)
374
labelcntData = countData %>%
375 376 377 378 379
    distinct_all(ensembl_gene_id) %>%
    column2rownames("ensembl_gene_id") %>%
    data.matrix() %>%
    round() %>%
    data.frame()
Holger Brandl's avatar
Holger Brandl committed
380

381 382 383
# load_pack(gplots)
# heatmap.2(distMatrix, labRow = colnames(labelcntData), labCol = colnames(labelcntData),
# symm = TRUE, trace = "none",
384
#key=F, col=colorpanel(100, "black", "white"),
385
# margin = c(8, 8), main = "Sample Distance Matrix")
Holger Brandl's avatar
Holger Brandl committed
386

387 388
rownames(distMatrix) = colnames(labelcntData)
colnames(distMatrix) = colnames(labelcntData)
Holger Brandl's avatar
Holger Brandl committed
389 390 391 392

# load_pack(d3heatmap)
load_pack(heatmaply)

Holger Brandl's avatar
Holger Brandl committed
393 394
## todo hide column dendrogram but keep cluster-order
#distMatrix %>% d3heatmap(xaxis_height=1, Colv = T, dendrogram="row")
Lena Hersemann's avatar
Lena Hersemann committed
395
print("DESeq normalized count data WITHOUT batch correction")
Holger Brandl's avatar
Holger Brandl committed
396 397
# distMatrix %>% d3heatmap(xaxis_height = 1)
distMatrix %>% heatmaply(colors=c("red","lightblue"),showticklabels=c(F,T))
Holger Brandl's avatar
Holger Brandl committed
398

Holger Brandl's avatar
Holger Brandl committed
399

Lena Hersemann's avatar
Lena Hersemann committed
400 401 402
if (exists("corr_data")) {
    print("DESeq normalized count data WITH batch correction")
    corr_distMatrix = as.matrix(dist(t(corr_data)))
Holger Brandl's avatar
Holger Brandl committed
403 404
    # corr_distMatrix %>% d3heatmap(xaxis_height = 1)
    corr_distMatrix %>% heatmaply(colors=c("red","lightblue"),showticklabels=c(F,T))
Lena Hersemann's avatar
Lena Hersemann committed
405 406 407
}else {
    # print("No batch correction was done, as the design formula does not contain an adjustment variable for batch effect correction")
}
Holger Brandl's avatar
Holger Brandl committed
408

409 410
## see http://www.bioconductor.org/help/workflows/rnaseqGene/
## DEBUG-START
411
# if (F) {
Holger Brandl's avatar
Holger Brandl committed
412
# assay(rld) %>% head
413 414
load_pack("genefilter")
load_pack("pheatmap")
415 416
mostSIg <- head(order(rowVars(assay(rld)), decreasing = TRUE), 20)
mat <- assay(rld)[mostSIg,]
417 418 419 420 421 422 423
mat <- mat - rowMeans(mat)

# df <- as.data.frame(colData(rld)[,c("cell","dex")])
#pheatmap(mat, annotation_col=df)
# pheatmap(mat)
pheatmap(mat, annotation_col = column2rownames(exDesign, "replicate") %>% select(- col_index))
# }
424 425
## DEBUG-END

Holger Brandl's avatar
Holger Brandl committed
426

Holger Brandl's avatar
Holger Brandl committed
427
########################################################################################################################
Holger Brandl's avatar
Holger Brandl committed
428
#' ## Differential Expression Test
Holger Brandl's avatar
Holger Brandl committed
429 430 431 432 433 434 435 436 437 438 439 440
#'
#' DESeq is an R package to analyse count data from high-throughput sequencing assays such as RNA-Seq and test for differential expression. The latest version, DESeq2, is used.
#' DESeq uses a model based on the negative binomial distribution and offers the following features:
#' Count data is discrete and skewed and is hence not well approximated by a normal distribution. Thus, a test based on the negative binomial distribution, which can reflect these properties, has much higher power to detect differential expression.
#' Tests for differential expression between experimental conditions should take into account both technical and biological variability. Recently, several authors have claimed that the Poisson distribution can be used for this purpose. However, tests based on the Poisson assumption (this includes the binomial test and the chi-squared test) ignore the biological sampling variance, leading to incorrectly optimistic p values. The negative binomial distribution is a generalisation of the Poisson model that allows to model biological variance correctly.
#' DESeq estimate the variance in a local fashion, using different coefficients of variation for different expression strengths. This removes potential selection biases in the hit list of differentially expressed genes, and gives a more balanced and accurate result.
#'
#' See deseq reference [docs](http://master.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf) for details.
#'
#' To understand fold-change shrinking and estimation check out
#' [Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2](http://genomebiology.com/2014/15/12/550/abstract)

Holger Brandl's avatar
Holger Brandl committed
441 442


Holger Brandl's avatar
Holger Brandl committed
443
# Run Deseq Test
444 445 446
#dds = DESeq(dds, fitType='local', betaPrior=FALSE)
#dds = DESeq(dds, fitType='local')
#dds = DESeq(dds)
Holger Brandl's avatar
Holger Brandl committed
447

448
#res = results(dds)
Holger Brandl's avatar
Holger Brandl committed
449 450 451
#resultsNames(dds)

#' Model Overview:
Holger Brandl's avatar
Holger Brandl committed
452

Holger Brandl's avatar
Holger Brandl committed
453 454 455 456
summary(results(dds))


## extract all de-sets according to our contrasts model
457 458
# install_package("plyr")
# load_pack(IHW) --> todo reenable once installation on macos is fixed (see https://support.bioconductor.org/p/92731/)
459

Holger Brandl's avatar
Holger Brandl committed
460 461 462 463
# low count filtering https://support.bioconductor.org/p/65256/
# independent filtering occurs within results() to save you from multiple test correction on genes with no power
# (see ?results and the vignette section about independent filtering, or the paper). T

464
#deResults = alply(contrasts, 1,  splat(function(condition_1, condition_2){
herseman's avatar
herseman committed
465
deResults = plyr::alply(contrasts, 1, plyr::splat(function(condition_1, condition_2){
Lena Hersemann's avatar
Lena Hersemann committed
466
    #DEBUG condition_1=as_df(contrasts)[1,1]; condition_2=as_df(contrasts)[1,2]
467
    # results(dds, contrast=c(contrastAttribute, condition_1, condition_2)) %>%
468 469
    results(dds, contrast = c(contrastAttribute, condition_1, condition_2), lfcThreshold = lfc_cutoff, filterFun = ihw) %>%
    # results(dds, contrast = c(contrastAttribute, condition_1, condition_2), lfcThreshold = lfc_cutoff) %>%
470 471 472 473 474
        as.data.frame() %>%
        mutate(ensembl_gene_id = rownames(.)) %>%
        push_left("ensembl_gene_id") %>%
        # as_tibble() %>%
        # tibble::rownames_to_column("ensembl_gene_id") %>%
Holger Brandl's avatar
Holger Brandl committed
475
    ## see http://rpackages.ianhowson.com/bioc/DESeq2/man/results.html when using contrasts argument
476
        rename(c1_over_c2_logfc = log2FoldChange) %>%
herseman's avatar
herseman committed
477
        mutate(condition_1 = condition_1, condition_2 = condition_2) #%>% head %>% print_head
478
})) %>% bind_rows
Holger Brandl's avatar
Holger Brandl committed
479

Holger Brandl's avatar
Holger Brandl committed
480

481 482
# deResults = adply(contrasts, 1,  splat(function(condition_1, condition_2){
#  echo('samples', condition_1, condition_2)
Holger Brandl's avatar
Holger Brandl committed
483 484 485
#   return(data.frame())
# }))

486
deResults %>% ggplot(aes(c1_over_c2_logfc)) +
487
    geom_histogram(binwidth = 0.1) +
herseman's avatar
herseman committed
488
    facet_grid(condition_1 ~ condition_2) +
489 490
    geom_vline(xintercept = 0, color = "blue") +
    xlim(- 3, 3) +
491
    ggtitle("condition_1 over condition_2 logFC ")
Holger Brandl's avatar
Holger Brandl committed
492 493 494 495 496


#deResults %>% ggplot(aes(pvalue)) +
#    geom_histogram() +
#    xlim(0,1) +
497
#    facet_grid(condition_1 ~ condition_2) + geom_vline(xintercept=0.01, slope=1, color="blue") +
Holger Brandl's avatar
Holger Brandl committed
498 499 500 501 502
#    ggtitle("p-value distributions") #+ scale_x_log10()
#
#deResults %>% ggplot(aes(padj)) +
#    geom_histogram() +
##    xlim(0,1) +
503
#    facet_grid(condition_1 ~ condition_2) + geom_vline(xintercept=0.01, slope=1, color="blue") +
Holger Brandl's avatar
Holger Brandl committed
504 505 506 507 508
#    ggtitle("Adjusted p-value distributions") #+ scale_x_log10()


# report hit criterion
#+ results='asis'
509
if (! is.null(qcutoff)) {
Holger Brandl's avatar
Holger Brandl committed
510
    echo("Using q-value cutoff of", qcutoff)
511 512
    deResults %<>% transform(is_hit = padj <= qcutoff)
}else {
Holger Brandl's avatar
Holger Brandl committed
513
    echo("Using p-value cutoff of", pcutoff)
514
    deResults %<>% transform(is_hit = pvalue <= pcutoff)
Holger Brandl's avatar
Holger Brandl committed
515 516 517 518
}
#deResults %<>% mutate(is_hit=pvalue<0.05)


519
# filter(deResults, ensembl_gene_id=="ENSMUSG00000034957")
Holger Brandl's avatar
Holger Brandl committed
520

521
deResults %<>% mutate(c1_overex = c1_over_c2_logfc > 0)
522 523 524


normCounts = counts(dds, normalized = TRUE) %>%
Holger Brandl's avatar
Holger Brandl committed
525
#    set_names(exDesign(dds)$condition) %>% rownames2column("ensembl_gene_id")
526 527 528
    set_names(colnames(countMatrix)) %>%
    rownames_to_column("ensembl_gene_id") %>%
    tbl_df
Holger Brandl's avatar
Holger Brandl committed
529 530


531
normCounts %>% write_tsv(paste0(resultsBase, "sizefac_normalized_counts_by_replicate.txt"))
Holger Brandl's avatar
Holger Brandl committed
532 533 534 535 536

## .. should be same as input
#filter(countData, ensembl_gene_id=="ENSDARG00000000001")

########################################################################################################################
Holger Brandl's avatar
Holger Brandl committed
537
#' ### MA and Volcano plots
Holger Brandl's avatar
Holger Brandl committed
538

herseman's avatar
herseman committed
539
#' MA-plot: The log2 fold change for a particular comparison is plotted on the y-axis and the average of the counts normalized by log2 is shown on the x-axis ("M" for minus, because a log ratio is equal to log minus log, and "A" for average). Each gene is represented with a dot. Genes with an adjusted p value below a certain threshold are shown in cyan (True).
Holger Brandl's avatar
Holger Brandl committed
540 541 542 543 544 545
#' This plot demonstrates that only genes with a large average normalized count contain sufficient information to yield a significant call.

# deseq approach
# plotMA(deResults, main="DESeq2", ylim=c(-2,2))

# Calculation of the base means per condition
Holger Brandl's avatar
Holger Brandl committed
546 547 548
#https://stat.ethz.ch/R-manual/R-devel/library/base/html/attr.html


549
meanNormCounts = counts(dds, normalized = TRUE) %>%
550 551
    as_df %>%
    rownames_to_column("ensembl_gene_id") %>%
552 553 554 555
    tbl_df %>%
    gather(replicate, norm_count, - ensembl_gene_id) %>%
    inner_join(exDesign) %>%
    group_by(ensembl_gene_id, condition) %>%
556 557
    summarize(mean_norm_count = mean(norm_count)) %>%
    ungroup()
558

Holger Brandl's avatar
Holger Brandl committed
559

herseman's avatar
herseman committed
560
## add base means to diff??ex summary
561
#numResultsBeforeMerge = nrow(deResults)
562 563
deResults = meanNormCounts %>%
    # gather(condition, norm_count, - ensembl_gene_id) %>%
564
    inner_join(., ., by = "ensembl_gene_id", suffix = c("_1", "_2")) %>%
565
#    filter(ac(condition_1)<ac(condition_2)) %>%
566
# add diffex status
567
    inner_join(deResults)
Holger Brandl's avatar
Holger Brandl committed
568 569

#+ fig.width=16, fig.height=14
570
deResults %>% ggplot(aes(0.5 * log2(mean_norm_count_1 * mean_norm_count_2), log2(mean_norm_count_2 / mean_norm_count_1), color = pvalue < 0.05)) +
571 572
    geom_point(alpha = 0.1) +
    geom_hline(yintercept = 0, color = "red") +
herseman's avatar
herseman committed
573
    facet_grid(condition_1 ~ condition_2)
Holger Brandl's avatar
Holger Brandl committed
574 575 576 577 578 579 580 581 582 583 584


#deResults %$% pvalue %>%  log10() %>% quantile(0.05, na.rm=T)

#' A volcano plot displays unstandardized signal (e.g. log-fold-change) against noise-adjusted/standardized signal (e.g. t-statistic or -log(10)(p-value) from the t-test).
#' This scatter plot is used to quickly identify changes in large datasets composed of replicate data.
#' Here we have log2-fold-change on the X-axis and -log10 of the p-value on the y-axis.
#' This results in datapoints with low p-values (highly significant) appearing toward the top of the plot. For the x-axis the log of the fold-change is used so that changes in both directions (left and right) appear equidistant from the center.
#' Therefore interesting points are those, which are found toward the top of the plot that are far to either the left- or the right-hand side. These represent values that display large magnitude fold changes (hence being left- or right- of center) as well as high statistical significance (hence being toward the top).
#' see: https://en.wikipedia.org/wiki/Volcano_plot_%28statistics%29)

585
maxX = quantile(deResults$c1_over_c2_logfc, c(0.005, 0.99), na.rm = TRUE) %>%
586 587 588
    abs %>%
    max

Holger Brandl's avatar
Holger Brandl committed
589
##Volcano plots
590
hitCounts = filter(deResults, is_hit) %>%
591
    count(condition_1, condition_2, c1_overex) %>%
592
    rename(hits = n) %>%
593 594 595 596
## complement hit counts with directed contrasts without any diffex genes
    right_join(merge(mutate(contrasts), data_frame(c1_overex = c(T, F)), by = NULL)) %>%
    mutate(hits = if_else(is.na(hits), as.integer(0), hits)) %>%
    merge(data.frame(c1_overex = c(T, F), x_pos = c(maxX - maxX * 0.1, - maxX + maxX * 0.1)))
Holger Brandl's avatar
Holger Brandl committed
597

598
hitCounts %<>% mutate(y_pos = quantile(- log10(deResults$pvalue), 0.987, na.rm = TRUE))
599

Holger Brandl's avatar
Holger Brandl committed
600
#+ fig.width=16, fig.height=14
601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616


## todo remove; part of commons v1.41
trim_outliers <- function(values, probs=c(0.05, 0.95)){
    values = deResults$pvalue
    stopifnot(length(probs) == 2)
    quantiles = quantile(values, probs, na.rm = TRUE)

    pmax(quantiles[1], pmin(quantiles[2], values))
}



deResults %>%
    mutate(p_trimmed = trim_outliers(pvalue, c(0.01, 1))) %>%
    filter(! is.na(is_hit)) %>%
617
    ggplot(aes(c1_over_c2_logfc, - log10(p_trimmed), color = is_hit)) +
618
# geom_jitter(alpha = 0.1, position = position_jitter(height = 0.2)) +
Holger Brandl's avatar
Holger Brandl committed
619
    geom_point(alpha = 0.1) +
Holger Brandl's avatar
Holger Brandl committed
620
#    theme_bw() +
621 622
    xlim(- maxX, maxX) +
    scale_color_manual(values = c("TRUE" = "red", "FALSE" = "black")) +
Holger Brandl's avatar
Holger Brandl committed
623
#    ggtitle("Insm1/Ctrl") +
624 625 626
## http://stackoverflow.com/questions/19764968/remove-point-transparency-in-ggplot2-legend
    guides(colour = guide_legend(override.aes = list(alpha = 1))) +
## tweak axis labels
Holger Brandl's avatar
Holger Brandl committed
627
    xlab(expression(log[2]("x/y"))) +
628
    ylab(expression(- log[10]("p value"))) +
Holger Brandl's avatar
Holger Brandl committed
629 630
#  ylim(0,50) +

631
## add hit couts
632
    geom_text(aes(label = hits, x = x_pos, y = y_pos), color = "red", size = 8, data = hitCounts) +
herseman's avatar
herseman committed
633
    facet_grid(condition_1 ~ condition_2)
Holger Brandl's avatar
Holger Brandl committed
634 635


Holger Brandl's avatar
Holger Brandl committed
636
########################################################################################################################
637
# Count Table Exports
Holger Brandl's avatar
Holger Brandl committed
638 639


Holger Brandl's avatar
Holger Brandl committed
640
## optionally install bioconducor package
Holger Brandl's avatar
Holger Brandl committed
641
install_package("biomaRt")
Holger Brandl's avatar
Holger Brandl committed
642

Holger Brandl's avatar
Holger Brandl committed
643

644

Holger Brandl's avatar
Holger Brandl committed
645
# Load transcriptome annotations needed for results annotation
646

647 648
if (is.null(gene_info_file)) {
    ensembl_dataset = if (! is.null(opts$ensembl_db)) paste0(opts$ensembl_db, "_gene_ensembl")  else guess_mart(countData$ensembl_gene_id)
649

650 651 652 653 654 655 656 657 658 659 660 661 662 663
    geneInfo = quote({
        ## mart = biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl"))??
        #  mart = biomaRt::useDataset(guess_mart(countData$ensembl_gene_id), mart = biomaRt::useMart("ensembl"))
        ## todo fix this https://support.bioconductor.org/p/74322/
        # mart = biomaRt::useDataset(guess_mart(countData$ensembl_gene_id), mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", host="www.ensembl.org"))
        # mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = "mmusculus_gene_ensembl", host = "dec2016.archive.ensembl.org", path = "/biomart/martservice", archive = FALSE)
        mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = ensembl_dataset, host = "aug2017.archive.ensembl.org", path = "/biomart/martservice", archive = FALSE)


        c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>%
            biomaRt::getBM(mart = mart) %>%
            tbl_df
    }) %>% cache_it("geneInfo")
} else {
664
    geneInfo = read_tsv(gene_info_file)
665 666 667
    colnames(geneInfo)[1] = "ensembl_gene_id"
}

Holger Brandl's avatar
Holger Brandl committed
668

669 670
geneLengths = transmute(geneInfo, ensembl_gene_id, gene_length = end_position - start_position)
geneDescs = transmute(geneInfo, ensembl_gene_id, gene_name = external_gene_name, gene_description = description)
herseman's avatar
herseman committed
671 672


Holger Brandl's avatar
Holger Brandl committed
673
## save as reference for downstream analysis
674
write_tsv(geneInfo, path = paste0(resultsBase, "geneInfo.txt"))
Holger Brandl's avatar
Holger Brandl committed
675 676


677
# Export counts per replicate as double-normalized FPKMs=(read_count * 10^9)/(gene_length * library_size)
678 679
fpkms = countMatrix %>%
    as.data.frame %>%
Holger Brandl's avatar
Holger Brandl committed
680
    rownames_to_column("ensembl_gene_id") %>%
681 682
    gather(replicate, num_tags, - ensembl_gene_id) %>%
    tbl_df %>%
683
    left_join(geneLengths, by = "ensembl_gene_id") %>%
Holger Brandl's avatar
Holger Brandl committed
684 685
## calculate normalized counts
    group_by(replicate) %>%
686
    mutate(lib_size = sum(num_tags), fpkm = ((1E9 / lib_size) * (num_tags / gene_length)) %>% round(digits = 3)) %>%
Holger Brandl's avatar
Holger Brandl committed
687 688 689 690 691 692
    ungroup()

#fpkms %>% ggplot(aes(fpkm)) + geom_histogram() + scale_x_log10(labels=comma)

fpkms %>%
    transmute(ensembl_gene_id, replicate, fpkm) %>%
Holger Brandl's avatar
Holger Brandl committed
693
    spread(replicate, fpkm) %>%
694 695 696
    left_join(geneDescs) %>%
    push_left(c("gene_name", "gene_description")) %>%
    write_tsv(paste0(resultsBase, "fpkms_by_replicate.txt"))
Holger Brandl's avatar
Holger Brandl committed
697

698
# Aggregate FPKMs by condition and export into tabele
699
fpkms %>%
700
    inner_join(exDesign, by = "replicate") %>%
701
# group_by_(.dots = c(contrastAttribute, "ensembl_gene_id")) %>%
702 703
#     group_by(!!contrastAttribute, ensembl_gene_id) %>%
    group_by(condition, ensembl_gene_id) %>%
704 705 706 707 708 709
    summarize(mean_fpkm = mean(fpkm)) %>%
    ungroup() %>%
    spread(condition, mean_fpkm) %>%
    left_join(geneDescs) %>%
    push_left(c("gene_name", "gene_description")) %>%
    write_tsv(paste0(resultsBase, "fpkms_by_condition.txt"))
Holger Brandl's avatar
Holger Brandl committed
710

711
# [FPKMs as Matrix](`r paste0(resultsBase, "fpkms_by_condition.txt")`)
Holger Brandl's avatar
Holger Brandl committed
712

Holger Brandl's avatar
Holger Brandl committed
713

714
# Export counts per replicate as TPMs (transcript per million)
715
# calculation of tpms and generation of a wide table including gene description and the external gene name
Holger Brandl's avatar
Holger Brandl committed
716

717 718 719 720 721 722 723 724 725 726
tpms = countMatrix %>%
    as.data.frame %>%
    rownames_to_column("ensembl_gene_id") %>%
    gather(replicate, num_tags, - ensembl_gene_id) %>%
    tbl_df %>%
    left_join(geneLengths) %>%
    mutate(length_norm_count = num_tags / gene_length) %>%
    group_by(replicate) %>%
    mutate(length_norm_sum = sum(length_norm_count, na.rm = T), tpm = (length_norm_count * (1E6 / length_norm_sum)) %>% round(digits = 3)) %>%
    ungroup()
Holger Brandl's avatar
Holger Brandl committed
727

728 729 730 731 732 733
tpms %>%
    select(ensembl_gene_id, replicate, tpm) %>%
    spread(replicate, tpm) %>%
    left_join(geneDescs) %>%
    push_left(c("gene_name", "gene_description")) %>%
    write_tsv(paste0(resultsBase, "tpms_by_replicate.txt"))
herseman's avatar
herseman committed
734

735
# Also average TPMs per condition and export
736 737 738 739 740 741 742 743 744
tpms %>%
    inner_join(exDesign, by = "replicate") %>%
    group_by(condition, ensembl_gene_id) %>%
# group_by(condition, ensembl_gene_id) %>%
    summarize(mean_tpm = mean(tpm)) %>%
# select(ensembl_gene_id, replicate, gene_name, gene_description, mean_tpm) %>%
    spread(condition, mean_tpm) %>%
    left_join(geneDescs) %>%
    push_left(c("gene_name", "gene_description")) %>%
Holger Brandl's avatar
Holger Brandl committed
745
    write_tsv(paste0(resultsBase, "tpms_by_condition.txt"))
herseman's avatar
herseman committed
746 747 748 749 750 751


#-----------------------------



Holger Brandl's avatar
Holger Brandl committed
752
# Export the complete dataset for later analysis
Holger Brandl's avatar
Holger Brandl committed
753
deAnnot = deResults %>% left_join(geneInfo)
754 755 756 757
#inner_join(normCounts) %>%
#merge(., contrasts) %>%
#merge(.,., suffixes=c("_1", "_2"), by=NULL) %>%
#inner_join(baseMeanPerLvl, id='ensembl_gene_id') %>%
Holger Brandl's avatar
Holger Brandl committed
758

Holger Brandl's avatar
Holger Brandl committed
759

760
write_tsv(deAnnot, path = paste0(resultsBase, "de_results.txt"))
761
#deAnnot = read_tsv(paste0(resultsBase, "de_results.txt"))
762
#  [de_results.txt](`r paste0(resultsBase, "de_results.txt")`)
Holger Brandl's avatar
Holger Brandl committed
763 764

## also export results which we'll always need for pathways overlays
765 766 767
deAnnot %>%
    transmute(
    ensembl_gene_id,
herseman's avatar
herseman committed
768
    contrast = paste(condition_1, "vs", condition_2),
769
    plot_score = - log10(pvalue) * ifelse(c1_overex, 1, - 1)
770 771
    ) %>%
    spread(contrast, plot_score) %>%
Holger Brandl's avatar
Holger Brandl committed
772
    write_tsv(paste0(resultsBase, "plot_score_matrix.txt"))
Holger Brandl's avatar
Holger Brandl committed
773 774 775 776

#deAnnot %>% filter(ensembl_gene_id=="FBgn0000015")


777
# subset data according to contrasts
Holger Brandl's avatar
Holger Brandl committed
778
if (sub_data) {
779 780 781
    dir.create("de_results")
    comp <- apply(contrasts, 1, paste0, collapse = "_vs_")
    lapply(comp, function(x) {
782 783 784 785 786
        deAnnot %>%
            mutate(contrast = paste(condition_1, condition_2, sep = "_vs_")) %>%
            filter(grepl(x, contrast)) %>%
            select(- contrast) %>%
            write_tsv(paste("de_results/", x, ".txt", sep = ""))
787
    })
788
}
789 790


Holger Brandl's avatar
Holger Brandl committed
791 792 793 794 795 796
## todo understand purpose and effeect of indpendentFiltering (see https://support.bioconductor.org/p/57128/)
#deResults %>% count(is_hit)
#deAnnot %>% count(is_hit)
#deAnnot %>% count(is.na(padj))


Holger Brandl's avatar
Holger Brandl committed
797

Lena Hersemann's avatar
Lena Hersemann committed
798
########################################################################################################################
Holger Brandl's avatar
Holger Brandl committed
799
#' ### Count Outlier Analysis
Lena Hersemann's avatar
Lena Hersemann committed
800 801

#' For how many genes did Deseq did not assign a p-value (due to outliers; see section 1.5.3 os [DEseq manual](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#count-outlier-detection))
Holger Brandl's avatar
Holger Brandl committed
802 803


Holger Brandl's avatar
Holger Brandl committed
804 805 806 807 808 809 810 811 812 813

# summary(results(dds, cooksCutoff=FALSE))

#' See "3.6 Approach to count outliers" in the [DESEQ2 manual](https://bioc.ism.ac.jp/packages/3.4/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf) for details about outlier handling
#'
#' Def: Cook's distance is a measure of how much a single sample is influencing the fitted coefficients for a gene, and a large value of Cook's distance is intended to indicate an outlier count
# par(mar=c(8,5,2,2))
# boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2)


Holger Brandl's avatar
Holger Brandl committed
814 815
deAnnot %>%
    filter(is.na(pvalue)) %>%
herseman's avatar
herseman committed
816
    mutate(contrast = paste(condition_1, "vs", condition_2)) %>%
817
    count(contrast) %>%
Holger Brandl's avatar
Holger Brandl committed
818
    n_as("genes_with_NA_pvalue") %>%
819
    kable(caption = "Failed comparsions by sample")
Holger Brandl's avatar
Holger Brandl committed
820 821


Lena Hersemann's avatar
Lena Hersemann committed
822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858
if ((deAnnot %>% filter(is.na(pvalue)) %>% nrow) > 0) {

    na_counts = norm_counts %>%
        as.data.frame() %>%
        rownames_to_column("ensembl_gene_id") %>%
        tbl_df %>%
        gather(replicate, norm_count, - ensembl_gene_id) %>%
        right_join(deAnnot %>%
            filter(is.na(pvalue)) %>%
            select(ensembl_gene_id, condition_1, condition_2)) %>%
    # glimpse
    # filter(ensembl_gene_id=="ENSMUSG00000000120")
        left_join(select(exDesign, replicate, condition)) %>%
        filter(condition_1 == condition | condition_2 == condition)

    na_counts %>%
        filter(ensembl_gene_id %in% unique(ensembl_gene_id)[1 : 6]) %>%
        ggplot(aes(condition, norm_count, label = replicate)) +
        geom_point() +
        geom_text() +
        facet_wrap(~ ensembl_gene_id, scales = "free_y") +
        ggtitle("exmple gene counts by replicate")

    na_counts %>%
        group_by(ensembl_gene_id) %>%
        filter(norm_count == max(norm_count)) %>%
        ggplot(aes(replicate)) +
        geom_bar() +
        coord_flip() +
        facet_grid(condition ~ ., scales = "free_y") +
        ylab("Genes without p-value in which sample has largest count") +
        ggtitle("Which replicates are causing most of the NAs")
}

########################################################################################################################
#' ## Hits Summary

Holger Brandl's avatar
Holger Brandl committed
859
# Extract hits from deseq results
Lena Hersemann's avatar
Lena Hersemann committed
860 861
degs = deAnnot %>% filter(is_hit)

862
# ggplot(degs, aes(paste(condition_1, "vs",  condition_2))) + geom_bar() +xlab(NULL) + ylab("# DBGs") + ggtitle("DBG count summary") + coord_flip()
Lena Hersemann's avatar
Lena Hersemann committed
863

Holger Brandl's avatar
Holger Brandl committed
864
# degs %>%
865
#     ggplot(aes(paste(condition_1, "vs",  condition_2), fill=c1_overex)) +
Holger Brandl's avatar
Holger Brandl committed
866 867 868 869 870 871 872
#     geom_bar(position="dodge") +
#     xlab(NULL) + ylab("# DBGs") +
#     ggtitle("DBG count summary by direction of expression") +
#     coord_flip()

# Export DBA genes
## disabled because we just subset the annotated data now to define degs
873
#degsAnnot = degs %>%
Holger Brandl's avatar
Holger Brandl committed
874 875 876 877
#    inner_join(normCounts) %>%
#    left_join(geneInfo) %>%
#    left_join(bindCats)
degs %>% write_tsv(paste0(resultsBase, "diffex_genes.txt"))
878
#degs = read_tsv(paste0(resultsBase, "diffex_genes.txt"))
Holger Brandl's avatar
Holger Brandl committed
879 880 881 882 883 884
#' [Differentially Expressed Genes](`r paste0(resultsBase, "diffbind_genes.txt")`)


#unloadNamespace("dplyr"); require(dplyr)

## export slim hit list for downstream enrichment analysis
885
degs %>%
herseman's avatar
herseman committed
886
    transmute(ensembl_gene_id, contrast = paste(condition_1, "vs", condition_2)) %>%
887
    write_tsv(paste0(resultsBase, "degs_by_contrast.txt"))
Holger Brandl's avatar
Holger Brandl committed
888

889
degs %>%
890
    transmute(ensembl_gene_id, contrast = paste(condition_1, if_else(c1_overex, ">", "<"), condition_2)) %>%
891 892
    write_tsv(paste0(resultsBase, "degs_by_contrast_directed.txt"))

Holger Brandl's avatar
Holger Brandl committed
893
## also export gsea inputs
894 895 896 897 898 899
#deAnnot %>%
#    mutate(contrast = paste(condition_1, "vs", condition_2)) %>%
#    filter(! is.na(pvalue)) %>%
#    transmute(contrast, ensembl_gene_id, rank_score = - log10(pvalue)) %>% #count(is.na(rank_score))
##    arrange(contrast, rank_score) %>%
#    write_tsv(paste0(resultsBase, "gsea__genes_by_contrast__undirected.txt"))
Holger Brandl's avatar
Holger Brandl committed
900

901 902 903
#deAnnot %>%
#    mutate(contrast = paste(condition_1, "vs", condition_2)) %>%
#    filter(! is.na(pvalue)) %>%
904
#    transmute(contrast, ensembl_gene_id, rank_score = - log10(pvalue) * ifelse(c1_overex, 1, - 1)) %>%
905 906
##    arrange(contrast, rank_score) %>%
#    write_tsv(paste0(resultsBase, "gsea__genes_by_contrast__directed.txt"))
Holger Brandl's avatar
Holger Brandl committed
907 908 909 910 911 912 913 914 915


# ## render interactive hit browser
# #+results='asis'
# degs %>%
#     left_join(geneInfo) %>%
#     mutate(
#         igv=paste0("<a href='http://localhost:60151/goto?locus=", chromosome_name,":", start_position, "-", end_position, "'>IGV</a>")
#     ) %>%
916
#     select(c1_over_c2_logfc, pvalue, ensembl_gene_id, condition_1, condition_2, external_gene_name, description, igv) %>%
Holger Brandl's avatar
Holger Brandl committed
917 918 919
# #    kable("html", table.attr = "class='dtable'", escape=F)
#     datatable(escape=F)

Lena Hersemann's avatar
Lena Hersemann committed
920
#+ fig.height=nrow(contrasts)/2+2, eval=nrow(degs)>0
921 922
#ggplot(degs, aes(paste(condition_1, "vs",  condition_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DEGs by contrast") + coord_flip()
ggplot(degs, aes(paste(condition_1, "vs", condition_2), fill = (c1_overex))) +
923 924 925 926 927
    geom_bar() +
    xlab(NULL) +
    ylab("# DGEs") +
    ggtitle("DEGs by contrast") +
    coord_flip()
Holger Brandl's avatar
Holger Brandl committed
928

929
#with(degs, as.data.frame(table(condition_1, condition_2, c1_overex))) %>% filter(Freq >0) %>% kable()
930
degs %>%
931
    count(condition_1, condition_2, c1_overex) %>%
Holger Brandl's avatar
Holger Brandl committed
932
    mutate(c1_overex = ifelse(c1_overex, ("Up in condition_1"), "Down in condition_1")) %>%
933
    spread(c1_overex, n) %>%
934
    kable()
Holger Brandl's avatar
Holger Brandl committed
935 936 937 938 939 940 941


#'  DEGs (differentially expressed genes) are contained in the following table


#' DEGs can be browsed in Excel using the exported table or via the embedded table browser. To enable the IGV links, you need to set the port in the IGV preferences to 3334.
#+ results='asis'
Holger Brandl's avatar
Holger Brandl committed
942 943
# if (nrow(degs) < 3000) {
degs %>%
944 945
#inner_join(geneLoci) %>%
    mutate(
946 947
    ensembl = paste0("<a href='http://www.ensembl.org/Multi/Search/Results?y=0;site=ensembl_all;x=0;page=1;facet_feature_type=Gene;q=", ensembl_gene_id, "'>", ensembl_gene_id, "</a>"),
    igv = paste0("<a href='http://localhost:60151/goto?locus=", chromosome_name, ":", start_position, "-", end_position, "'>IGV</a>")
948
    ) %>%
949
    select(condition_1, condition_2, c1_over_c2_logfc, pvalue, padj, ensembl_gene_id, external_gene_name, description) %>%
Holger Brandl's avatar
Holger Brandl committed
950
#  kable("html", table.attr = "class='dtable'", escape=F)
951
    datatable(escape = F)
Holger Brandl's avatar
Holger Brandl committed
952 953 954
# }


Holger Brandl's avatar
Holger Brandl committed
955 956 957

## just needed to restore environment easily
#save(degs, file=".degs.RData")
958
# degs = local(get(load(".degs.RData")))
Holger Brandl's avatar
Holger Brandl committed
959 960


Holger Brandl's avatar
Holger Brandl committed
961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980
# ########################################################################################################################
# #' ## MA Plots
#
# ## todo round of MA should be enough.
#
# # Redo MA plots but now including hit overlays
# maPlots = deResults %>%
#     group_by(condition_1, condition_2) %>%
#     do(gg = { maData <- .
#         ## todo why not s2_over_c1_log2fc
#         maData %>% ggplot(aes(0.5 * log2(norm_count_1 * norm_count_2), log2(norm_count_1 / norm_count_2), color = is_hit)) +
#             geom_point(alpha = 0.3) +
#             geom_hline(yintercept = 0, color = "red") +
#             ggtitle(with(maData[1,], paste(condition_1, "vs", condition_2)))
#     }) %$% gg
#
# #+ fig.height=6*ceiling(length(maPlots)/2)
# multiplot(plotlist = maPlots, cols = min(2, length(maPlots)))
#
#
981

Holger Brandl's avatar
Holger Brandl committed
982
#' Extract most significantly changed genes and display their gene-centered rlog-normalized expression
983 984 985 986 987 988 989 990 991 992

## see http://www.bioconductor.org/help/workflows/rnaseqGene/
# assay(rld) %>% head
# load_pack("genefilter")
# load_pack("pheatmap")
mostSig <- deAnnot %>% arrange(padj) %>% head(20) %$% ensembl_gene_id
mat <- assay(rld) %>%
    as.data.frame %>%
    rownames_to_column("ensembl_gene_id") %>%
    filter(ensembl_gene_id %in% mostSig) %>%
993
# as.data.frame() %>%
994 995 996 997 998
    column2rownames("ensembl_gene_id") %>%
    as.matrix()
mat <- mat - rowMeans(mat)
pheatmap(mat, annotation_col = column2rownames(exDesign, "replicate") %>% select(- col_index))

999 1000 1001 1002 1003 1004 1005 1006 1007 1008

########################################################################################################################
#' ## Exported Data
#'
#' | File | Description |
#' |------|------|
#' | [diffex_genes.txt](diffex_genes.txt) | list of all differentially expressed genes from the DESeq analysis - That's the file you are most likely looking for! |
#' | [de_results.txt](de_results.txt) | list of all genes from the DESeq analysis |
#' | [fpkms_by_condition.txt](fpkms_by_condition.txt) | fpkm values of the different conditions |
#' | [fpkms_by_replicate.txt](fpkms_by_replicate.txt) | fpkm values of individual replicates |
Holger Brandl's avatar
Holger Brandl committed
1009
#' | [tpms_by_condition.txt](tpms_by_condition.txt) | tpm values of the different conditions |
1010 1011 1012 1013 1014 1015
#' | [tpms_by_replicate.txt](tpms_by_replicate.txt) | tpm values of individual replicates |
#' | [geneInfo.txt](geneInfo.txt) | general gene information  |
#'
#' Further informtion on [FPKM and TPM](http://www.rna-seqblog.com/rpkm-fpkm-and-tpm-clearly-explained/)


1016 1017
##################################
# get R version and package infos
1018 1019
# writeLines(capture.output(sessionInfo()), ".sessionInfo.txt")
writeLines(capture.output(devtools::session_info()), ".sessionInfo.txt")
1020

1021
# save copy of the used featcounts_deseq_mf.R script
1022
system("cp ${NGS_TOOLS}/dge_workflow/featcounts_deseq_mf.R ./.ngs_tools_featcounts_deseq_mf_$(date +%Y%m%d_%H%M).R")
1023

1024 1025
#################################

1026
session::save.session(".featcounts_deseq_mf.dat");
1027
# session::restore.session(".featcounts_deseq_mf.dat");
herseman's avatar
herseman committed
1028