featcounts_deseq_mf.R 43 KB
Newer Older
Holger Brandl's avatar
Holger Brandl committed
1 2 3 4 5 6
#!/usr/bin/env Rscript
#+ include=FALSE


suppressMessages(require(docopt))

7
doc = '
Holger Brandl's avatar
Holger Brandl committed
8 9 10 11
Perform a differntial gene expression analysis using deseq2
Usage: featcounts_deseq_mf.R [options] <count_matrix> <design_matrix>

Options:
Holger Brandl's avatar
Holger Brandl committed
12 13 14 15 16
--contrasts=<tab_delim_table>   Table with sample pairs for which dge analysis should be performed
--qcutoff <qcutoff>             Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01]
--pcutoff <pcutoff>             Override q-value filter and filter by p-value instead
--min_count <min_count>         Minimal expression in any of the sample to be included in the final result list [default: 10]
--out <name_prefix>             Name to prefix all generated result files [default: ]
herseman's avatar
herseman committed
17
--design <formula>              Design fomula for DeSeq with contrast attribute at the end [default: condition]
18
--lfc <lfc_cutoff>              Just report genes with abs(lfc) > lfc_cutoff as hits [default: 1.0]
19
--ensembl_db <ensembl_db>       Ensebmbl db to be used. If not specified inferred from data. TODO implement NONE here!!
20 21
--geneInfo <info_file>          Gene information file downloaded from the ensembl website if bioMart version is not present
--betaprior <boolean>           Apply betaprior when running DESeq2 [default: TRUE]
Holger Brandl's avatar
Holger Brandl committed
22 23
'

24
#commandArgs <- function(x) c("--design", "'batch+condition'", "--contrasts", "../contrasts.txt star_counts_matrix_reduced_sampleset.txt", "basic_design_reduced_sampleset.txt")
25
#commandArgs <- function(x) c("--design", "'condition'", "--contrasts", "../contrasts.txt", "--geneInfo", "../mart_export_CHOK1GS_HDv1_v90.txt",  "../alignments/star_counts_matrix.txt", "../basic_design.txt")
herseman's avatar
herseman committed
26

27
opts = docopt(doc, commandArgs(TRUE))
Holger Brandl's avatar
Holger Brandl committed
28 29


Holger Brandl's avatar
Holger Brandl committed
30 31
## load some packages first because of name-space order
library(knitr)
32 33
#source("https://bioconductor.org/biocLite.R")
#biocLite("DESeq2")
Holger Brandl's avatar
Holger Brandl committed
34
library(DESeq2)
35
#load_pack(readr)
Holger Brandl's avatar
Holger Brandl committed
36 37


Holger Brandl's avatar
Holger Brandl committed
38 39
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.45/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.45/R/ggplot_commons.R")
40

Holger Brandl's avatar
Holger Brandl committed
41 42 43
## also load common helper for expression data analysis
# source(interp_from_env("${NGS_TOOLS}/dge_workflow/diffex_commons.R"))
devtools::source_url("https://git.mpi-cbg.de/bioinfo/ngs_tools/raw/v6/dge_workflow/diffex_commons.R")
Holger Brandl's avatar
Holger Brandl committed
44

Holger Brandl's avatar
Holger Brandl committed
45 46

## process input arguments
47 48 49
count_matrix_file = opts$count_matrix
design_matrix_file = opts$design_matrix
contrasts_file = opts$contrasts
50 51
use_betaPrior = as.logical(opts$betaprior)
gene_info_file = opts$geneInfo
Holger Brandl's avatar
Holger Brandl committed
52

53
resultsBase = if (str_length(opts$out) > 0) paste0(opts$out, ".") else ""
Holger Brandl's avatar
Holger Brandl committed
54

55 56 57 58 59
pcutoff = if (is.null(opts$pcutoff))NULL else as.numeric(opts$pcutoff)
qcutoff = if (is.numeric(pcutoff))NULL else as.numeric(opts$qcutoff)
if (is.numeric(pcutoff))opts$qcutoff = NULL

lfc_cutoff = if (is.null(opts$lfc))0 else as.numeric(opts$lfc)
Holger Brandl's avatar
Holger Brandl committed
60

Holger Brandl's avatar
Holger Brandl committed
61
## extract the sample for the design
62
designFormula = opts$design
Holger Brandl's avatar
Holger Brandl committed
63
## consider last element of design formula as sample attribute
64 65 66
contrastAttribute = str_split(designFormula, "[+]") %>%
    unlist() %>%
    tail(1)
67 68
# assert(designFormula == "condition" || ! is.null(design_matrix_file), "custom designs are not supported without a design matrix")
assert(str_detect(designFormula, ".*condition$"))
Holger Brandl's avatar
Holger Brandl committed
69

Holger Brandl's avatar
Holger Brandl committed
70 71

#' # Differential Expression Analysis
Holger Brandl's avatar
Holger Brandl committed
72 73

#' Run configuration was
74 75 76
vec_as_df(unlist(opts)) %>%
    filter(! str_detect(name, "^[<-]")) %>%
    kable()
Holger Brandl's avatar
Holger Brandl committed
77

Holger Brandl's avatar
Holger Brandl committed
78 79 80 81 82 83 84 85 86
########################################################################################################################
#' ## Data Preparation

#' The detection of differentially expressed genes is performed using the R package DEseq2 (http://bioconductor.org/packages/release/bioc/html/DESeq2.html), which requires a count matrix as input.
#' The count data are presented as a table which reports, for each sample, the number of sequence fragments that have been assigned to each gene.
#' An important analysis question is the quantification and statistical inference of systematic changes between conditions, as compared to within-condition variability.

#' Working Directory: `r getwd()`

87
countData = read_tsv(count_matrix_file)
Holger Brandl's avatar
Holger Brandl committed
88 89

## save a backup into the current working directory as a reference
Holger Brandl's avatar
Holger Brandl committed
90
countData %>% write_tsv(paste0(resultsBase, "input_counts.txt"))
Holger Brandl's avatar
Holger Brandl committed
91 92

#' Structure of input matrix was
93
glimpse(countData, width = 60)
Holger Brandl's avatar
Holger Brandl committed
94

95 96 97
countMatrix = countData %>%
    column2rownames("ensembl_gene_id") %>%
    as.matrix()
Holger Brandl's avatar
Holger Brandl committed
98 99

# Expression Filtering
100
genesBefore = nrow(countMatrix)
Lena Hersemann's avatar
Lena Hersemann committed
101
countMatrix %<>% filterByExpression(as.integer(opts$min_count))
102
genesAfter = nrow(countMatrix)
Lena Hersemann's avatar
Lena Hersemann committed
103

Holger Brandl's avatar
Holger Brandl committed
104 105 106 107 108 109 110 111
#' Counts were filtered to only retain genes which had more that `r opts$min_count` alignments in at least one replicate. This reduced the number of genes from `r genesBefore` to `r genesAfter`.


########################################################################################################################

#' DESeq is an R package to analyse count data from high-throughput sequencing assays such as RNA-Seq and test for differential expression. The latest version, DESeq2, is used.
#' DESeq uses a model based on the negative binomial distribution and offers the following features:
#' Count data is discrete and skewed and is hence not well approximated by a normal distribution. Thus, a test based on the negative binomial distribution, which can reflect these properties, has much higher power to detect differential expression.
112
#' Tests for differential expression between experimental conditions should take into account both technical and biological variability. Recently, several authors have claimed that the Poisson distribution can be used for this purpose. However, tests based on the Poisson assumption (this includes the binomial test and the chi-squared test) ignore the biological sampling variance, leading to incorrectly optimistic p values. The negative binomial distribution is a generalisation of the Poisson model that allows to model biological variance correctly.
Holger Brandl's avatar
Holger Brandl committed
113 114 115 116 117 118 119
#' DESeq estimate the variance in a local fashion, using different coefficients of variation for different expression strengths. This removes potential selection biases in the hit list of differentially expressed genes, and gives a more balanced and accurate result.

#' See deseq reference [docs](http://master.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf) for details

#' To understand fold-change shrinking and estimation check out
#' [Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2](http://genomebiology.com/2014/15/12/550/abstract)

Holger Brandl's avatar
Holger Brandl committed
120
## todo this is required now , so why all the checking
121
if (! is.null(design_matrix_file)) {
122
    replicates2samples = read_tsv(design_matrix_file) #%>% select(replicate, sample)
123
}else {
herseman's avatar
herseman committed
124
    warning("DEPRECATED extracting conditions from replicates is bad practise and a design file should be provided instead")
125
    get_sample_from_replicate = function(repName) str_match(repName, "(.*)_([R]|rep)?[0-9]{1,2}$")[, 2]
Holger Brandl's avatar
Holger Brandl committed
126

herseman's avatar
herseman committed
127
    replicates2samples = data_frame(replicate = colnames(countMatrix)) %>% mutate(condition = get_sample_from_replicate(replicate)) # %>% distinct_all()
Holger Brandl's avatar
Holger Brandl committed
128
}
Holger Brandl's avatar
Holger Brandl committed
129 130

# Define or load a contrasts matrix
131
if (! is.null(contrasts_file)) {
132
    contrasts = read_tsv(contrasts_file)
133 134
}else {
    assert(! is.null(replicates2samples), "Can not auto-create contrasts without design matrix") ## cant work yet because sample column with have random name
herseman's avatar
herseman committed
135 136
    contrasts = distinct_all(replicates2samples, condition) %>%
        select(condition) %>%
137
        merge(., ., suffixes = c("_1", "_2"), by = NULL) %>%
herseman's avatar
herseman committed
138
        filter(ac(condition_1) > ac(condition_2)) %>%
139
    #        filter(ac(condition_1)!=ac(condition_2)) %>%
Holger Brandl's avatar
Holger Brandl committed
140
        fac2char()
Holger Brandl's avatar
Holger Brandl committed
141 142

    write_tsv(contrasts, paste(resultsBase, "contrasts.txt"))
Holger Brandl's avatar
Holger Brandl committed
143 144
}

Holger Brandl's avatar
Holger Brandl committed
145 146

#' The used design matrix is
Holger Brandl's avatar
Holger Brandl committed
147
datatable(replicates2samples)
Holger Brandl's avatar
Holger Brandl committed
148

Holger Brandl's avatar
Holger Brandl committed
149 150 151 152 153
#' The used design formula is.
designFormula

#' For more information about designs see section 1.6 "multi-factor designs" in [deseq-manual](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf)

Holger Brandl's avatar
Holger Brandl committed
154
#' The contrasts of interest are
Holger Brandl's avatar
Holger Brandl committed
155 156 157 158 159 160
contrasts %>% kable()

## gene specific factors
#normalizationFactors(dds)

## sizeFactor R help:sigEnrResults
161 162
#dds = makeExampleDESeqDataSet()
#dds = estimateSizeFactors( dds )
Holger Brandl's avatar
Holger Brandl committed
163 164 165
#sizeFactors(dds)

## see "3.11 Sample-/gene-dependent normalization factors" in the DEseq2 manual for details
Holger Brandl's avatar
Holger Brandl committed
166 167

## old non-mf approach
168
#exDesign = data.frame(condition=colnames(countMatrix) %>% get_sample_from_replicate)
Holger Brandl's avatar
Holger Brandl committed
169 170

## new mf-approach
171 172
exDesign = data_frame(replicate = colnames(countMatrix)) %>%
    mutate(col_index = row_number()) %>%
173
    right_join(replicates2samples, by = "replicate") %>%
174
    arrange(col_index) #%>% transmute(condition=condition_2ndwt) %>% fac2char
Holger Brandl's avatar
Holger Brandl committed
175

176 177
assert(!any(is.na(exDesign$col_index)), "assay design file is not compatible to count matrix column names")

178
exDesign %>% ggplot(aes(condition)) + geom_bar() + ylab("# samples") + coord_flip() + ggtitle("samples per condition")
179

Holger Brandl's avatar
Holger Brandl committed
180
## infer the sample to be used from the formula
181
#designFormula = "condition_2ndwt + batch"
Holger Brandl's avatar
Holger Brandl committed
182 183


184
#exDesign %<>% rename(sample, condition_2ndwt)
Holger Brandl's avatar
Holger Brandl committed
185

186
#exDesign = replicates2samples %>%  arrange(colnames(countMatrix))
Holger Brandl's avatar
Holger Brandl committed
187 188

# valiadate that contrasts model is compatible with data
189 190
assert(setequal(names(contrasts), c("condition_1", "condition_2")), "contrasts matrix has incorrect column headers. Expected: condition_1, condition_2")

191 192 193 194 195 196
designSamples = as_df(exDesign)[, contrastAttribute] %>%
    unique %>%
    sort
contrastSamples = contrasts %>% gather %$% value %>%
    unique %>%
    sort
197

198 199 200
if (! all(contrastSamples %in% designSamples)) {
    warning("design   : ", paste(designSamples, collapse = ", "))
    warning("contrasts: ", paste(contrastSamples, collapse = ", "))
Holger Brandl's avatar
Holger Brandl committed
201
    stop("experiment design is not consistent with contrasts")
Holger Brandl's avatar
Holger Brandl committed
202 203
}
#stopifnot(all((contrasts %>% gather %$% value %>% unique) %in% exDesign$condition))
Holger Brandl's avatar
Holger Brandl committed
204

Holger Brandl's avatar
Holger Brandl committed
205
#http://www.cookbook-r.com/Formulas/Creating_a_formula_from_a_string/
206 207 208
dds = DESeqDataSetFromMatrix(countMatrix, exDesign, formula(as.formula(paste("~", designFormula))))
#dds = estimateSizeFactors(dds)
#dds = estimateDispersions(dds)
209 210 211
# dds = DESeq(dds, parallel = T)

dds = DESeq(dds, parallel = T, betaPrior = use_betaPrior)
Holger Brandl's avatar
Holger Brandl committed
212

213

Holger Brandl's avatar
Holger Brandl committed
214 215


Holger Brandl's avatar
Holger Brandl committed
216 217 218
########################################################################################################################
#' ## Quality Control

Holger Brandl's avatar
Holger Brandl committed
219 220 221 222
#' ### Size Factors

##' Size Factors estimated by DESeq
sizeFactors(dds) %>%
Holger Brandl's avatar
Holger Brandl committed
223 224
#    set_names(colnames(countMatrix)) %>%
    vec_as_df("replicate", "size_factor") %>%
225 226 227 228
    ggplot(aes(replicate, size_factor)) +
    geom_bar(stat = "identity") +
    ggtitle("deseq size factors") +
    coord_flip()
Holger Brandl's avatar
Holger Brandl committed
229 230 231 232 233 234 235 236

# From DESeq manual: The regularized log transformation is preferable to the variance stabilizing transformation if the size factors vary widely.


#' For details about size factor normalziation and calculation see https://www.biostars.org/p/79978/

##' From the DESeq docs about how size factors are used: The sizeFactors vector assigns to each column of the count matrix a value, the size factor, such that count values in the columns can be brought to a common scale by dividing by the corresponding size factor. Counts are divied by size factors.

Holger Brandl's avatar
Holger Brandl committed
237 238 239 240 241 242 243
#' ### Data Dispersion

#' Plot of the per-gene dispersion estimates together with the fitted mean-dispersion relationship.
#' The dispersion plot shows how the estimates are shrunk from the gene wise values (black dots) toward the fitted estimates, with the final values used in testing being the blue dots.
#' The dispersion can be understood as the square of the coefficient of biological variation.  So, if a gene's expression typically differs from replicate to replicate sample by 20%, this gene's dispersion is: .20^2 = .04.
## The function estimateDispersions performs three steps. First, it estimates, for each gene and each (replicated) condition, a dispersion value, then, it fits, for each condition, a curve through the estimates. Finally, it assigns to each gene a dispersion value, using either the estimated or the fitted value.
#' The estimation of dispersions performs three steps. First, it estimates, for each gene and each (replicated) condition, a dispersion value, then, it fits, for each condition, a curve through the estimates. Finally, it assigns to each gene a dispersion value, using either the estimated or the fitted value.
244
plotDispEsts(dds, main = "Dispersion plot")
245

Holger Brandl's avatar
Holger Brandl committed
246 247


Holger Brandl's avatar
Holger Brandl committed
248 249 250 251
########################################################################################################################
#' ### PCA and Clustering

#' In order to assess overall similarity between samples two common statistical methods are used - Principal component analysis (PCA) and clustering.
herseman's avatar
herseman committed
252
#' This should provide answers to the questions: Which samples are similar to each other, which are different? Does this fit to the expectation from the experiment's design?
Holger Brandl's avatar
Holger Brandl committed
253 254 255 256 257 258 259
#' Using a regularized log transformation of the raw counts provides the advantage that it stabilizes the variance across the mean.


#' Principal component analysis (PCA) is a technique used to emphasize variation and bring out strong patterns in a dataset.
#' Principal components are the underlying structure in the data. They are the directions where there is the most variance, the directions where the data is most spread out. The data points/samples are projected onto the 2D plane such that they spread out in the two directions that explain most of the differences. The x-axis is the direction that separates the data points the most and the y-axis is a direction that separates the data the second most.

# Regularized log transformation for clustering/heatmaps, etc
260
rld = rlogTransformation(dds)
261
#varstab = vst(dds) ## vst is supposed to be much 1k x faster
Holger Brandl's avatar
Holger Brandl committed
262

263 264 265 266
install_package("session")
session::save.session(".fc_debug.dat");
# session::restore.session(".fc_debug.dat");

Holger Brandl's avatar
Holger Brandl committed
267
#plotPCA(rld, intgroup = "sample")
268
# plotPCA(rld, intgroup = "batch")
269
# plotPCA(rld, intgroup = "run")
270 271 272 273 274 275 276
#designFormula %>%
#    str_split("[+]") %>%
#    unlist %>%
#    map(~ plotPCA(rld, intgroup = .x) +
#        ggtitle(paste("effect of ", .x)) +
#        guides(color = guide_legend(title = .x))) %>%
#    walk(print)
277
# plotPCA(rld, intgroup = contrastAttribute)
Holger Brandl's avatar
Holger Brandl committed
278 279


280 281 282
load_pack(limma)
load_pack(stringi)
load_pack(htmltools)
283 284
load_pack(plotly)
# install.packages("htmlwidgets")
285 286 287 288 289 290 291 292 293 294 295 296 297 298

# extract rlogTransformation normalized count data
norm_counts <- assay(rld)

#


#for (i in {designFormula %>% str_split("[+]") %>% unlist}) {
#    condition = pull(exDesign, i)
#    names(condition) = exDesign$replicate
#    # suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print))
#    makePcaPlot(t(norm_counts), color_by = condition, scale = T, title = "DESeq normalized count data WITHOUT batch correction") %>% ggplotly() %>% print
#}

Lena Hersemann's avatar
Lena Hersemann committed
299
effects = designFormula %>% str_split("[+]") %>% unlist
300 301 302
# htmltools::tagList(lapply(effects, function(batchFactor) {
#     condition = pull(exDesign, batchFactor)
#     names(condition) = exDesign$replicate
Lena Hersemann's avatar
Lena Hersemann committed
303 304
#    suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print))
# makePcaPlot(t(norm_counts), color_by = condition, scale = T, title = "DESeq normalized count data WITHOUT batch correction") %>% ggplotly()
305 306 307 308 309 310 311 312 313 314
# }))


htmltools::tagList(lapply(effects, function(batchFactor) {
    # batchFactor=effects[1]
    condition = pull(exDesign, batchFactor)
    names(condition) = exDesign$replicate
    # suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print))
    {
        makePcaPlot(t(norm_counts), color_by = condition, scale = T, title = "DESeq normalized count data WITHOUT batch correction") +
Lena Hersemann's avatar
Lena Hersemann committed
315
        guides(color = guide_legend(title = batchFactor))
316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331
    } %>% ggplotly()
}))




# res <- lapply(1:3, function(i) {
#     fig <- list(data=list(list(x=rnorm(100), type="histogram")), layout=list(title=paste(c('plot', i, sep=" "))))
#     fig %>% offline
# })
# htmltools::tagList(res)




# extract batch effect variables from the design formula and the corresponding data from the exDesign data.frame
Lena Hersemann's avatar
Lena Hersemann committed
332 333 334 335 336
adjvar <- designFormula %>%
    str_split("[+]") %>%
    unlist %>%
    stri_subset_regex("condition", negate = T)
adjvar_data <- data.frame(exDesign[, which(names(exDesign) %in% head(adjvar))])
337 338 339



Lena Hersemann's avatar
Lena Hersemann committed
340 341 342 343 344
if (length(adjvar_data) == 1) {
    corr_data <- removeBatchEffect(norm_counts, batch = adjvar_data[, 1])
}else if (length(adjvar_data) == 2) {
    corr_data <- removeBatchEffect(norm_counts, batch = adjvar_data[, 1], batch2 = adjvar_data[, 2])
}else if (length(adjvar_data) > 2) {
345
    print ("Please note, the number of adjustment variables exceeds the maximal number of removable batch effects.")
Lena Hersemann's avatar
Lena Hersemann committed
346
}else {
347 348 349 350 351 352 353 354 355 356 357 358
    print ("The design formula does not contain an adjustment variable for batch effect correction")
}


# for (i in colnames(adjvar_data)) {
#for (i in {designFormula %>% str_split("[+]") %>% unlist}) {
#    condition = pull(exDesign, i)
#    names(condition) = exDesign$replicate
#    # suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print))
#    makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print
#}

Lena Hersemann's avatar
Lena Hersemann committed
359 360 361 362 363 364 365 366 367 368 369 370 371 372
if (exists("corr_data")) {
    htmltools::tagList(lapply(effects, function(batchFactor) {
        # batchFactor=effects[1]
        condition = pull(exDesign, batchFactor)
        names(condition) = exDesign$replicate
        # suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print))
        {
            makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") +
            guides(color = guide_legend(title = batchFactor))
        } %>% ggplotly()
    }))
}else {
    print("No batch correction was done, as the design formula does not contain an adjustment variable for batch effect correction")
}
373 374


Holger Brandl's avatar
Holger Brandl committed
375 376 377 378
#' The Euclidean distance between samples are calculated after performing the regularized log transformation.
#' Using the calculated distance matrix, the samples are projected on a two-dimensional graph such that the distance between samples approximately corresponds to the biological coefficient of variation between those samples.
#' More information can be found here: https://en.wikipedia.org/wiki/Principal_component_analysis

379 380 381 382 383
#distsRL = dist(t(assay(rld)))
#hc = hclust(distsRL)
#distMatrix = as.matrix(distsRL)
distMatrix = as.matrix(dist(t(assay(rld))))
#rownames(distMatrix) = colnames(distMatrix) = with(exDesign(dds), paste(condition, sep=" : "))
Holger Brandl's avatar
Holger Brandl committed
384

Holger Brandl's avatar
Holger Brandl committed
385
countData %>% distinct(ensembl_gene_id)
386
labelcntData = countData %>%
387 388 389 390 391
    distinct_all(ensembl_gene_id) %>%
    column2rownames("ensembl_gene_id") %>%
    data.matrix() %>%
    round() %>%
    data.frame()
Holger Brandl's avatar
Holger Brandl committed
392

393 394 395
# load_pack(gplots)
# heatmap.2(distMatrix, labRow = colnames(labelcntData), labCol = colnames(labelcntData),
# symm = TRUE, trace = "none",
396
#key=F, col=colorpanel(100, "black", "white"),
397
# margin = c(8, 8), main = "Sample Distance Matrix")
Holger Brandl's avatar
Holger Brandl committed
398

399 400 401
rownames(distMatrix) = colnames(labelcntData)
colnames(distMatrix) = colnames(labelcntData)
load_pack(d3heatmap)
Holger Brandl's avatar
Holger Brandl committed
402 403
## todo hide column dendrogram but keep cluster-order
#distMatrix %>% d3heatmap(xaxis_height=1, Colv = T, dendrogram="row")
Lena Hersemann's avatar
Lena Hersemann committed
404
print("DESeq normalized count data WITHOUT batch correction")
405
distMatrix %>% d3heatmap(xaxis_height = 1)
Holger Brandl's avatar
Holger Brandl committed
406

Holger Brandl's avatar
Holger Brandl committed
407

Lena Hersemann's avatar
Lena Hersemann committed
408 409 410 411 412 413 414
if (exists("corr_data")) {
    print("DESeq normalized count data WITH batch correction")
    corr_distMatrix = as.matrix(dist(t(corr_data)))
    corr_distMatrix %>% d3heatmap(xaxis_height = 1)
}else {
    # print("No batch correction was done, as the design formula does not contain an adjustment variable for batch effect correction")
}
Holger Brandl's avatar
Holger Brandl committed
415

416 417
## see http://www.bioconductor.org/help/workflows/rnaseqGene/
## DEBUG-START
418 419 420 421
# if (F) {
assay(rld) %>% head
load_pack("genefilter")
load_pack("pheatmap")
422 423
mostSIg <- head(order(rowVars(assay(rld)), decreasing = TRUE), 20)
mat <- assay(rld)[mostSIg,]
424 425 426 427 428 429 430
mat <- mat - rowMeans(mat)

# df <- as.data.frame(colData(rld)[,c("cell","dex")])
#pheatmap(mat, annotation_col=df)
# pheatmap(mat)
pheatmap(mat, annotation_col = column2rownames(exDesign, "replicate") %>% select(- col_index))
# }
431 432
## DEBUG-END

Holger Brandl's avatar
Holger Brandl committed
433

Holger Brandl's avatar
Holger Brandl committed
434 435 436 437 438
########################################################################################################################
#' # Perform Differential Expression Analysis


#' Run Deseq Test
439 440 441
#dds = DESeq(dds, fitType='local', betaPrior=FALSE)
#dds = DESeq(dds, fitType='local')
#dds = DESeq(dds)
Holger Brandl's avatar
Holger Brandl committed
442

443
#res = results(dds)
Holger Brandl's avatar
Holger Brandl committed
444 445 446
#resultsNames(dds)

#' Model Overview:
Holger Brandl's avatar
Holger Brandl committed
447

Holger Brandl's avatar
Holger Brandl committed
448 449
summary(results(dds))

Holger Brandl's avatar
Holger Brandl committed
450 451 452 453
# summary(results(dds, cooksCutoff=FALSE))

#' See "3.6 Approach to count outliers" in the [DESEQ2 manual](https://bioc.ism.ac.jp/packages/3.4/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf) for details about outlier handling
#'
herseman's avatar
herseman committed
454
#' Def: Cook's distance is a measure of how much a single sample is influencing the fitted coefficients for a gene, and a large value of Cook's distance is intended to indicate an outlier count
Holger Brandl's avatar
Holger Brandl committed
455 456 457
# par(mar=c(8,5,2,2))
# boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2)

Holger Brandl's avatar
Holger Brandl committed
458 459

## extract all de-sets according to our contrasts model
460 461
# install_package("plyr")
# load_pack(IHW) --> todo reenable once installation on macos is fixed (see https://support.bioconductor.org/p/92731/)
462

Holger Brandl's avatar
Holger Brandl committed
463 464 465 466
# low count filtering https://support.bioconductor.org/p/65256/
# independent filtering occurs within results() to save you from multiple test correction on genes with no power
# (see ?results and the vignette section about independent filtering, or the paper). T

467
#deResults = alply(contrasts, 1,  splat(function(condition_1, condition_2){
herseman's avatar
herseman committed
468
deResults = plyr::alply(contrasts, 1, plyr::splat(function(condition_1, condition_2){
Lena Hersemann's avatar
Lena Hersemann committed
469
    #DEBUG condition_1=as_df(contrasts)[1,1]; condition_2=as_df(contrasts)[1,2]
470
    # results(dds, contrast=c(contrastAttribute, condition_1, condition_2)) %>%
471 472
    # results(dds, contrast = c(contrastAttribute, condition_1, condition_2), lfcThreshold = lfc_cutoff, filterFun = ihw) %>%
    results(dds, contrast = c(contrastAttribute, condition_1, condition_2), lfcThreshold = lfc_cutoff) %>%
473 474
        as_tibble() %>%
        tibble::rownames_to_column("ensembl_gene_id") %>%
Holger Brandl's avatar
Holger Brandl committed
475
    ## see http://rpackages.ianhowson.com/bioc/DESeq2/man/results.html when using contrasts argument
476
        rename(c1_over_c2_logfc = log2FoldChange) %>%
herseman's avatar
herseman committed
477
        mutate(condition_1 = condition_1, condition_2 = condition_2) #%>% head %>% print_head
478
})) %>% bind_rows
Holger Brandl's avatar
Holger Brandl committed
479

Holger Brandl's avatar
Holger Brandl committed
480

481 482
# deResults = adply(contrasts, 1,  splat(function(condition_1, condition_2){
#  echo('samples', condition_1, condition_2)
Holger Brandl's avatar
Holger Brandl committed
483 484 485
#   return(data.frame())
# }))

486
deResults %>% ggplot(aes(c1_over_c2_logfc)) +
487
    geom_histogram(binwidth = 0.1) +
herseman's avatar
herseman committed
488
    facet_grid(condition_1 ~ condition_2) +
489 490
    geom_vline(xintercept = 0, color = "blue") +
    xlim(- 3, 3) +
491
    ggtitle("condition_1 over condition_2 logFC ")
Holger Brandl's avatar
Holger Brandl committed
492 493


Holger Brandl's avatar
Holger Brandl committed
494

Holger Brandl's avatar
Holger Brandl committed
495 496 497 498 499 500
########################################################################################################################
#' ## Significnce of differential binding

#deResults %>% ggplot(aes(pvalue)) +
#    geom_histogram() +
#    xlim(0,1) +
501
#    facet_grid(condition_1 ~ condition_2) + geom_vline(xintercept=0.01, slope=1, color="blue") +
Holger Brandl's avatar
Holger Brandl committed
502 503 504 505 506
#    ggtitle("p-value distributions") #+ scale_x_log10()
#
#deResults %>% ggplot(aes(padj)) +
#    geom_histogram() +
##    xlim(0,1) +
507
#    facet_grid(condition_1 ~ condition_2) + geom_vline(xintercept=0.01, slope=1, color="blue") +
Holger Brandl's avatar
Holger Brandl committed
508 509 510 511 512
#    ggtitle("Adjusted p-value distributions") #+ scale_x_log10()


# report hit criterion
#+ results='asis'
513
if (! is.null(qcutoff)) {
Holger Brandl's avatar
Holger Brandl committed
514
    echo("Using q-value cutoff of", qcutoff)
515 516
    deResults %<>% transform(is_hit = padj <= qcutoff)
}else {
Holger Brandl's avatar
Holger Brandl committed
517
    echo("Using p-value cutoff of", pcutoff)
518
    deResults %<>% transform(is_hit = pvalue <= pcutoff)
Holger Brandl's avatar
Holger Brandl committed
519 520 521 522
}
#deResults %<>% mutate(is_hit=pvalue<0.05)


523
# filter(deResults, ensembl_gene_id=="ENSMUSG00000034957")
Holger Brandl's avatar
Holger Brandl committed
524

525
deResults %<>% mutate(c1_overex = c1_over_c2_logfc > 0)
526 527 528


normCounts = counts(dds, normalized = TRUE) %>%
Holger Brandl's avatar
Holger Brandl committed
529
#    set_names(exDesign(dds)$condition) %>% rownames2column("ensembl_gene_id")
530 531 532
    set_names(colnames(countMatrix)) %>%
    rownames_to_column("ensembl_gene_id") %>%
    tbl_df
Holger Brandl's avatar
Holger Brandl committed
533 534


535
normCounts %>% write_tsv(paste0(resultsBase, "sizefac_normalized_counts_by_replicate.txt"))
Holger Brandl's avatar
Holger Brandl committed
536 537 538 539 540 541 542

## .. should be same as input
#filter(countData, ensembl_gene_id=="ENSDARG00000000001")

########################################################################################################################
#' # MA and Volcano plots

herseman's avatar
herseman committed
543
#' MA-plot: The log2 fold change for a particular comparison is plotted on the y-axis and the average of the counts normalized by log2 is shown on the x-axis ("M" for minus, because a log ratio is equal to log minus log, and "A" for average). Each gene is represented with a dot. Genes with an adjusted p value below a certain threshold are shown in cyan (True).
Holger Brandl's avatar
Holger Brandl committed
544 545 546 547 548 549
#' This plot demonstrates that only genes with a large average normalized count contain sufficient information to yield a significant call.

# deseq approach
# plotMA(deResults, main="DESeq2", ylim=c(-2,2))

# Calculation of the base means per condition
Holger Brandl's avatar
Holger Brandl committed
550 551 552 553
#https://stat.ethz.ch/R-manual/R-devel/library/base/html/attr.html

exDesign$replicate
#conSamplesOrdered = levels(dds$sample)
554
conSamplesOrdered = as_df(exDesign)[, contrastAttribute] %>% unique
Holger Brandl's avatar
Holger Brandl committed
555

Lena Hersemann's avatar
Lena Hersemann committed
556 557

baseMeanPerLvl = sapply(conSamplesOrdered, function(lvl) rowMeans(counts(dds, normalized = TRUE)[, conSamplesOrdered == lvl, drop = FALSE])) %>%
558 559 560
    as_df %>%
    rownames_to_column("ensembl_gene_id") %>%
    tbl_df
Holger Brandl's avatar
Holger Brandl committed
561

herseman's avatar
herseman committed
562
## add base means to diff??ex summary
563 564
#numResultsBeforeMerge = nrow(deResults)
deResults = baseMeanPerLvl %>%
herseman's avatar
herseman committed
565
    gather(condition, norm_count, - ensembl_gene_id) %>%
566
    inner_join(., ., by = "ensembl_gene_id", suffix = c("_1", "_2")) %>%
567
#    filter(ac(condition_1)<ac(condition_2)) %>%
568
# add diffex status
569
    inner_join(deResults)
Holger Brandl's avatar
Holger Brandl committed
570 571

#+ fig.width=16, fig.height=14
572 573 574
deResults %>% ggplot(aes(0.5 * log2(norm_count_1 * norm_count_2), log2(norm_count_2 / norm_count_1), color = pvalue < 0.05)) +
    geom_point(alpha = 0.1) +
    geom_hline(yintercept = 0, color = "red") +
herseman's avatar
herseman committed
575
    facet_grid(condition_1 ~ condition_2)
Holger Brandl's avatar
Holger Brandl committed
576 577 578 579 580 581 582 583 584 585 586


#deResults %$% pvalue %>%  log10() %>% quantile(0.05, na.rm=T)

#' A volcano plot displays unstandardized signal (e.g. log-fold-change) against noise-adjusted/standardized signal (e.g. t-statistic or -log(10)(p-value) from the t-test).
#' This scatter plot is used to quickly identify changes in large datasets composed of replicate data.
#' Here we have log2-fold-change on the X-axis and -log10 of the p-value on the y-axis.
#' This results in datapoints with low p-values (highly significant) appearing toward the top of the plot. For the x-axis the log of the fold-change is used so that changes in both directions (left and right) appear equidistant from the center.
#' Therefore interesting points are those, which are found toward the top of the plot that are far to either the left- or the right-hand side. These represent values that display large magnitude fold changes (hence being left- or right- of center) as well as high statistical significance (hence being toward the top).
#' see: https://en.wikipedia.org/wiki/Volcano_plot_%28statistics%29)

587
maxX = quantile(deResults$c1_over_c2_logfc, c(0.005, 0.99), na.rm = TRUE) %>%
588 589 590
    abs %>%
    max

Holger Brandl's avatar
Holger Brandl committed
591
##Volcano plots
592
hitCounts = filter(deResults, is_hit) %>%
593
    count(condition_1, condition_2, c1_overex) %>%
594
    rename(hits = n) %>%
Lena Hersemann's avatar
Lena Hersemann committed
595
    ## complement hit counts with directed contrasts without any diffex genes
596
    right_join(merge(mutate(contrasts),data_frame(c1_overex=c(T,F)), by=NULL)) %>% mutate(hits=if_else(is.na(hits), as.integer(0), hits)) %>%
597
    merge(data.frame(c1_overex = c(T, F), x_pos = c(maxX - maxX*0.1, -maxX + maxX*0.1)))
Holger Brandl's avatar
Holger Brandl committed
598

599
hitCounts %<>% mutate(y_pos = quantile(- log10(deResults$pvalue), 0.987, na.rm = TRUE))
Holger Brandl's avatar
Holger Brandl committed
600

Holger Brandl's avatar
Holger Brandl committed
601
#+ fig.width=16, fig.height=14
602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617


## todo remove; part of commons v1.41
trim_outliers <- function(values, probs=c(0.05, 0.95)){
    values = deResults$pvalue
    stopifnot(length(probs) == 2)
    quantiles = quantile(values, probs, na.rm = TRUE)

    pmax(quantiles[1], pmin(quantiles[2], values))
}



deResults %>%
    mutate(p_trimmed = trim_outliers(pvalue, c(0.01, 1))) %>%
    filter(! is.na(is_hit)) %>%
618
    ggplot(aes(c1_over_c2_logfc, - log10(p_trimmed), color = is_hit)) +
Holger Brandl's avatar
Holger Brandl committed
619 620
    # geom_jitter(alpha = 0.1, position = position_jitter(height = 0.2)) +
    geom_point(alpha = 0.1) +
Holger Brandl's avatar
Holger Brandl committed
621
#    theme_bw() +
622 623
    xlim(- maxX, maxX) +
    scale_color_manual(values = c("TRUE" = "red", "FALSE" = "black")) +
Holger Brandl's avatar
Holger Brandl committed
624
#    ggtitle("Insm1/Ctrl") +
625 626 627
## http://stackoverflow.com/questions/19764968/remove-point-transparency-in-ggplot2-legend
    guides(colour = guide_legend(override.aes = list(alpha = 1))) +
## tweak axis labels
Holger Brandl's avatar
Holger Brandl committed
628
    xlab(expression(log[2]("x/y"))) +
629
    ylab(expression(- log[10]("p value"))) +
Holger Brandl's avatar
Holger Brandl committed
630 631
#  ylim(0,50) +

632
## add hit couts
633
    geom_text(aes(label = hits, x = x_pos, y = y_pos), color = "red", size = 8, data = hitCounts) +
herseman's avatar
herseman committed
634
    facet_grid(condition_1 ~ condition_2)
Holger Brandl's avatar
Holger Brandl committed
635 636


Holger Brandl's avatar
Holger Brandl committed
637
########################################################################################################################
638
# Count Table Exports
Holger Brandl's avatar
Holger Brandl committed
639 640


Holger Brandl's avatar
Holger Brandl committed
641
## optionally install bioconducor package
Holger Brandl's avatar
Holger Brandl committed
642
install_package("biomaRt")
Holger Brandl's avatar
Holger Brandl committed
643

Holger Brandl's avatar
Holger Brandl committed
644

645 646
ensembl_dataset= if (!is.null(opts$ensembl_db)) paste0(opts$ensembl_db, "_gene_ensembl")  else guess_mart(countData$ensembl_gene_id)

Holger Brandl's avatar
Holger Brandl committed
647
# Load transcriptome annotations needed for results annotation
648

649
if (is.null(gene_info_file)){
650 651 652 653 654 655 656 657 658 659 660 661 662 663
    geneInfo = quote({
        ## mart = biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl"))??
        #  mart = biomaRt::useDataset(guess_mart(countData$ensembl_gene_id), mart = biomaRt::useMart("ensembl"))
        ## todo fix this https://support.bioconductor.org/p/74322/
        # mart = biomaRt::useDataset(guess_mart(countData$ensembl_gene_id), mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", host="www.ensembl.org"))
        # mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = "mmusculus_gene_ensembl", host = "dec2016.archive.ensembl.org", path = "/biomart/martservice", archive = FALSE)
        mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = ensembl_dataset, host = "aug2017.archive.ensembl.org", path = "/biomart/martservice", archive = FALSE)


        c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>%
            biomaRt::getBM(mart = mart) %>%
            tbl_df
    }) %>% cache_it("geneInfo")
} else {
664
    geneInfo = read_tsv(gene_info_file)
665 666 667
    colnames(geneInfo)[1] = "ensembl_gene_id"
}

Holger Brandl's avatar
Holger Brandl committed
668

669 670
geneLengths = transmute(geneInfo, ensembl_gene_id, gene_length = end_position - start_position)
geneDescs = transmute(geneInfo, ensembl_gene_id, gene_name = external_gene_name, gene_description = description)
herseman's avatar
herseman committed
671 672


Holger Brandl's avatar
Holger Brandl committed
673
## save as reference for downstream analysis
674
write_tsv(geneInfo, path = paste0(resultsBase, "geneInfo.txt"))
Holger Brandl's avatar
Holger Brandl committed
675 676


677
# Export counts per replicate as double-normalized FPKMs=(read_count * 10^9)/(gene_length * library_size)
678 679
fpkms = countMatrix %>%
    as.data.frame %>%
Holger Brandl's avatar
Holger Brandl committed
680
    rownames_to_column("ensembl_gene_id") %>%
681 682
    gather(replicate, num_tags, - ensembl_gene_id) %>%
    tbl_df %>%
683
    left_join(geneLengths, by = "ensembl_gene_id") %>%
Holger Brandl's avatar
Holger Brandl committed
684 685
## calculate normalized counts
    group_by(replicate) %>%
686
    mutate(lib_size = sum(num_tags), fpkm = ((1E9 / lib_size) * (num_tags / gene_length)) %>% round(digits = 3)) %>%
Holger Brandl's avatar
Holger Brandl committed
687 688 689 690 691 692
    ungroup()

#fpkms %>% ggplot(aes(fpkm)) + geom_histogram() + scale_x_log10(labels=comma)

fpkms %>%
    transmute(ensembl_gene_id, replicate, fpkm) %>%
Holger Brandl's avatar
Holger Brandl committed
693
    spread(replicate, fpkm) %>%
694 695 696
    left_join(geneDescs) %>%
    push_left(c("gene_name", "gene_description")) %>%
    write_tsv(paste0(resultsBase, "fpkms_by_replicate.txt"))
Holger Brandl's avatar
Holger Brandl committed
697

698
# Aggregate FPKMs by condition and export into tabele
699
fpkms %>%
700
    inner_join(exDesign, by = "replicate") %>%
701
# group_by_(.dots = c(contrastAttribute, "ensembl_gene_id")) %>%
702 703
#     group_by(!!contrastAttribute, ensembl_gene_id) %>%
    group_by(condition, ensembl_gene_id) %>%
704 705 706 707 708 709
    summarize(mean_fpkm = mean(fpkm)) %>%
    ungroup() %>%
    spread(condition, mean_fpkm) %>%
    left_join(geneDescs) %>%
    push_left(c("gene_name", "gene_description")) %>%
    write_tsv(paste0(resultsBase, "fpkms_by_condition.txt"))
Holger Brandl's avatar
Holger Brandl committed
710

711
# [FPKMs as Matrix](`r paste0(resultsBase, "fpkms_by_condition.txt")`)
Holger Brandl's avatar
Holger Brandl committed
712

Holger Brandl's avatar
Holger Brandl committed
713

714
# Export counts per replicate as TPMs (transcript per million)
715
# calculation of tpms and generation of a wide table including gene description and the external gene name
Holger Brandl's avatar
Holger Brandl committed
716

717 718 719 720 721 722 723 724 725 726
tpms = countMatrix %>%
    as.data.frame %>%
    rownames_to_column("ensembl_gene_id") %>%
    gather(replicate, num_tags, - ensembl_gene_id) %>%
    tbl_df %>%
    left_join(geneLengths) %>%
    mutate(length_norm_count = num_tags / gene_length) %>%
    group_by(replicate) %>%
    mutate(length_norm_sum = sum(length_norm_count, na.rm = T), tpm = (length_norm_count * (1E6 / length_norm_sum)) %>% round(digits = 3)) %>%
    ungroup()
Holger Brandl's avatar
Holger Brandl committed
727

728 729 730 731 732 733
tpms %>%
    select(ensembl_gene_id, replicate, tpm) %>%
    spread(replicate, tpm) %>%
    left_join(geneDescs) %>%
    push_left(c("gene_name", "gene_description")) %>%
    write_tsv(paste0(resultsBase, "tpms_by_replicate.txt"))
herseman's avatar
herseman committed
734

735
# Also average TPMs per condition and export
736 737 738 739 740 741 742 743 744 745
tpms %>%
    inner_join(exDesign, by = "replicate") %>%
    group_by(condition, ensembl_gene_id) %>%
# group_by(condition, ensembl_gene_id) %>%
    summarize(mean_tpm = mean(tpm)) %>%
# select(ensembl_gene_id, replicate, gene_name, gene_description, mean_tpm) %>%
    spread(condition, mean_tpm) %>%
    left_join(geneDescs) %>%
    push_left(c("gene_name", "gene_description")) %>%
    write_tsv(paste0(resultsBase, "tpms_by_conditon.txt"))
herseman's avatar
herseman committed
746 747 748 749 750 751


#-----------------------------



Holger Brandl's avatar
Holger Brandl committed
752
# Export the complete dataset for later analysis
Holger Brandl's avatar
Holger Brandl committed
753
deAnnot = deResults %>% left_join(geneInfo)
754 755 756 757
#inner_join(normCounts) %>%
#merge(., contrasts) %>%
#merge(.,., suffixes=c("_1", "_2"), by=NULL) %>%
#inner_join(baseMeanPerLvl, id='ensembl_gene_id') %>%
Holger Brandl's avatar
Holger Brandl committed
758

Holger Brandl's avatar
Holger Brandl committed
759

760
write_tsv(deAnnot, path = paste0(resultsBase, "de_results.txt"))
761
#deAnnot = read_tsv(paste0(resultsBase, "de_results.txt"))
762
#  [de_results.txt](`r paste0(resultsBase, "de_results.txt")`)
Holger Brandl's avatar
Holger Brandl committed
763 764

## also export results which we'll always need for pathways overlays
765 766 767
deAnnot %>%
    transmute(
    ensembl_gene_id,
herseman's avatar
herseman committed
768
    contrast = paste(condition_1, "vs", condition_2),
769
    plot_score = - log10(pvalue) * ifelse(c1_overex, 1, - 1)
770 771
    ) %>%
    spread(contrast, plot_score) %>%
Holger Brandl's avatar
Holger Brandl committed
772
    write_tsv(paste0(resultsBase, "plot_score_matrix.txt"))
Holger Brandl's avatar
Holger Brandl committed
773 774 775 776 777 778 779 780 781 782

#deAnnot %>% filter(ensembl_gene_id=="FBgn0000015")


## todo understand purpose and effeect of indpendentFiltering (see https://support.bioconductor.org/p/57128/)
#deResults %>% count(is_hit)
#deAnnot %>% count(is_hit)
#deAnnot %>% count(is.na(padj))


Holger Brandl's avatar
Holger Brandl committed
783

Lena Hersemann's avatar
Lena Hersemann committed
784 785 786 787
########################################################################################################################
#' ## Count Outlier Analysis

#' For how many genes did Deseq did not assign a p-value (due to outliers; see section 1.5.3 os [DEseq manual](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#count-outlier-detection))
Holger Brandl's avatar
Holger Brandl committed
788 789


Holger Brandl's avatar
Holger Brandl committed
790 791
deAnnot %>%
    filter(is.na(pvalue)) %>%
herseman's avatar
herseman committed
792
    mutate(contrast = paste(condition_1, "vs", condition_2)) %>%
793 794
    count(contrast) %>%
    n_as("genes_with_NA_pvalue")
Holger Brandl's avatar
Holger Brandl committed
795 796


Lena Hersemann's avatar
Lena Hersemann committed
797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836
if ((deAnnot %>% filter(is.na(pvalue)) %>% nrow) > 0) {

    na_counts = norm_counts %>%
        as.data.frame() %>%
        rownames_to_column("ensembl_gene_id") %>%
        tbl_df %>%
        gather(replicate, norm_count, - ensembl_gene_id) %>%
        right_join(deAnnot %>%
            filter(is.na(pvalue)) %>%
            select(ensembl_gene_id, condition_1, condition_2)) %>%
    # glimpse
    # filter(ensembl_gene_id=="ENSMUSG00000000120")
        left_join(select(exDesign, replicate, condition)) %>%
        filter(condition_1 == condition | condition_2 == condition)

    na_counts %>%
        filter(ensembl_gene_id %in% unique(ensembl_gene_id)[1 : 6]) %>%
        ggplot(aes(condition, norm_count, label = replicate)) +
        geom_point() +
        geom_text() +
        facet_wrap(~ ensembl_gene_id, scales = "free_y") +
        ggtitle("exmple gene counts by replicate")

    na_counts %>%
        group_by(ensembl_gene_id) %>%
        filter(norm_count == max(norm_count)) %>%
        ggplot(aes(replicate)) +
        geom_bar() +
        coord_flip() +
        facet_grid(condition ~ ., scales = "free_y") +
        ylab("Genes without p-value in which sample has largest count") +
        ggtitle("Which replicates are causing most of the NAs")
}

########################################################################################################################
#' ## Hits Summary

#' Extract hits from deseq results
degs = deAnnot %>% filter(is_hit)

837
# ggplot(degs, aes(paste(condition_1, "vs",  condition_2))) + geom_bar() +xlab(NULL) + ylab("# DBGs") + ggtitle("DBG count summary") + coord_flip()
Lena Hersemann's avatar
Lena Hersemann committed
838

Holger Brandl's avatar
Holger Brandl committed
839
# degs %>%
840
#     ggplot(aes(paste(condition_1, "vs",  condition_2), fill=c1_overex)) +
Holger Brandl's avatar
Holger Brandl committed
841 842 843 844 845 846 847
#     geom_bar(position="dodge") +
#     xlab(NULL) + ylab("# DBGs") +
#     ggtitle("DBG count summary by direction of expression") +
#     coord_flip()

# Export DBA genes
## disabled because we just subset the annotated data now to define degs
848
#degsAnnot = degs %>%
Holger Brandl's avatar
Holger Brandl committed
849 850 851 852
#    inner_join(normCounts) %>%
#    left_join(geneInfo) %>%
#    left_join(bindCats)
degs %>% write_tsv(paste0(resultsBase, "diffex_genes.txt"))
853
#degs = read_tsv(paste0(resultsBase, "diffex_genes.txt"))
Holger Brandl's avatar
Holger Brandl committed
854 855 856 857 858 859
#' [Differentially Expressed Genes](`r paste0(resultsBase, "diffbind_genes.txt")`)


#unloadNamespace("dplyr"); require(dplyr)

## export slim hit list for downstream enrichment analysis
860
degs %>%
herseman's avatar
herseman committed
861
    transmute(ensembl_gene_id, contrast = paste(condition_1, "vs", condition_2)) %>%
862
    write_tsv(paste0(resultsBase, "degs_by_contrast.txt"))
Holger Brandl's avatar
Holger Brandl committed
863

864
degs %>%
865
    transmute(ensembl_gene_id, contrast = paste(condition_1, if_else(c1_overex, ">", "<"), condition_2)) %>%
866 867
    write_tsv(paste0(resultsBase, "degs_by_contrast_directed.txt"))

Holger Brandl's avatar
Holger Brandl committed
868
## also export gsea inputs
869 870 871 872 873 874
#deAnnot %>%
#    mutate(contrast = paste(condition_1, "vs", condition_2)) %>%
#    filter(! is.na(pvalue)) %>%
#    transmute(contrast, ensembl_gene_id, rank_score = - log10(pvalue)) %>% #count(is.na(rank_score))
##    arrange(contrast, rank_score) %>%
#    write_tsv(paste0(resultsBase, "gsea__genes_by_contrast__undirected.txt"))
Holger Brandl's avatar
Holger Brandl committed
875

876 877 878
#deAnnot %>%
#    mutate(contrast = paste(condition_1, "vs", condition_2)) %>%
#    filter(! is.na(pvalue)) %>%
879
#    transmute(contrast, ensembl_gene_id, rank_score = - log10(pvalue) * ifelse(c1_overex, 1, - 1)) %>%
880 881
##    arrange(contrast, rank_score) %>%
#    write_tsv(paste0(resultsBase, "gsea__genes_by_contrast__directed.txt"))
Holger Brandl's avatar
Holger Brandl committed
882 883 884 885 886 887 888 889 890


# ## render interactive hit browser
# #+results='asis'
# degs %>%
#     left_join(geneInfo) %>%
#     mutate(
#         igv=paste0("<a href='http://localhost:60151/goto?locus=", chromosome_name,":", start_position, "-", end_position, "'>IGV</a>")
#     ) %>%
891
#     select(c1_over_c2_logfc, pvalue, ensembl_gene_id, condition_1, condition_2, external_gene_name, description, igv) %>%
Holger Brandl's avatar
Holger Brandl committed
892 893 894 895 896 897
# #    kable("html", table.attr = "class='dtable'", escape=F)
#     datatable(escape=F)


#' ### DEG Counts

Lena Hersemann's avatar
Lena Hersemann committed
898 899
#+ fig.height=nrow(contrasts)/2+2, eval=nrow(degs)>0

900 901
#ggplot(degs, aes(paste(condition_1, "vs",  condition_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DEGs by contrast") + coord_flip()
ggplot(degs, aes(paste(condition_1, "vs", condition_2), fill = (c1_overex))) +
902 903 904 905 906
    geom_bar() +
    xlab(NULL) +
    ylab("# DGEs") +
    ggtitle("DEGs by contrast") +
    coord_flip()
Holger Brandl's avatar
Holger Brandl committed
907

908
#with(degs, as.data.frame(table(condition_1, condition_2, c1_overex))) %>% filter(Freq >0) %>% kable()
909
degs %>%
910
    count(condition_1, condition_2, c1_overex) %>%
911
    kable()
Holger Brandl's avatar
Holger Brandl committed
912 913 914 915 916 917


#'  DEGs (differentially expressed genes) are contained in the following table


#' DEGs can be browsed in Excel using the exported table or via the embedded table browser. To enable the IGV links, you need to set the port in the IGV preferences to 3334.
918
kableDegs = degs
Lena Hersemann's avatar
Lena Hersemann committed
919
if (nrow(degs) > 2000) {
920 921 922
    kableDegs = degs %>% arrange(padj) %>% head(1000)
    #  Error: object 'q_value' not found. So: padj used instead of '-q_value'
    print("just showing first 1000 most significant hits (highest p-value) in interactive hit table!!!! Use Excel to browser to browse the full set")
Holger Brandl's avatar
Holger Brandl committed
923 924 925 926
}

#+ results='asis'
kableDegs %>%
927 928 929 930 931
#inner_join(geneLoci) %>%
    mutate(
    ensembl = paste0("<a href='http://www.ensembl.org/Multi/Search/Results?y=0;site=ensembl_all;x=0;page=1;facet_feature_type=Gene;q=", ensembl_gene_id, "'>", ensembl_gene_id, "</a>"),
    igv = paste0("<a href='http://localhost:60151/goto?locus=", chromosome_name, ":", start_position, "-", end_position, "'>IGV</a>")
    ) %>%
932
    select(condition_1, condition_2, c1_over_c2_logfc, pvalue, padj, ensembl_gene_id, external_gene_name, description) %>%
Holger Brandl's avatar
Holger Brandl committed
933
#  kable("html", table.attr = "class='dtable'", escape=F)
934
    datatable(escape = F)
Holger Brandl's avatar
Holger Brandl committed
935 936 937

## just needed to restore environment easily
#save(degs, file=".degs.RData")
938
# degs = local(get(load(".degs.RData")))
Holger Brandl's avatar
Holger Brandl committed
939 940 941 942 943


########################################################################################################################
#' ## MA Plots

Holger Brandl's avatar
Holger Brandl committed
944
## todo round of MA should be enough.
Holger Brandl's avatar
Holger Brandl committed
945

Holger Brandl's avatar
Holger Brandl committed