dge_limma.R 20.6 KB
Newer Older
1 2
#!/usr/bin/env Rscript

3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
#' # Differential Analysis Using Limma

#' This workflow is based on https://www.bioconductor.org/help/workflows/RNAseq123/

results_prefix = "limma_diffex"

#+ include=FALSE
suppressMessages(require(docopt))

doc = '
Perform a differential gene expression analysis using limma and edgeR
Usage: featcounts_deseq_mf.R [options] <count_matrix> <design_matrix>

Options:
--contrasts=<tab_delim_table>   Table with sample pairs for which dge analysis should be performed
18
--design <formula>              Design fomula for DeSeq with contrast attribute at the end [default: condition]
19 20 21 22 23 24 25 26 27 28 29 30 31
--qcutoff <qcutoff>             Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01]
--pcutoff <pcutoff>             Override q-value filter and filter by p-value instead
--out <name_prefix>             Name to prefix all generated result files [default: ]
--lfc <lfc_cutoff>              Just report genes with abs(lfc) > lfc_cutoff as hits [default: 1.0]
--gene_info <info_file>         Gene information file downloaded with gene id in first column
'

# commandArgs = function(foo) c("--gene_info", "../mmus_ens_aug2017_uniprot_compl_gene_info.txt","--contrasts", "example_contrasts.txt", "../inten_matrix_acc.txt", "../diff_abund/ex_design.txt")
opts = docopt(doc, commandArgs(TRUE))

## load some packages first because of name-space order
library(knitr)

32 33
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.49/R/core_commons.R")
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.49/R/ggplot_commons.R")
34 35 36

## also load common helper for expression data analysis
# source(interp_from_env("${NGS_TOOLS}/dge_workflow/diffex_commons.R"))
37
devtools::source_url("https://git.mpi-cbg.de/bioinfo/ngs_tools/raw/v10/dge_workflow/diffex_commons.R")
38 39 40 41 42 43 44 45 46


## process input arguments
count_matrix_file = opts$count_matrix
design_matrix_file = opts$design_matrix
contrasts_file = opts$contrasts
gene_info_file = opts$gene_info
assert(is.null(gene_info_file) || file.exists(gene_info_file), "invalid gene_info_file")

47
designFormula = opts$design
48
assert(str_detect(designFormula, "^condition.*")) ## make sure that the condition comes before all batch factors
49

50
if (!is.null(opts$out)){  results_prefix = opts$out }
51 52 53 54 55 56 57 58

pcutoff = if (is.null(opts$pcutoff))NULL else as.numeric(opts$pcutoff)
qcutoff = if (is.numeric(pcutoff))NULL else as.numeric(opts$qcutoff)
if (is.numeric(pcutoff))opts$qcutoff = NULL

lfc_cutoff = if (is.null(opts$lfc))0 else as.numeric(opts$lfc)


59 60 61 62 63 64
#' Run configuration was
vec_as_df(unlist(opts)) %>%
    filter(! str_detect(name, "^[<-]")) %>%
    kable()

########################################################################################################################
65 66 67 68 69 70 71 72 73 74 75
#' ## Data Preparation

#' The working directory of the analysis was: `r getwd()`

## both batches combines
expDesign = read_tsv(design_matrix_file) %T>% glimpse
# countData = read_tsv(interp_from_env("../logms_inten_matrix_acc.txt")) %T>% glimpse
countData = read_tsv(count_matrix_file) %T>% glimpse

names(countData)[1] = "gene_id"

76 77 78 79 80 81 82 83 84 85
if (! is.null(contrasts_file)) {
    contrasts = read_tsv(contrasts_file) %T>% { print(kable(.))}
}else {
    contrasts = select(expDesign, condition) %>% distinct %>%
        merge(., ., suffixes = c("_1", "_2"), by = NULL) %>%
        filter(ac(condition_1) > ac(condition_2))
    # write_tsv(contrasts, paste(resultsBase, "contrasts.txt"))
}


86

87 88 89
# zero-imputation is disabled here because this should be better implement per experiment
# countData %<>% mutate_if(is.numeric, funs(replace(., is.na(.), 0)))

90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
########################################################################################################################
#' ## QC, Normalization and Preprocessing


cdLong = gather(countData, sample, expr, - gene_id) %T>% glimpse

cdLong %>% ggplot(aes(sample)) +
    geom_bar() +
    coord_flip() +
    ggtitle("measurements per samples and sample")


cdLong %>%
    group_by(sample) %>%
    summarize(total = sum(expr)) %>%
    ggplot(aes(sample, weight = total)) +
    geom_bar() +
    coord_flip()


load_pack(limma)
load_pack(edgeR)

# x <- readDGE(files, columns=c(1,3))


#' create DGEList object
117 118 119
expMatrix = countData %>%
    column_to_rownames("gene_id") %>%
    as.matrix
120 121 122 123 124 125 126 127 128 129 130 131


#' remove lowely expressed genes
# keep.exprs <- rowSums(cpm>1)>=3
# x <- x[keep.exprs,, keep.lib.sizes=FALSE]


# load_pack(mice)
# complete(mice(expMatrix))
expMatrix = expMatrix[complete.cases(expMatrix),]


132 133 134
group_labels = data_frame(replicate = colnames(expMatrix)) %>%
    left_join(expDesign) %>%
    pull(condition)
135 136 137
names(group_labels) = colnames(expMatrix)
makePcaPlot(t(expMatrix), color_by = group_labels, title = "PCA of quantifiable proteins in all conditions")

138 139
#' Also do a scatter plot matrix of the PCs
mydata.pca = prcomp(t(expMatrix), retx = TRUE, center = TRUE, scale. = FALSE)
140

141 142 143 144 145 146 147 148 149 150 151 152

# screeplot(mydata.pca)
# devtools::install_github("vqv/ggbiplot")
# require(ggbiplot)
ggbiplot::ggscreeplot(mydata.pca) + geom_col()

# load_pack(GGally)
pcs = mydata.pca$x %>% as_df %>% rownames_to_column("sample")
pcs %>% GGally::ggpairs(columns=2:6, mapping=ggplot2::aes(color=sample), upper="blank", legend=c(3,3)) + theme(legend.position = "bottom")


#' Also analyze spearman correlation
153 154 155 156 157 158 159 160 161 162 163 164 165
correlation = cor(expMatrix, method = "spearman")
library(lattice)
levelplot(correlation, scales = list(x = list(rot = 90)), pretty = TRUE, main = "Spearman correlation between conditions after Normalization", xlab = "Conditions", ylab = "Conditions")


install_package("cummeRbund")
res = cummeRbund::JSdist(cummeRbund::makeprobs(expMatrix)) %>%
    hclust() %>%
    as.dendrogram()
par(mar = c(10, 4, 4, 4))
plot(res, main = "Sample Clustering of Expressed Genes")


166
repDist = as.dist(1 - correlation)
167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195
hclust(repDist) %>% plot

#rownames(repDist) = colnames(expMatrix)
#colnames(repDist) = colnames(expMatrix)
load_pack(d3heatmap)
## todo hide column dendrogram but keep cluster-order
#distMatrix %>% d3heatmap(xaxis_height=1, Colv = T, dendrogram="row")
repDist %>% d3heatmap(xaxis_height = 1)


# load_pack(GGally)
# cdLong %>%
#     select(gene_id, sample, expr) %>%
#     spread(sample, expr) %>% select(-gene_id) %>%
#     ggpairs()

########################################################################################################################
#' ## Data normalization

#source("https://bioconductor.org/biocLite.R")

load_pack(limma)
load_pack(edgeR)

orderMatcheExpDesign = data_frame(replicate = colnames(expMatrix)) %>%
    mutate(col_index = row_number()) %>%
    right_join(expDesign, by = "replicate") %>%
    arrange(col_index)

196 197 198 199 200 201
#' Build design matrix
#' > A key strength of limma’s linear modelling approach, is the ability accommodate arbitrary experimental complexity. Simple designs, such as the one in this workflow, with cell type and batch, through to more complicated factorial designs and models with interaction terms can be handled relatively easily
#'
#' Make sure that non of the batch-factors  is confounded with treatment (condition). See https://support.bioconductor.org/p/39385/ for a discussion
#' References
#' * https://f1000research.com/articles/5-1408/v1
202

203 204
# design <- orderMatcheExpDesign %$% model.matrix(~ 0 +  condition + prep_day)
design = orderMatcheExpDesign %$% model.matrix(formula(as.formula(paste("~0+", designFormula))))
205 206 207 208 209 210
rownames(design) <- orderMatcheExpDesign$replicate

#design <- model.matrix(~0+group+lane)
#colnames(design) <- gsub("group", "", colnames(design))
#design

211 212 213
expMatrix = countData %>%
    column_to_rownames("gene_id") %>%
    as.matrix
214 215 216 217

#exp_study = DGEList(counts=column2rownames(countsMatrix, "ensembl_gene_id"), group=names(countsMatrix)[-1])
exp_study = DGEList(counts = expMatrix, group = orderMatcheExpDesign$condition)

218
# par(mfrow=c(1,2)) ## 2panel plot for mean-var relationship before and after boom
219

220
#' Removing heteroscedasticity from count data
221 222 223 224 225
voomNorm <- voom(exp_study, design, plot = TRUE)
str(voomNorm)
# exp_study$counts is equilvaent to voomNorm$E see https://www.bioconductor.org/help/workflows/RNAseq123/


226 227 228 229 230 231
voomMat = voomNorm$E
group_labels = data_frame(replicate = colnames(voomMat)) %>%
    left_join(expDesign) %>%
    pull(condition)
names(group_labels) = colnames(voomMat)
makePcaPlot(t(voomMat), color_by = group_labels, title = "Normalized PCA of quantifiable proteins in all conditions")
232

233

234 235
#' Corrleate normalized data with raw expression
inner_join(expr_matrix_to_df(expMatrix) , expr_matrix_to_df(voomNorm$E), suffix = c("_raw", "_voom"), by = c("gene_id", "replicate")) %>%
236
    sample_frac(0.1) %>%
237 238 239 240
    ggplot(aes(expression_raw, expression_voom)) +
    geom_point() +
    scale_x_log10() +
    ggtitle("voom vs raw")
241 242

#' build contrasts
243 244 245 246 247 248 249
#contr.matrix <- makeContrasts(
#    BasalvsLP = Basal-LP,
#    BasalvsML = Basal-ML,
#    LPvsML = LP-ML,
#    levels = colnames(design))

#' contrasts matrix was
250
contr.matrix = makeContrasts(contrasts = with(contrasts, paste0("condition", condition_1, "-", "condition", condition_2)), levels = colnames(design))
251 252
contr.matrix

253
#' ## Model Fitting & Moderated t-test
254 255 256 257 258 259

#' Linear modelling in limma is carried out using the lmFit and contrasts.fit functions originally written for application to microarrays. The functions can be used for both microarray and RNA-seq data and fit a separate model to the expression values for each gene. Next, empirical Bayes moderation is carried out by borrowing information across all the genes to obtain more precise estimates of gene-wise variability  (source: RNAseq123)
vfit <- lmFit(voomNorm, design)
vfit <- contrasts.fit(vfit, contrasts = contr.matrix)
efit <- eBayes(vfit)

260
plotSA(efit, main = "Final model: Mean−variance trend")
261 262 263 264 265 266 267 268 269 270




#' Extract test statistics

#' Examining the number of DE genes before biological relevance filtering
summary(decideTests(efit)) %>% as_df %>% kable

#' Some studies require more than an adjusted p-value cut-off. For a stricter definition on significance, one may require log-fold-changes (log-FCs) to be above a minimum value. The treat method (McCarthy and Smyth 2009) can be used to calculate p-values from empirical Bayes moderated t-statistics with a minimum log-FC requirement.
271
tfit <- treat(vfit, lfc = lfc_cutoff)
272 273 274 275 276 277
dt <- decideTests(tfit)
summary(dt)
summary(dt) %>% as_df %>% kable

#' plot MA per contrast
numContrasts = length(colnames(tfit))
278 279
par(mfrow=c(1,1))
# par(mfrow = c(4, numContrasts)) ## 2panel plot for mean-var relationship before and after boom
280
# colnames(tfit) %>% iwalk(~ plotMD(tfit, column=.y, status=dt[,.y], main=.x, xlim=c(-8,13)))
281
colnames(tfit) %>% iwalk(~ plotMD(tfit, column = .y, status = dt[, .y], main = .x))
282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300


# load_pack("Glimma")
# glMDPlot(tfit) # -> just a clickable ma plot with table
# glMDPlot(tfit, coef=1, status=dt, main=colnames(tfit)[1], side.main="ENTREZID", counts=x$counts, groups=group, launch=FALSE)



# diffexData = colnames(contr.matrix) %>% imap(function(contrast, cIndex){ print(cIndex)})

# basal.vs.lp <- topTreat(tfit, coef=2, n=Inf)
deResults = colnames(contr.matrix) %>%
    imap(function(contrast, cIndex){
        #DEBUG contrast="conditione12-conditione16"; cIndex=1
        topTreat(tfit, coef = cIndex, n = Inf) %>%
            tbl_df %>%
            rownames_to_column("gene_id") %>%
            mutate(contrast = str_replace_all(contrast, "condition", "")) %>%
            separate(contrast, c("condition_1", "condition_2"), "-") %>%
301
        # push_left("contrast") %>%
302 303 304 305 306 307 308 309 310
            pretty_columns
    }) %>%
    bind_rows()


#
# #' calculate sample means
# https://support.bioconductor.org/p/62541/

311 312 313 314 315 316
sampleMeans = voomNorm$E %>%
    as_df %>%
    rownames_to_column("gene_id") %>%
    tbl_df %>%
    gather(replicate, norm_count, - gene_id) %>%
    left_join(expDesign) %>%
317 318 319
    group_by(gene_id, condition) %>%
    summarize(mean_expr = mean(norm_count, na.rm = T)) %>%
    ungroup() %>%
320
# spread(condition, mean_pep)
321
    inner_join(., ., by = "gene_id", suffix = c("_1", "_2")) #%>%
322 323 324
#    filter(ac(condition_1)<ac(condition_2)) %>%
# add diffex status
# inner_join(deResults)
325 326 327 328 329 330 331 332 333 334 335 336 337 338


deResults %<>% left_join(sampleMeans)

#logFcs = contrasts %>% rowwise %>% do(with(., {
#    # condition_1="Q80"; condition_2="Q20"
#    sampleMeans %>% select_("gene_id", condition_1, condition_2) %>%
#        set_names("gene_id", "condition_1_mean", "condition_2_mean") %>% group_by(gene_id) %>%
#        transmute(logfc=log2(condition_1_mean/condition_2_mean), condition_1=condition_1, condition_2=condition_2)
#}))


#' apply the hit criterion

339 340
deResults %>% ggplot(aes(p_value)) + geom_histogram(binwidth=.01)
deResults %>% ggplot(aes(adj_p_val)) + geom_histogram(binwidth=.01)
341 342 343 344 345

# report hit criterion
#+ results='asis'
if (! is.null(qcutoff)) {
    echo("Using q-value cutoff of", qcutoff)
346
    deResults %<>% transform(is_hit = adj_p_val <= qcutoff)
347 348
}else {
    echo("Using p-value cutoff of", pcutoff)
349
    deResults %<>% transform(is_hit = p_value <= pcutoff)
350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373
}

#+
deResults %<>% mutate(c1_overex = logfc > 0)

deResults %>%
    count(condition_1, condition_2, is_hit) %>%
    filter(is_hit)

########################################################################################################################
#' ## Annotate results

if (is.null(gene_info_file)) {
    ensembl_dataset = if (! is.null(opts$ensembl_db)) paste0(opts$ensembl_db, "_gene_ensembl")  else guess_mart(countData$ensembl_gene_id)

    geneInfo = quote({
        ## mart = biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl"))??
        #  mart = biomaRt::useDataset(guess_mart(countData$ensembl_gene_id), mart = biomaRt::useMart("ensembl"))
        ## todo fix this https://support.bioconductor.org/p/74322/
        # mart = biomaRt::useDataset(guess_mart(countData$ensembl_gene_id), mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", host="www.ensembl.org"))
        # mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = "mmusculus_gene_ensembl", host = "dec2016.archive.ensembl.org", path = "/biomart/martservice", archive = FALSE)
        mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = ensembl_dataset, host = "aug2017.archive.ensembl.org", path = "/biomart/martservice", archive = FALSE)


374
        c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>%
375
            biomaRt::getBM(mart = mart) %>%
376 377
            tbl_df %>%
            rename(gene_id = ensembl_gene_id)
378
    }) %>% cache_it("geneInfo")
379 380 381

    # geneLengths = transmute(geneInfo, gene_id, gene_length = end_position - start_position)
    geneDescs = transmute(geneInfo, gene_id, gene_name = external_gene_name, gene_description = description)
382
} else {
383 384 385 386 387 388 389 390
    if(gene_info_file != "NA"){
        geneInfo = read_tsv(gene_info_file)
        colnames(geneInfo)[1] = "gene_id"
    }else{
        # no info --> create dummy info
        geneInfo = distinct(countData, gene_id)
    }

391
    geneDescs = geneInfo
392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422
}


write_tsv(geneInfo, path = add_prefix("geneInfo.txt"))


deAnnot = deResults %>% left_join(geneInfo)

########################################################################################################################
#' ## Results & Discussion

## DEBUG-START
#if(F){
#deResults %>% table_browser
#
#paNorm %>% filter(gene_id=="ENSMUSG00000015970") %>% ggplot(aes(sample, expr)) + geom_point()
#
## debug p adjustment
#deResults %>% ggplot(aes(q_value, p_value)) +
#geom_point(alpha=0.3) +
#facet_grid(condition_1 ~ condition_2)
#}
## DEBUG-END


#' Are log2 distributions symmetric around 0 (because of globally we'd expect no change in abundance)
deResults %>% ggplot(aes(logfc)) +
    geom_histogram() +
    facet_grid(condition_1 ~ condition_2) +
    geom_vline(xintercept = 0, color = "blue") +
# xlim(-2,2) +
423
    ggtitle("condition_1 over condition_2 logFC ")
424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462


## Extract hits from deseq results
degs = deAnnot %>% filter(is_hit)

degs %>%
    transmute(gene_id, contrast = paste(condition_1, "vs", condition_2)) %>%
    write_tsv(add_prefix("degs_by_contrast.txt"))


# degs %>%
#     count(condition_1, condition_2, c1_overex) %>%
#     kable()
degs %>%
    count(condition_1, condition_2, c1_overex) %>%
    mutate(c1_overex = ifelse(c1_overex, ("Up in condition_1"), "Down in condition_1")) %>%
    spread(c1_overex, n) %>%
    kable()

ggplot(degs, aes(paste(condition_1, "vs", condition_2), fill = (c1_overex))) +
    geom_bar(position = "dodge") +
    xlab(NULL) +
    ylab("# DGEs") +
    ggtitle("DEGs by contrast") +
    coord_flip()


#with(degs, as.data.frame(table(condition_1, condition_2, c1_overex))) %>% filter(Freq >0) %>% kable()

maxX = quantile(deResults$logfc, c(0.005, 0.99), na.rm = TRUE) %>%
    abs %>%
    max
maxY = quantile(log10(deResults$p_value), c(0.005, 0.99), na.rm = TRUE) %>%
    abs %>%
    max

hitCounts = filter(deResults, is_hit) %>%
    count(condition_1, condition_2, c1_overex) %>%
    rename(hits = n) %>%
463
    merge(data.frame(c1_overex = c(F, T), x_pos = c(- maxX * 0.9, maxX * 0.9)))
464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483



#+ fig.width=16, fig.height=14
deResults %>% ggplot(aes(logfc, - log10(p_value), color = is_hit)) +
    geom_jitter(alpha = 0.2, position = position_jitter(height = 0.2)) +
#    theme_bw() +
    xlim(- 3, 3) +
    scale_color_manual(values = c("TRUE" = "red", "FALSE" = "black")) +
#    ggtitle("Insm1/Ctrl") +
## http://stackoverflow.com/questions/19764968/remove-point-transparency-in-ggplot2-legend
    guides(colour = guide_legend(override.aes = list(alpha = 1))) +
## tweak axis labels
    xlab(expression(log[2]("condition_1/condition_2"))) +
    ylab(expression(- log[10]("p value"))) +
    xlim(- maxX, maxX) +
    ylim(0, maxY) +
#  ylim(0,50) +

## add hit couts
484
    geom_text(aes(label = hits, x = x_pos), y = maxY * 0.9, color = "red", size = 10, data = hitCounts) +
485 486 487 488
    facet_grid(condition_1 ~ condition_2)

#' Redo MA plots but now including hit overlays

489 490 491 492 493 494
meansWide = voomNorm$E %>%
    as_df %>%
    rownames_to_column("gene_id") %>%
    tbl_df %>%
    gather(replicate, norm_count, - gene_id) %>%
    left_join(expDesign) %>%
495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538
    group_by(gene_id, condition) %>%
    summarize(mean_expr = mean(norm_count, na.rm = T)) %>%
    spread(condition, mean_expr)

maPlots = deResults %>%
    group_by(condition_1, condition_2) %>%
    do(gg = {
        #browser()
        maData <- .
        # maData <- deResults %>% group_by(condition_1, condition_2) %>% first_group()

        calc_mean_product = lazyeval::interp(~ a * b, a = as.name(maData$condition_1[1]), b = as.name(maData$condition_2[1]))
        meanProducts = meansWide %>% transmute_(.dots = setNames(list(calc_mean_product), "mean_prod"))

        maData %>%
            left_join(meanProducts) %>%
        #    ggplot(aes(0.5*log2(norm_count_1*norm_count_2), logfc, color=is_hit)) +
            ggplot(aes(0.5 * log2(mean_prod), logfc, color = is_hit)) +
            geom_point(alpha = 0.3) +
            geom_hline(yintercept = 0, color = "red") +
            ggtitle(with(maData[1,], paste(condition_1, "vs", condition_2)))
    }) %$% gg

#+ fig.height=6*ceiling(length(maPlots)/2)
load_pack(grid)

multiplot(plotlist = maPlots, cols = min(2, length(maPlots)))

########################################################################################################################
#' ## Export results

write_tsv(deAnnot, path = add_prefix("de_results.txt"))

degs %>% write_tsv(add_prefix("diffex_genes.txt"))

degs %>%
    transmute(gene_id, contrast = paste(condition_1, "vs", condition_2)) %>%
    write_tsv(add_prefix("degs_by_contrast.txt"))

degs %>%
    transmute(gene_id, contrast = paste(condition_1, if_else(c1_overex, ">", "<"), condition_2)) %>%
    write_tsv(add_prefix("degs_by_contrast_directed.txt"))

#' Export voom normalization scores per replicate
539 540 541
voomNorm$E %>%
    as_df %>%
    rownames_to_column("gene_id") %>%
542
    left_join(geneDescs) %>%
543
# push_left(c("gene_name", "gene_description")) %>%
544 545 546 547
    write_tsv(add_prefix("norm_exp_by_replicate.txt"))


# Also average voom normalized expression scores per condition and export
548 549 550 551
voomNorm$E %>%
    as_df %>%
    rownames_to_column("gene_id") %>%
    gather(replicate, norm_expr, - gene_id) %>%
552 553 554 555
    inner_join(expDesign, by = "replicate") %>%
    group_by(condition, gene_id) %>%
    summarize(mean_norm_expr = mean(norm_expr)) %>%
    left_join(geneDescs) %>%
556
# push_left(c("gene_name", "gene_description")) %>%
557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572
    write_tsv(add_prefix("norm_exp_by_condition.txt"))


#' | File | Description |
#' |------|------|
#' | [diffex_genes.txt](diffex_genes.txt) | list of all differentially expressed genes from the DESeq analysis - That's the file you are most likely looking for! |
#' | [de_results.txt](de_results.txt) | list of all genes from the DESeq analysis |
#' | [norm_exp_by_condition.txt](norm_exp_by_condition.txt) | tpm values of the different conditions |
#' | [norm_exp_by_replicate.txt](norm_exp_by_replicate.txt) | tpm values of individual replicates |
#' | [geneInfo.txt](geneInfo.txt) | general gene information  |
#'


writeLines(capture.output(devtools::session_info()), ".sessionInfo.txt")


573 574
session::save.session(".dge_limma.R.dat");
## session::restore.session(".dge_limma.R.dat")