Commit 4a0a50b2 authored by Holger Brandl's avatar Holger Brandl

added support for custom designs to limma workflow

parent 6c28e105
#' # Differential Analysis Using Limma
#!/usr/bin/env Rscript
#' This workflow is based on https://www.bioconductor.org/help/workflows/RNAseq123/
#' # Differential Analysis Using Limma
results_prefix = "limma_diffex"
#' This workflow is based on https://www.bioconductor.org/help/workflows/RNAseq123/. Another nice example to lean about limma basics is https://www.bioconductor.org/help/course-materials/2005/BioC2005/labs/lab01/estrogen/
#+ include=FALSE
suppressMessages(require(docopt))
doc = '
Perform a differential gene expression analysis using limma and edgeR
Usage: featcounts_deseq_mf.R [options] <count_matrix> <design_matrix>
Usage: dge_limma.R [options] <count_matrix> <design_matrix>
Options:
--contrasts=<tab_delim_table> Table with sample pairs for which dge analysis should be performed
--design <formula> Design fomula for DeSeq with contrast attribute at the end [default: condition]
--qcutoff <qcutoff> Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01]
--pcutoff <pcutoff> Override q-value filter and filter by p-value instead
--out <name_prefix> Name to prefix all generated result files [default: ]
......@@ -41,6 +42,14 @@ contrasts_file = opts$contrasts
gene_info_file = opts$gene_info
assert(is.null(gene_info_file) || file.exists(gene_info_file), "invalid gene_info_file")
## extract the sample for the design
designFormula = opts$design
## consider last element of design formula as sample attribute
# contrastAttribute = str_split(designFormula, "[+]") %>%
# unlist() %>%
# tail(1)
assert(str_detect(designFormula, ".*condition$"))
results_prefix = if (str_length(opts$out) > 0) paste0(opts$out, ".") else "" # used by add_prefix
pcutoff = if (is.null(opts$pcutoff))NULL else as.numeric(opts$pcutoff)
......@@ -113,7 +122,9 @@ expMatrix = countData %>% column_to_rownames("gene_id") %>% as.matrix
expMatrix = expMatrix[complete.cases(expMatrix),]
group_labels = extract_col(colnames(expMatrix))
group_labels = data_frame(replicate = colnames(expMatrix)) %>% left_join(expDesign) %>% pull(condition)
names(group_labels) = colnames(expMatrix)
makePcaPlot(t(expMatrix), color_by = group_labels, title = "PCA of quantifiable proteins in all conditions")
......@@ -156,6 +167,9 @@ repDist %>% d3heatmap(xaxis_height = 1)
load_pack(limma)
load_pack(edgeR)
#' The used design formula is:
designFormula
orderMatcheExpDesign = data_frame(replicate = colnames(expMatrix)) %>%
mutate(col_index = row_number()) %>%
right_join(expDesign, by = "replicate") %>%
......@@ -163,7 +177,12 @@ orderMatcheExpDesign = data_frame(replicate = colnames(expMatrix)) %>%
## build design matrix
#A key strength of limma’s linear modelling approach, is the ability accommodate arbitrary experimental complexity. Simple designs, such as the one in this workflow, with cell type and batch, through to more complicated factorial designs and models with interaction terms can be handled relatively easily
design <- orderMatcheExpDesign %$% model.matrix(~ 0 + batch + condition)
#designFormula = "condition_2ndwt + batch"
# formula(as.formula(paste("~", designFormula))
# design <- orderMatcheExpDesign %$% model.matrix(~ 0 + batch + condition)
# design <- orderMatcheExpDesign %$% model.matrix(~ 0 + condition)
design <- orderMatcheExpDesign %$% model.matrix(formula(as.formula(paste("~ 0 + ", designFormula))))
rownames(design) <- orderMatcheExpDesign$replicate
#design <- model.matrix(~0+group+lane)
......@@ -177,7 +196,7 @@ exp_study = DGEList(counts = expMatrix, group = orderMatcheExpDesign$condition)
# par(mfrow=c(1,2)) ## 2panel plot for mean-var relationship before and after boom
## Removing heteroscedasticity from count data
#' Removing heteroscedasticity from count data
voomNorm <- voom(exp_study, design, plot = TRUE)
str(voomNorm)
# exp_study$counts is equilvaent to voomNorm$E see https://www.bioconductor.org/help/workflows/RNAseq123/
......@@ -195,7 +214,7 @@ str(voomNorm)
contr.matrix = makeContrasts(contrasts = with(contrasts, paste0("condition", condition_1, "-", "condition", condition_2)), levels = colnames(design))
contr.matrix
#' ## Model Fitting & Moderated t-test
#' Linear modelling in limma is carried out using the lmFit and contrasts.fit functions originally written for application to microarrays. The functions can be used for both microarray and RNA-seq data and fit a separate model to the expression values for each gene. Next, empirical Bayes moderation is carried out by borrowing information across all the genes to obtain more precise estimates of gene-wise variability (source: RNAseq123)
vfit <- lmFit(voomNorm, design)
......@@ -221,7 +240,8 @@ summary(dt) %>% as_df %>% kable
#' plot MA per contrast
numContrasts = length(colnames(tfit))
par(mfrow=c(1,numContrasts)) ## 2panel plot for mean-var relationship before and after boom
colnames(tfit) %>% iwalk(~ plotMD(tfit, column=.y, status=dt[,.y], main=.x, xlim=c(-8,13)))
# colnames(tfit) %>% iwalk(~ plotMD(tfit, column=.y, status=dt[,.y], main=.x, xlim=c(-8,13)))
colnames(tfit) %>% iwalk(~ plotMD(tfit, column=.y, status=dt[,.y], main=.x))
# load_pack("Glimma")
......@@ -280,10 +300,10 @@ deResults %>% ggplot(aes(adj_p_val)) + geom_histogram()
#+ results='asis'
if (! is.null(qcutoff)) {
echo("Using q-value cutoff of", qcutoff)
deResults %<>% transform(is_hit = adj_p_val < 0.05)
deResults %<>% transform(is_hit = adj_p_val <= qcutoff)
}else {
echo("Using p-value cutoff of", pcutoff)
deResults %<>% transform(is_hit = adj_p_val < 0.05)
deResults %<>% transform(is_hit = p_value <= pcutoff)
}
#+
......@@ -308,20 +328,22 @@ if (is.null(gene_info_file)) {
mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = ensembl_dataset, host = "aug2017.archive.ensembl.org", path = "/biomart/martservice", archive = FALSE)
c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>%
c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>%
biomaRt::getBM(mart = mart) %>%
tbl_df %>% rename(gene_id=ensembl_gene_id)
}) %>% cache_it("geneInfo")
# geneLengths = transmute(geneInfo, gene_id, gene_length = end_position - start_position)
geneDescs = transmute(geneInfo, gene_id, gene_name = external_gene_name, gene_description = description)
} else {
geneInfo = read_tsv(gene_info_file)
colnames(geneInfo)[1] = "gene_id"
geneDescs = geneInfo
}
write_tsv(geneInfo, path = add_prefix("geneInfo.txt"))
# geneLengths = transmute(geneInfo, gene_id, gene_length = end_position - start_position)
geneDescs = transmute(geneInfo, gene_id, gene_name = external_gene_name, gene_description = description)
deAnnot = deResults %>% left_join(geneInfo)
......@@ -348,7 +370,7 @@ deResults %>% ggplot(aes(logfc)) +
facet_grid(condition_1 ~ condition_2) +
geom_vline(xintercept = 0, color = "blue") +
# xlim(-2,2) +
ggtitle("sample1 over sampl2 logFC ")
ggtitle("condition_1 over condition_2 logFC ")
## Extract hits from deseq results
......@@ -489,5 +511,5 @@ voomNorm$E %>% as_df %>% rownames_to_column("gene_id") %>%
writeLines(capture.output(devtools::session_info()), ".sessionInfo.txt")
session::save.session(".dpa_limma.R.dat");
## session::restore.session(".dpa_limma.R.dat")
session::save.session(".dge_limma.R.dat");
## session::restore.session(".dge_limma.R.dat")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment