Commit 11af63fd authored by domingue's avatar domingue
Browse files

Produce results by contrast

- loops over contrasts
- added test arguments for set size
- saves fgsea graphical table and sources it to markdown
- results table contains contrasts

Closes #96
parent 0146ff8b
......@@ -107,6 +107,8 @@ dir.create("figure", showWarnings = FALSE)
# gmt_file <- "example.gmt"
# species <- "Mus musculus"
# qcutoff <- 0.05
# minSize <- 15
# maxSize <- 500
# out <- "test"
# ==========================================================================
......@@ -130,6 +132,22 @@ mapIdsList <- function(x, keys, column, keytype, ...) {
ans <- split(ansFlat, rep(seq_along(keys), sapply(keys, length)))
ans
}
clean_names <- function(names, sep=".", replace.all=TRUE, is.valid=FALSE){
clean_name <- base::make.names(names)
if (replace.all == TRUE){
clean_name <- gsub("\\.{1,}", sep, clean_name)
}
if (hasArg(sep) & sep != "."){
clean_name <- gsub("\\.", sep, clean_name)
}
return(clean_name)
}
## The version in conda doesn't have an option need for plotting, so I sourcing it directly
## If we move to package version >= 1.15 this can be removed
library("gridExtra")
......@@ -195,10 +213,19 @@ gene_df <- gene_df %>%
# --------------------------------------------------------------------------
gsea_dat <- gene_df %>%
filter(!is.na(ENTREZID)) %>%
arrange(logfc)
arrange(contrast, logfc)
df_to_named_vec <- function(df, values_col, names_col){
vec <- df[[values_col]]
names(vec) <- df[[names_col]]
return(vec)
}
ranks_l <- gsea_dat %>%
setDT() %>%
split(by = "contrast") %>%
map(df_to_named_vec, values_col = "logfc", names_col = "ENTREZID")
ranks <- gsea_dat$rank_score
names(ranks) <- gsea_dat$ENTREZID
# length(ranks)
# length(unique(ranks))
......@@ -208,7 +235,6 @@ names(ranks) <- gsea_dat$ENTREZID
# gsea_dat %>% filter(ENTREZID == "320292")
# gsea_dat %>% filter(ensembl_gene_id == "ENSMUSG00000056629")
#' After data preparation `r length(ranks)` are used for analysis, from the original `r length(unique(gene_df$ensembl_gene_id))`.
# ==========================================================================
#' ## GSEA
......@@ -259,79 +285,111 @@ if (!is.na(gmt_file)) {
## actual GSEA command
fgseaRes <- fgsea(m_list, ranks, minSize = minSize, maxSize = maxSize, nperm = 10000)
fgseaRes_l <- ranks_l %>%
map(~fgsea(m_list, ., minSize = minSize, maxSize = maxSize, nperm = 10000))
# fgseaRes <- fgsea(m_list, ranks, minSize = minSize, maxSize = maxSize, nperm = 10000)
# ==========================================================================
# Plots
# ==========================================================================
#' For a gene set to be considered enriched we used a p-adjusted value cutoff of `r qcutoff`. The plots are for those sets that passed the cut-off.
enriched_pathways <- fgseaRes[padj < qcutoff][["pathway"]]
clean_ranks <- ranks[!is.na(ranks)] # needed for plotting, NA cause error
## if too many pathways are enriched, only top 20 should be plotted in the report,otherwise it's a mess.
if (length(enriched_pathways) > 20) {
msg <- paste0(length(enriched_pathways), " were found significantly enriched for a p-adjust value of ", qcutoff, ".In this report only the top 20 will be show for visualization reasons. Enrichment plots for all enriched pathways can be found in the folder figures/")
top_pathways <- fgseaRes[padj < qcutoff] %>%
arrange(padj) %>%
head(20) %>%
setDT()
} else {
top_pathways <- fgseaRes[padj < qcutoff]
msg <- paste0(length(enriched_pathways), " were found significantly enriched for a p-adjust value of ", qcutoff, ".")
}
#' `r msg`
#'
#' We can have an overview of the pathway enrichment by plotting the normalized enrichment score (NES) and p-adjusted values for each of the top enriched pathways. A positive NES indicates gene set enrichment at the top of the ranked list; a negative NES indicates gene set enrichment at the bottom of the ranked list.
#' Reference: https://www.gsea-msigdb.org/gsea/doc/GSEAUserGuideFrame.html
#'
#+ results='asis'
top_pathways_l <- list()
for (contrast in names(fgseaRes_l)){
cat("\n\n### Comparison", contrast, "\n")
fgseaRes <- fgseaRes_l[[contrast]]
ranks <- ranks_l[[contrast]]
enriched_pathways <- fgseaRes[padj < qcutoff][["pathway"]]
clean_ranks <- ranks[!is.na(ranks)] # needed for plotting, NA cause error
cat("\nAfter data preparation ", length(ranks), "are used for analysis, from the original", length(unique(gene_df$ensembl_gene_id)), "\n")
## if too many pathways are enriched, only top 20 should be plotted in the report,otherwise it's a mess.
if (length(enriched_pathways) > 20) {
msg <- paste0(length(enriched_pathways), " pathways found significantly enriched for a p-adjust value of ", qcutoff, ".In this report only the top 20 will be show for visualization reasons. Enrichment plots for all enriched pathways can be found in the folder figures/")
top_pathways <- fgseaRes[padj < qcutoff] %>%
arrange(padj) %>%
head(20) %>%
setDT()
} else {
top_pathways <- fgseaRes[padj < qcutoff]
msg <- paste0(length(enriched_pathways), " were found significantly enriched for a p-adjust value of ", qcutoff, ".")
}
top_pathways %>%
mutate(pathway = fct_reorder(pathway, NES)) %>%
arrange(-padj) %>%
ggplot(aes(x = pathway, y = NES, fill = padj)) +
geom_col() +
labs(
x = "Enrichment score",
y = ""
) +
coord_flip() +
theme_light()
for (pathway in top_pathways[["pathway"]]) {
p1 <- plotEnrichment(m_list[[pathway]], clean_ranks) +
labs(title = pathway)
print(p1)
ggsave(paste0("figure/", resultsBase, pathway, ".enrichment_plot.pdf"))
}
top_pathways_l[[contrast]] <- top_pathways
cat("\n", msg, "\n")
top_pathways %>%
mutate(pathway = fct_reorder(pathway, NES)) %>%
arrange(-padj) %>%
ggplot(aes(x = pathway, y = NES, fill = padj)) +
geom_col() +
labs(
x = "Enrichment score",
y = ""
) +
coord_flip() +
theme_light()
for (pathway in top_pathways[["pathway"]]) {
p1 <- plotEnrichment(m_list[[pathway]], clean_ranks) +
labs(title = pathway)
print(p1)
ggsave(paste0("figure/", resultsBase, pathway, ".", clean_names(contrast, sep = "_"), ".enrichment_plot.pdf"))
}
#
# tables of results
# --------------------------------------------------------------------------
#' Since the genes were ranked according to their differential expression significance and fold change, with the most significantly down-regulated genes at the top and up-regulated genes at the bottom of the list, the enriched gene sets provides us with some idea of the function of these genes. We can visualize that in the table:
#'
dev.off()
topPathwaysUp <- top_pathways[ES > 0 & padj < qcutoff][order(pval), pathway]
topPathwaysDown <- top_pathways[ES < 0 & padj < qcutoff][order(pval), pathway]
topPathways <- c(topPathwaysUp, rev(topPathwaysDown))
dev.off()
topPathwaysUp <- top_pathways[ES > 0 & padj < qcutoff][order(pval), pathway]
topPathwaysDown <- top_pathways[ES < 0 & padj < qcutoff][order(pval), pathway]
topPathways <- c(topPathwaysUp, rev(topPathwaysDown))
p_table <- plotGseaTable(
p_table <- plotGseaTable(
m_list[topPathways],
clean_ranks,
fgseaRes,
gseaParam = 0.5,
render = FALSE
)
#+ fig.width=9, fig.height=8
grid.draw(p_table)
)
fig_path <- paste0("figure/", resultsBase, pathway, ".", clean_names(contrast, sep = "_"), ".table.png")
png(fig_path, width = 1500, height = 1500)
print(grid.draw(p_table))
cat("\n![](", fig_path, ")\n")
dev.off()
}
#
# tables of results
# --------------------------------------------------------------------------
#' Since the genes were ranked according to their differential expression significance and fold change, with the most significantly down-regulated genes at the top and up-regulated genes at the bottom of the list, the enriched gene sets provides us with some idea of the function of these genes. We can visualize that in the table:
#'
# dev.off()
# topPathwaysUp <- top_pathways[ES > 0 & padj < qcutoff][order(pval), pathway]
# topPathwaysDown <- top_pathways[ES < 0 & padj < qcutoff][order(pval), pathway]
# topPathways <- c(topPathwaysUp, rev(topPathwaysDown))
# p_table <- plotGseaTable(
# m_list[topPathways],
# clean_ranks,
# fgseaRes,
# gseaParam = 0.5,
# render = FALSE
# )
# #+ fig.width=9, fig.height=8
# grid.draw(p_table)
#' Often it is useful to extract the core members of high scoring gene sets that contribute to the enrichment score. As you may have noticed from the previous section, one of the results column was named “leadingEdge”; this contains the genes that contributed to the enrichment score.
## save table
fgseaRes <- bind_rows(fgseaRes_l, .id = "contrast")
setDT(fgseaRes)
fgseaRes[, leadingEdgeGeneSymbol := mapIdsList(
x = eval(parse(text = species_db)),
......@@ -348,7 +406,7 @@ fgseaRes[, leadingEdgeEnsemblID := mapIdsList(
fwrite(fgseaRes, file = paste0(resultsBase, "fgseaRes.txt"), sep = "\t", sep2 = c("", " ", ""))
fgseaRes %>%
.[, c("pathway", "pval", "padj", "ES", "NES", "size", "leadingEdgeGeneSymbol")] %>%
.[, c("contrast", "pathway", "pval", "padj", "ES", "NES", "size", "leadingEdgeGeneSymbol")] %>%
datatable(escape = F)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment