Commit 42922181 authored by domingue's avatar domingue
Browse files

New report for gene set enrichment analysis

Closes #92
parent 1dbc8b06
#' # Gene set enrichment analysis with fsgea
#'
#'
#' Created by: `r system("whoami", intern=T)`
#'
#' Created at: `r format(Sys.Date(), format="%B %d %Y")`
# ==========================================================================
# Arguments
# ==========================================================================
library("argparser")
# Create a parser
p <- arg_parser("Performs Gene Set Enrichment Analysis (GSEA). It will use the 50 hallmark gene sets from the Molecular Signatures Database (MSigDB)")
# Add command line arguments
p <- add_argument(
p,
"deResults",
help = "Table of results created with featcounts_deseq_mf.R",
type = "character"
)
p <- add_argument(
p,
"--category",
help = "collection, such as H (hallmark), C1 (positional gene sets), C2 (curated gene sets), C3 (regulatory target gene sets), C4 (computational gene sets), C5 (GO gene sets), C6 (oncogenic signatures), C7 ( immunologic signatures). More information in https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp",
type = "character",
default = "H"
)
p <- add_argument(
p,
"--species",
help = "Which species? In the format Mus musculus, Homo sapiens...",
type = "character",
default = "Mus musculus"
)
p <- add_argument(
p,
"--extra_sets",
help = "Gene sets to be analysed in addition to (or instead) those defined in --category. A single column file, one row per set, using the standard name. Example: KEGG_WNT_SIGNALING_PATHWAY. See also https://www.gsea-msigdb.org/gsea/msigdb/cards/KEGG_WNT_SIGNALING_PATHWAY",
type = "character",
default = NA
)
p <- add_argument(
p,
"--qcutoff",
help = "Adjusted p-value threshold to filter significant pathways.",
type = "numeric",
default = 0.01
)
p <- add_argument(
p,
"--out",
help = "Name to prefix all generated result files.",
type = "character"
)
# Parse the command line arguments
argv <- parse_args(p, argv = commandArgs(trailingOnly = TRUE))
de_file <- argv$deResults
category <- argv$category
species <- argv$species
extra_sets <- argv$extra_sets
qcutoff <- argv$qcutoff
out <- argv$out
dir.create("figure", showWarnings = FALSE)
### test arguments
# de_file <- "../de_results.txt"
# category <- "H"
# extra_sets <- "KEGG_APOPTOSIS"
# extra_sets <- "extra_gene_sets.txt"
# qcutoff <- 0.05
# out <- "test"
# ==========================================================================
# Libraries and custom functions
# =========================================================================
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.45/R/core_commons.R")
library("clusterProfiler")
library("msigdbr")
library("fgsea")
library("data.table")
library("org.Mm.eg.db")
set.seed(42)
## not present in my version of the package
## taken from: https://rdrr.io/github/ctlab/fgsea/src/R/pathways.R
mapIdsList <- function(x, keys, column, keytype, ...) {
keysFlat <- unlist(keys)
keysUnique <- unique(keysFlat)
ansUnique <- AnnotationDbi::mapIds(x = x, keys = keysUnique, column = column, keytype = keytype, ...)
ansFlat <- ansUnique[keysFlat]
ans <- split(ansFlat, rep(seq_along(keys), sapply(keys, length)))
ans
}
# ==========================================================================
#' ## Read data and prepare variables
# ==========================================================================
#' Run configuration was:
vec_as_df(unlist(argv)) %>%
filter(!name %in% c("opts", "help", "")) %>%
kable()
resultsBase <- if (!is.na(out)) {
paste0(out, ".")
} else {
""
}
#
#' ## DGE results
# --------------------------------------------------------------------------
#' For GSEA all genes used as input for DGE analysis are used. However a few steps need to be done to prepare data for GSEA:
#'
#' - genes are ranked by their adjusted values and logFC signal, that is, genes with the lowest p-value are put at the beginning or the end of the list if they down- or up-regulated respectively
#' - ensemble gene IDs have to be converted to ENTREZ gene IDs
#' - the step above might generate duplicate entries which are removed
gene_df <- read_tsv(de_file) %>%
transmute(
ensembl_gene_id,
contrast = paste(condition_1, "vs", condition_2),
logfc = c1_over_c2_logfc,
padj,
rank_score = -log10(padj)
) %>%
mutate(rank_score = ifelse(logfc < 0, rank_score * -1, rank_score))
eg <- bitr(gene_df$ensembl_gene_id, fromType = "ENSEMBL", toType = "ENTREZID", OrgDb = "org.Mm.eg.db") %>%
distinct_all(ENTREZID) %>%
distinct_all(ENSEMBL)
gene_df <- gene_df %>%
left_join(eg, by = c("ensembl_gene_id" = "ENSEMBL"))
#
# create ranks based on fold-change
# --------------------------------------------------------------------------
gseaDat <- gene_df %>%
filter(!is.na(ENTREZID)) %>%
arrange(logfc)
ranks <- gseaDat$rank_score
names(ranks) <- gseaDat$ENTREZID
# length(ranks)
# length(unique(ranks))
## debug
# gseaDat %>% filter(duplicated(logfc))
# gseaDat %>% filter(duplicated(ensembl_gene_id))
# gseaDat %>% filter(ENTREZID == "320292")
# gseaDat %>% filter(ensembl_gene_id == "ENSMUSG00000056629")
#' After data preparation `r length(ranks)` are used for analysis, from the original `r length(unique(gene_df$ensembl_gene_id))`.
# ==========================================================================
#' ## GSEA
# ==========================================================================
#' The goal of GSEA is to determine whether members of a gene set S tend to occur toward the top (or bottom) of the list L, in which case the gene set is correlated with the phenotypic class distinction.
#'
#' For this analysis we retrieved the hallmark gene sets from MSigDB for mouse. The MSigDB version used was `r packageVersion("msigdbr")`.
#'
#' The Molecular Signatures Database (MSigDB) is a collection of gene sets originally created for use with the Gene Set Enrichment Analysis (GSEA) software.
#'
#' Gene homologs are provided by HUGO Gene Nomenclature Committee at the European Bioinformatics Institute which integrates the orthology assertions predicted for human genes by eggNOG, Ensembl Compara, HGNC, HomoloGene, Inparanoid, NCBI Gene Orthology, OMA, OrthoDB, OrthoMCL, Panther, PhylomeDB, TreeFam and ZFIN. For each human equivalent within each species, only the ortholog supported by the largest number of databases is used.
## Retrieve mouse genes for just the hallmark collection gene sets.
m_df <- msigdbr(species = species, category = category)
if(!is.na(extra_sets)) {
extra_pathways <- fread(extra_sets)[[1]]
extra_df <- msigdbr(species = species) %>%
filter(gs_name %in% extra_pathways)
m_df <- m_df %>%
bind_rows(extra_df)
}
head(m_df)
## actual GSEA command
m_list <- m_df %>% split(x = .$entrez_gene, f = .$gs_name)
fgseaRes <- fgsea(m_list, ranks, minSize = 15, maxSize = 500, nperm = 10000)
# ==========================================================================
# Plots
# ==========================================================================
#' For a gene set to be considered enriched we used a p-adjusted value cutoff of `r qcutoff`. The plots are for those sets that passed the cut-off.
enriched_pathways <- fgseaRes[padj < qcutoff][["pathway"]]
clean_ranks <- ranks[!is.na(ranks)] # needed for plotting, NA cause error
for (pathway in enriched_pathways) {
p1 <- plotEnrichment(m_list[[pathway]], clean_ranks) +
labs(title = pathway)
print(p1)
ggsave(paste0("figure/", resultsBase, pathway, ".enrichment_plot.pdf"))
}
#
# tables of results
# --------------------------------------------------------------------------
#' Since the genes were ranked according to their differential expression significance and fold change, with the most significantly down-regulated genes at the top and up-regulated genes at the bottom of the list, the enriched gene sets provides us with some idea of the function of these genes. We can vislualize that in the table:
#'
dev.off()
topPathwaysUp <- fgseaRes[ES > 0 & padj < qcutoff][order(pval), pathway]
topPathwaysDown <- fgseaRes[ES < 0 & padj < qcutoff][order(pval), pathway]
topPathways <- c(topPathwaysUp, rev(topPathwaysDown))
p_table <- plotGseaTable(
m_list[topPathways],
clean_ranks,
fgseaRes,
gseaParam = 0.5,
render = FALSE
)
#+ fig.width=7, fig.height=8
grid.draw(p_table)
#' Often it is useful to extract the core members of high scoring gene sets that contribute to the enrichment score. As you may have noticed from the previous section, one of the results column was named “leadingEdge”; this contains the genes that contributed to the enrichment score.
## save table
setDT(fgseaRes)
fgseaRes[, leadingEdgeGeneSymbol := mapIdsList(
x = org.Mm.eg.db,
keys = leadingEdge,
keytype = "ENTREZID",
column = "SYMBOL"
)]
fgseaRes[, leadingEdgeEnsemblID := mapIdsList(
x = org.Mm.eg.db,
keys = leadingEdge,
keytype = "ENTREZID",
column = "ENSEMBL"
)]
fwrite(fgseaRes, file = paste0(resultsBase, "fgseaRes.txt"), sep = "\t", sep2 = c("", " ", ""))
fgseaRes %>%
.[,c("pathway", "pval", "padj", "ES", "NES", "size", "leadingEdgeGeneSymbol")] %>%
datatable(escape = F)
# ==========================================================================
# References
# ==========================================================================
#' - https://bioinformatics-core-shared-training.github.io/cruk-summer-school-2018/RNASeq2018/html/06_Gene_set_testing.nb.html#gsea-analysis
#' - https://cran.r-project.org/web/packages/msigdbr/vignettes/msigdbr-intro.html
#' - https://davetang.org/muse/2018/01/10/using-fast-preranked-gene-set-enrichment-analysis-fgsea-package/
#' - https://www.biorxiv.org/content/10.1101/060012v2
# ==========================================================================
# Session info
# ==========================================================================
session::save.session(".gsea.dat")
# session::restore.session(".gsea.dat")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment