Commit c93f3790 authored by domingue's avatar domingue
Browse files

Major fixes

- added several test to confirm list selection
- it is now possible to use only custom gene list. Before they were always added to an Hallmark gene list.
- changed variable name to fit coding style
- added new plot for an overview of enriched sets

Closes #92
parent 42922181
......@@ -24,7 +24,7 @@ p <- add_argument(
p <- add_argument(
p,
"--category",
help = "collection, such as H (hallmark), C1 (positional gene sets), C2 (curated gene sets), C3 (regulatory target gene sets), C4 (computational gene sets), C5 (GO gene sets), C6 (oncogenic signatures), C7 ( immunologic signatures). More information in https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp",
help = "collection, such as H (hallmark), C1 (positional gene sets), C2 (curated gene sets), C3 (regulatory target gene sets), C4 (computational gene sets), C5 (GO gene sets), C6 (oncogenic signatures), C7 ( immunologic signatures). Set to _None_ if only specific gene sets from --extra_sets are going to be used. More information in https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp",
type = "character",
default = "H"
)
......@@ -69,11 +69,15 @@ extra_sets <- argv$extra_sets
qcutoff <- argv$qcutoff
out <- argv$out
dir.create("figure", showWarnings = FALSE)
### test arguments
# de_file <- "../de_results.txt"
# de_file <- file.path(Sys.getenv("PRJ_DATA"),"/data/alexaki/de_results.txt")
# category <- "None"
# category <- "H"
# extra_sets <- "KEGG_APOPTOSIS"
# extra_sets <- "extra_gene_sets.txt"
# extra_sets <- "extra_sets.txt"
# species <- "Mus musculus"
# qcutoff <- 0.05
# out <- "test"
......@@ -98,10 +102,26 @@ mapIdsList <- function(x, keys, column, keytype, ...) {
ans <- split(ansFlat, rep(seq_along(keys), sapply(keys, length)))
ans
}
## The version in conda doesn't have an option need for plotting, so I sourcing it directly
## If we move to package version >= 1.15 this can be removed
library("gridExtra")
devtools::source_url("https://raw.githubusercontent.com/ctlab/fgsea/33dd9f3c60b3cee0200be7a1c8b9e68446068a9c/R/plot.R")
category_exists <- function(category, species){
all_sig <- msigdbr(species = species)
if (!category %in% c(all_sig$gs_name, all_sig$gs_cat, all_sig$gs_subcat)){
FALSE
}
TRUE
}
# ==========================================================================
#' ## Read data and prepare variables
# ==========================================================================
if (category == "None" & is.na(extra_sets)) {
stop("One of --category or --extra_sets must be used.")
}
#' Run configuration was:
vec_as_df(unlist(argv)) %>%
filter(!name %in% c("opts", "help", "")) %>%
......@@ -142,20 +162,20 @@ gene_df <- gene_df %>%
#
# create ranks based on fold-change
# --------------------------------------------------------------------------
gseaDat <- gene_df %>%
gsea_dat <- gene_df %>%
filter(!is.na(ENTREZID)) %>%
arrange(logfc)
ranks <- gseaDat$rank_score
names(ranks) <- gseaDat$ENTREZID
ranks <- gsea_dat$rank_score
names(ranks) <- gsea_dat$ENTREZID
# length(ranks)
# length(unique(ranks))
## debug
# gseaDat %>% filter(duplicated(logfc))
# gseaDat %>% filter(duplicated(ensembl_gene_id))
# gseaDat %>% filter(ENTREZID == "320292")
# gseaDat %>% filter(ensembl_gene_id == "ENSMUSG00000056629")
# gsea_dat %>% filter(duplicated(logfc))
# gsea_dat %>% filter(duplicated(ensembl_gene_id))
# gsea_dat %>% filter(ENTREZID == "320292")
# gsea_dat %>% filter(ensembl_gene_id == "ENSMUSG00000056629")
#' After data preparation `r length(ranks)` are used for analysis, from the original `r length(unique(gene_df$ensembl_gene_id))`.
......@@ -170,16 +190,27 @@ names(ranks) <- gseaDat$ENTREZID
#'
#' Gene homologs are provided by HUGO Gene Nomenclature Committee at the European Bioinformatics Institute which integrates the orthology assertions predicted for human genes by eggNOG, Ensembl Compara, HGNC, HomoloGene, Inparanoid, NCBI Gene Orthology, OMA, OrthoDB, OrthoMCL, Panther, PhylomeDB, TreeFam and ZFIN. For each human equivalent within each species, only the ortholog supported by the largest number of databases is used.
## Retrieve mouse genes for just the hallmark collection gene sets.
m_df <- msigdbr(species = species, category = category)
## Build the df with all the gene sets to be tested
## Make sure category exists.
if (category != "None" & category_exists(category, species)) {
m_df <- msigdbr(species = species, category = category)
}
if(!is.na(extra_sets)) {
if (!is.na(extra_sets)) {
extra_pathways <- fread(extra_sets)[[1]]
extra_df <- msigdbr(species = species) %>%
filter(gs_name %in% extra_pathways)
filter(gs_name %in% extra_pathways)
if(nrow(extra_df) == 0){
stop("None of the extra genes sets exists for the selected species.")
}
if(exists("m_df")){
m_df <- m_df %>%
bind_rows(extra_df)
} else {
m_df <- extra_df
}
}
head(m_df)
......@@ -195,6 +226,21 @@ fgseaRes <- fgsea(m_list, ranks, minSize = 15, maxSize = 500, nperm = 10000)
enriched_pathways <- fgseaRes[padj < qcutoff][["pathway"]]
clean_ranks <- ranks[!is.na(ranks)] # needed for plotting, NA cause error
#' We have have an overview of the pathway enrichment by plotting the normalized enrichment score (NES) and p-adjusted values for each of the enriched pathways. A positive NES indicates gene set enrichment at the top of the ranked list; a negative NES indicates gene set enrichment at the bottom of the ranked list.
#' Reference: https://www.gsea-msigdb.org/gsea/doc/GSEAUserGuideFrame.html
fgseaRes[padj < qcutoff] %>%
mutate(pathway = fct_reorder(pathway, NES)) %>%
arrange(-padj) %>%
ggplot(aes(x = pathway, y = NES, fill = padj)) +
geom_col() +
labs(
x = "Enrichment score",
y = ""
) +
coord_flip() +
theme_light()
for (pathway in enriched_pathways) {
p1 <- plotEnrichment(m_list[[pathway]], clean_ranks) +
labs(title = pathway)
......@@ -243,7 +289,7 @@ fgseaRes[, leadingEdgeEnsemblID := mapIdsList(
fwrite(fgseaRes, file = paste0(resultsBase, "fgseaRes.txt"), sep = "\t", sep2 = c("", " ", ""))
fgseaRes %>%
.[,c("pathway", "pval", "padj", "ES", "NES", "size", "leadingEdgeGeneSymbol")] %>%
.[, c("pathway", "pval", "padj", "ES", "NES", "size", "leadingEdgeGeneSymbol")] %>%
datatable(escape = F)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment