From 2d1b1fe3823b3c4bfa5e3f6586b3de95e734a954 Mon Sep 17 00:00:00 2001 From: Holger Brandl <holgerbrandl@gmail.com> Date: Wed, 6 Jul 2016 16:46:14 +0200 Subject: [PATCH] started gsea enrichment tool --- common/gsea_enrichment.R | 359 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 359 insertions(+) create mode 100755 common/gsea_enrichment.R diff --git a/common/gsea_enrichment.R b/common/gsea_enrichment.R new file mode 100755 index 0000000..425e811 --- /dev/null +++ b/common/gsea_enrichment.R @@ -0,0 +1,359 @@ +#!/usr/bin/env Rscript +#+ echo=FALSE, error=F + + +# cd + +suppressMessages(require(docopt)) + +## todo use textual input here for ease of use +doc <- ' +Perform an enrichment analysis for a set of genes +Usage:gsea_enrichment.R[options] <sorted_gene_lists_tsv> <group_col> + +Options: +--list_id_col Column containing the grouping variable to speparate different lists [default: ] +--project <project_prefix> Name to prefix all generated result files [default: ] +--qcutoff <qcutoff> Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01] + +' + +opts <- docopt(doc, commandArgs(TRUE)) ## does not work when spining +# opts <- docopt(doc, "--overlay_expr_data ../plot_score_matrix.txt ../degs_by_contrast.txt contrast" ) + +devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.27/R/core_commons.R") +devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.27/R/ggplot_commons.R") +devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.27/R/bio/diffex_commons.R") + +loadpack(knitr) +loadpack(DT) + +#loadpack(clusterProfiler) + +#devtools::session_info() # nice! + +## to fix child support issue with knitr, see also +## http://stackoverflow.com/questions/20030523/knitr-nested-child-documents +## https://github.com/yihui/knitr/issues/38 +# todo disabled because root.dir in parent document seems the only working solution +#if(exists('project_dir')) setwd(project_dir) +#print(getwd()) + +## load the data +geneLists <- read_tsv(opts$sorted_gene_lists_tsv) +group_col=opts$group_col +geneLists %<>% group_by_(.dots=group_col) + +#geneLists %<>% filter(cluster %in% c("cluster_1", "cluster_2")) + +## TODO expose options to run gesa instead (by assuming that gene lists are sorted) +## see http://www.bioconductor.org/packages/release/bioc/vignettes/clusterProfiler/inst/doc/clusterProfiler.pdf for details + +resultsBaseName <- if(str_length(opts$project)>0) paste0(opts$project, ".") else basename(opts$sorted_gene_lists_tsv) %>% trim_ext("txt") #%>% paste0(".") +#resultsBaseName=basename(opts$gene_lists_tsv) %>% trim_ext("txt") #%>% paste0(".") + +qCutoff <- as.numeric(opts$qcutoff) + + +#reload_dplyr() + +######################################################################################################################## +#' ## Enrichment Analysis + +#' Run configuration was +vec2df(unlist(opts)) %>% filter(!str_detect(name, "^[<-]")) %>% kable() + +#' This analysis was performed using [clusterProfiler](http://bioconductor.org/packages/release/bioc/html/clusterProfiler.html). The following ontologies were tested: Kegg, Go, Reactome, Dose, + +listLabels <- geneLists %>% select(-ensembl_gene_id) %>% distinct +listLabels %<>% transform(list_label=do.call(paste, c(listLabels, sep="__"))) + +geneLists %>% inner_join(listLabels) %>% + ggplot(aes(list_label)) + + geom_bar() + + coord_flip() + + ggtitle("gene list sizes to be tested for term enrichment") + + ylab("") + + +## todo move to diffex commons +guess_cp_species <- function(ensIds){ + an_id <-ensIds[1] + + if(str_detect(an_id, "ENSG")){ + return("human") + }else if(str_detect(an_id, "ENSMUSG")){ + return("mouse") + }else if(str_detect(an_id, "ENSDARG")){ + return("zebrafish") + }else if(str_detect(an_id, "FBgn")){ + return("fly") + }else{ + stop(paste("could not clusterProfiler species name from ", an_id)) + } +} + +guess_anno_db <- function(ensIds){ + an_id <-ensIds[1] + + if(str_detect(an_id, "ENSG")){ + return("org.Hs.eg.db") + }else if(str_detect(an_id, "ENSMUSG")){ + return("org.Mm.eg.db") + }else if(str_detect(an_id, "ENSDARG")){ + return("org.Dr.eg.db") + }else if(str_detect(an_id, "FBgn")){ + return("org.Dm.eg.db") + }else{ + stop(paste("could not anno db mart from ", an_id)) + } +} + +#source("http://bioconductor.org/biocLite.R") +#biocLite("org.Mm.eg.db") +#biocLite("org.Hs.eg.db") +#biocLite("org.Dr.eg.db") +#biocLite("org.Dm.eg.db") +#biocLite("KEGG.db") + + +#data(gcSample) +#yy = enrichKEGG(gcSample[[5]], pvalueCutoff=0.01) +#head(summary(yy)) +#plot(yy) +## seems broken + +cpSpecies <- guess_cp_species(geneLists$ensembl_gene_id) +annoDb <- guess_anno_db(geneLists$ensembl_gene_id) # e.g. "org.Hs.eg.db" + +## supported ids +#idType("org.Hs.eg.db") + +## convert to entrez gene ids + +glMappedRaw <- clusterProfiler::bitr(geneLists$ensembl_gene_id, fromType="ENSEMBL", toType="ENTREZID", OrgDb=annoDb) %>% + set_names("ensembl_gene_id", "entrez_gene_id") %>% right_join(geneLists) + + + +#' Check how many failed to map +count(glMappedRaw, is.na(entrez_gene_id)) + +unloadNamespace('clusterProfiler') +#loadpack(clusterProfiler) + +reload_dplyr() + +glMapped <- glMappedRaw %>% filter(!is.na(entrez_gene_id)) %>% select(-ensembl_gene_id) %>% + ## regroup the data + # group_by_(cluster) + group_by_(.dots=group_col %>% ac) + + +## retain just 1500 genes at max per group +#glMappedSub <- glMapped %>% do({shuffle(.) %>% head(1500)}) +#count(glMappedSub, cluster) + +## sync reactome pacakge to node +#if(Sys.getenv("LSF_SERVERDIR")!="" && dir.exists("/tmp/local_r_packages")){ +# system("if [ ! -d '/tmp/local_r_packages/reactome.db/' ]; then scp -r falcon:/tmp/local_r_packages /tmp ; fi") +#} + +loadpack(ReactomePA) + +cp_test <- function(geneIds){ + # DEBUG geneIds <- glMapped %>% filter(cluster %in% c("cluster_9")) %$% entrez_gene_id %>% as.integer + # DEBUG geneIds <- head(glMapped,30)$entrez_gene_id +# geneIds=.$entrez_gene_id + + if(length(geneIds)>1500){ + geneIds <- sample(geneIds) %>% head(1500) + } + + echo("testing", length(geneIds), " genes for enrichment") + +# PANTHER10_ontology <- read.delim("http://data.pantherdb.org/PANTHER10.0/ontology/Protein_Class_7.0") + +# browser() +# pantherResults <- enricher(gene = geneIds, organism = cpSpecies, qvalueCutoff = qCutoff, readable = TRUE, TERM2GENE = PANTHER10_ontology) %>% summary() + keggResults <- clusterProfiler::enrichKEGG(gene = geneIds, organism = cpSpecies, qvalueCutoff = qCutoff, use_internal_data=T) %>% summary() + reactomeResults <- enrichPathway(gene = geneIds, organism = cpSpecies, qvalueCutoff = qCutoff) %>% summary() + goResultsCC <- clusterProfiler::enrichGO(gene = geneIds, OrgDb = annoDb, qvalueCutoff = qCutoff, ont = "CC") %>% summary() + goResultsMF <- clusterProfiler::enrichGO(gene = geneIds, OrgDb = annoDb, qvalueCutoff = qCutoff, ont = "MF") %>% summary() + goResultsBP <- clusterProfiler::enrichGO(gene = geneIds, OrgDb = annoDb, qvalueCutoff = qCutoff, ont = "BP") %>% summary() + + #cp-bug: if no pathways are enriched odd strucuture is retured ##todo file issue + if(!("data.frame" %in% class(keggResults))) keggResults <- filter(goResultsBP, Description="foobar") + + enrResults <- rbind_list( + mutate(keggResults, ontology="kegg"), + mutate(reactomeResults, ontology="reactome"), + mutate(goResultsBP, ontology="go_bp"), + mutate(goResultsMF, ontology="go_mf"), + mutate(goResultsCC, ontology="go_cc") + ) + enrResults +# echo("numResults", nrow(enrResults)) +} + +## prety slow +enrResults <- quote(glMapped %>% do(cp_test(.$entrez_gene_id))) %>% cache_it(paste0("enrdata_", digest(glMapped))) + +## run the actual enrichment test for all clusters and all ontologies +#library(foreach); library(doMC); registerDoMC(cores=20) +## https://support.bioconductor.org/p/38541/ +## library(GOstats) inside your %dopar% loop. And then start with a fresh session, so GOstats is not loaded outside the loop. It worked for me. +#enrResults <- dlply(glMapped, groups(glMapped) %>% ac, function(curGroup){ +# cp_test(curGroup$entrez_gene_id) +#}, .progress="text", .parallel=F) ##%>% cache_it(paste0("enrdata_", digest(glMapped))) +#rbind_all(enrResults) + +#reload_dplyr() + +#enrResults %<>% rename(Category=Description) +#enrResults %<>% rename(ontology=Category) + +## remove to clumsy gene_id columns +enrResults %<>% select(-geneID) + +write_tsv(enrResults, path=paste0(resultsBaseName, "enrResults.txt")) +# enrResults <- read.delim(paste0(resultsBaseName, "enrResults.txt")) +#' [Enrichment Results](`r paste0(resultsBaseName, "enrResults.txt")`) + +loadpack(DT) +datatable(enrResults) + +#enrResults %>% ggplot(aes(pvalue)) + geom_histogram() + scale_x_log10() +#enrResults %>% ggplot(aes(ontology)) + facet_wrap(~cluster) + geom_bar() + rot_x_lab() +#facetSpecs <- paste("~", groups(geneLists) %>% ac %>% paste(collapse=" + ")) +facetSpecs <- paste("~", group_col %>% ac %>% paste(collapse=" + ")) + +#' Visualize term-pvalues per list + +#http://stackoverflow.com/questions/11028353/passing-string-variable-facet-wrap-in-ggplot-using-r +enrResults %>% ggplot(aes(ontology)) + facet_wrap(as.formula(facetSpecs), ncol=3) + geom_bar() + rot_x_lab() + ggtitle("enriched term counts by cluster") + +enrResults %<>% mutate(num_term_genes=str_split_fixed(BgRatio, fixed("/"),2)[,1] %>% as.numeric) + +#' Keep at max 100 terms for visualzation per group +#if(table(enrResults$cluster)) +## restablish the grouping and limit results per group to 100 +#enrResults %<>% group_by_(.dots=groups(geneLists) %>% ac) +#enrResults %<>% group_by_(.dots=group_col) +#enrResults %<>% sample_frac(1) %>% filter((row_number()<100)) %>% group_by_(.dots=group_col) + +#tt <- enrResults %>% group_by_(.dots=c(group_col,"ontology")) %>% arrange(-qvalue) %>% dplyr::slice(1:10) +#enrResults %>% filter(cluster=="cluster_1") +#enrResults %>% group_by_(.dots=c(group_col)) %>% ggplot(aes(pvalue, qvalue, color=ontology))+ geom_point() +#enrResults %>% group_by_(.dots=c(group_col)) %>% ggplot(aes(pvalue, p.adjust, color=cluster))+ geom_point() +erPlotData <- enrResults %>% group_by_(.dots=c(group_col)) %>% arrange(qvalue) %>% slice(1:15) %>% + ## regroup because otherwise dplyr complains about corrupt df (which looks like a bug) + group_by_(.dots=c(group_col)) + +#count(erPlotData, cluster) + + + +#+ include=FALSE, eval=FALSE +## plot the enrichment results +#sigEnrResults %>% group_by(Category, add=T) %>% do({ +# logPlot <- . %>% ggplot(aes(Term, PValue)) + +# geom_bar(stat="identity")+coord_flip() + +# xlab("Enriched Terms") + +# ggtitle(.$Category[1]) + +# scale_y_log10() +# print(logPlot) +#}) + +#sigEnrResults %>% +## select(-Genes) %>% +# do({ +# enrResultsGrp <- . +# ## DEBUG enrResultsGrp <- sigEnrResults +## geneLists %>% first(1) %>% select(-ensembl_gene_id) %>% paste0(., collapse="_") +##browser() +# label <- geneLists %>% semi_join(enrResultsGrp) %>% first(1) %>% dplyr::select(-ensembl_gene_id) %>% paste0(., collapse="_") +# +# echo("processing", label) +# +# logPlot <- enrResultsGrp %>% +# ## fix factor order +# mutate(Term=reorder(Term, -PValue) %>% reorder(as.integer(Category))) %>% +# ggplot(aes(Term, PValue, fill=Category)) + +# geom_bar(stat="identity")+ +# coord_flip() + +# xlab("Enriched Terms") + +# ggtitle(label) + +# scale_y_log10() +# +# ggsave(paste0(resultsBaseName, label, ".enrichmed_terms.pdf")) +# print(logPlot) +#}) +##ggsave2() + + +#+ error=TRUE, echo=FALSE +warning("dropping levels") +erPlotData %<>% mutate(ontology=ac(ontology)) ## drop unsused level to get consistent color palette + +erPlotData %<>% rename(Term=Description) +term_category_colors <- create_palette(unique(ac(erPlotData$ontology))) + +figDir <- "enr_charts" +dir.create(figDir) + +term_barplot_files <- erPlotData %>% + ## chop and pad category names + mutate(Term=str_sub(Term, 1, 70) %>% str_pad(70)) %>% +do({ + enrResultsGrp <- . + + ## DEBUG enrResultsGrp <- erPlotData %>% ungroup() %>% filter(contrast==contrast[1]) +# with(enrResultsGrp, as.data.frame(table(contrast))) + +# browser() + ## DEBUG enrResultsGrp <- sigEnrResults +# label <- geneLists %>% semi_join(enrResultsGrp) %>% first(1) %>% dplyr::select(-ensembl_gene_id) %>% paste0(., collapse="_") + label <- subset(enrResultsGrp, select=group_col)[1,1] %>% as.matrix %>% ac + + +#stopifnot(all(!is.na(enrResultsGrp$Term))) +#stopifnot(all(!is.na(enrResultsGrp$ontology))) +#stopifnot(all(!is.na(enrResultsGrp$num_term_genes))) + + #browser() +# warning(paste0("processing terms", paste(ac(unique(enrResultsGrp$ontology)), collapse=","))) + logPlot <- enrResultsGrp %>% + ## fix factor order +# mutate(Term=reorder(Term, -qvalue) %>% reorder(as.integer(as.factor(ontology)))) %>% + mutate(Term=reorder(Term, -qvalue)) %>% + ggplot(aes(Term, num_term_genes, fill=-log10(qvalue), color=ontology)) + + geom_bar(stat="identity") + + scale_color_manual(values = term_category_colors, drop=F, name="Ontology") + + coord_flip() + + xlab("Enriched Terms") + + ggtitle(label) + + scale_y_log10() + + # print(logPlot) + + ## todo use builtin method to create filesystem-compatible name + fileNameLabel <- label %>% + str_replace_all("!=", "ne") %>% + str_replace_all(">", "gt") %>% + str_replace_all("<", "lt") %>% + str_replace_all(fixed("&"), "AND") %>% + str_replace_all(fixed("|"), "OR") %>% + str_replace_all(" ", "_") + +# ggsave(paste0("enrichmed_terms__", fileNameLabel, ".pdf")) +# print(logPlot) + + stopifnot(str_length(fileNameLabel)>0) + tmpPng <- paste0(figDir, "/enrterms__", fileNameLabel, ".png") + ggsave(tmpPng, logPlot, width=10, height = 2+round(nrow(enrResultsGrp)/5), limitsize=FALSE) + data.frame(file=tmpPng) +}) + +#+ results="asis" +l_ply(term_barplot_files$file, function(pngFile){ cat(paste0("<img src='", pngFile, "'><br>"))}) -- GitLab