gene_feature_enrichment.R 9.28 KB
Newer Older
1 2 3 4 5 6 7
#!/usr/bin/env Rscript
#+ echo=FALSE, error=F


suppressMessages(require(docopt))

## todo use textual input here for ease of use
8
doc = '
9
Perform a gene feature enrichment analysis for a set of genes
10 11 12 13 14 15 16 17
Usage: cp_enrichment.R [options] <gene_lists_tsv> <group_col>

Options:
--overlay_expr_data Tsv with overlay data for the kegg pathways
--project <project_prefix>   Name to prefix all generated result files [default: ]
--qcutoff <qcutoff>             Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01]
'

18 19
opts = docopt(doc, commandArgs(TRUE)) ## does not work when spining
# opts = docopt(doc, "--overlay_expr_data ../plot_score_matrix.txt ../degs_by_contrast.txt contrast" )
20

21
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.40/R/core_commons.R")
22

23 24 25
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.40/R/ggplot_commons.R")
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.40/R/bio/diffex_commons.R")
# devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.40/R/bio/cp_utils.R")
26 27
# devtools::source_url("https://www.dropbox.com/s/p2b8luxf7jteb63/cp_utils.R?dl=1")

28 29 30 31 32

load_pack(knitr)
load_pack(DT)


33 34 35 36
resultsBaseName = if (str_length(opts$project) > 0) paste0(opts$project, ".feat_enr") else "feat_enr."

q_cutoff = as.numeric(opts$qcutoff)

37 38

## load the data
39
geneLists = read_tsv(opts$gene_lists_tsv)
40 41 42 43 44 45 46 47 48
group_col = opts$group_col
geneLists %<>% group_by_(.dots = group_col)


########################################################################################################################
#' ## Enrichment Analysis

#' Run configuration was
vec_as_df(unlist(opts)) %>%
49
    filter(! str_detect(name, "^[=]")) %>%
50 51 52 53
    kable()

#' This analysis was performed using [clusterProfiler](http://bioconductor.org/packages/release/bioc/html/clusterProfiler.html). The following ontologies were tested: Kegg, Go, Reactome, Dose,

54
listLabels = geneLists %>%
55 56
    select(- ensembl_gene_id) %>%
    distinct
57

58 59 60 61 62 63 64 65 66 67 68 69 70
listLabels %<>% transform(list_label = do.call(paste, c(listLabels, sep = "__")))

geneLists %>%
    inner_join(listLabels) %>%
    ggplot(aes(list_label)) +
    geom_bar() +
    coord_flip() +
    ggtitle("gene list sizes to be tested for term enrichment") +
    ylab("")




71 72
########################################################################################################################
#' ## Prepare gene feature data
73

74 75 76

## todo guess from data
ens_build = "mar2017"
77 78


79 80 81 82 83 84 85
# geneInfo = quote({
#     mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = guess_mart(counts$ensembl_gene_id), host = paste0(ens_build, ".archive.ensembl.org"), path = "/biomart/martservice", archive = FALSE)
#     c("ensembl_gene_id", "external_gene_name", "start_position", "end_position") %>%
#         biomaRt::getBM(mart = mart) %>%
#         mutate(gene_length=start_position -  end_position) %>%
#         tbl_df
# }) %>% cache_it(paste0("geneInfo_", ens_build))
86 87


88 89
exSummary = quote({
    mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = guess_mart(geneLists$ensembl_gene_id), host = paste0(ens_build, ".archive.ensembl.org"), path = "/biomart/martservice")
90

91
    # library(biomaRt)
92

93 94
    exons = biomaRt::getBM(attributes = c('ensembl_gene_id', 'ensembl_transcript_id', 'ensembl_exon_id', 'rank'), mart = mart)
    txInfo = biomaRt::getBM(attributes = c('ensembl_gene_id', 'ensembl_transcript_id', 'transcript_start', 'transcript_end'), mart = mart)
95 96


97 98 99 100 101
    longestTx = txInfo %>%
        group_by(ensembl_gene_id) %>%
        mutate(tx_length = transcript_end - transcript_start) %>%
        filter(max(tx_length) == tx_length) %>%
        slice(1)
102

103 104 105 106 107 108 109 110
    exons %>%
        group_by(ensembl_gene_id, ensembl_transcript_id) %>%
        summarize(num_exons = max(rank)) %>%
        ungroup %>%
    ## filter for longest transcript
        semi_join(longestTx) %>%
        dplyr::select(- ensembl_transcript_id)
}) %>% cache_it(paste0("exInfo", ens_build))
111 112


113 114 115 116 117 118
# exSummary %>% count(num_exons) %>% n_as("genes_count")
exSummary %>% filter(num_exons < 30 ) %>% ggplot(aes(num_exons)) + geom_bar()
numExTERM2GENE = exSummary %>% filter(num_exons< 15) %>% transmute(term = paste(num_exons, "exons"), gene = ensembl_gene_id)
numExTERM2NAME = numExTERM2GENE %>%
    distinct(term) %>%
    mutate(name = term)
119 120


121 122 123 124 125 126 127 128 129 130 131
install_package("clusterProfiler")
# require(clusterProfiler)


## todo part of core_commons v1.41
first_group = function(groupedDF){
    # groupedDF = geneLists
    # groupedDF = iris %>% group_by(Species)
    # https://stackoverflow.com/questions/33775239/emulate-split-with-dplyr-group-by-return-a-list-of-data-frames
    (groupedDF %>% do(data = (.)) %$% map(data, identity))[[1]]
}
132

133 134
feat_enr_test = function(geneIds){
    # groupDf = first_group(geneLists);  geneIds = groupDf$ensembl_gene_id
135

136
    exonCountResults = clusterProfiler::enricher(gene = geneIds, qvalueCutoff = q_cutoff, TERM2GENE = numExTERM2GENE, TERM2NAME = numExTERM2NAME) %>% as.data.frame()
137

138 139 140 141 142 143
    #    reactomeResults = ReactomePA::gsePathway(gene = geneIds, organism = cpSpecies) %>% summary()
    # goResultsBP = tryCatch({ clusterProfiler::gseGO(geneList = geneIds, OrgDb = annoDb, ont = "BP") %>% summary()}, error = function(err)noResults)

    enrResults = bind_rows(
    mutate(exonCountResults, ontology = "exon_counts")
    # mutate(goResultsCC, ontology = "go_cc")
144
    )
145

146 147 148
    enrResults
}

149
enrResults = geneLists %>% do(feat_enr_test(.$ensembl_gene_id))
150

151 152 153 154 155 156
## purrr but less concise
# feat_enr_test(first_group(geneLists)$ensembl_gene_id)
# enrResults = geneLists %>%
#     nest %>%
#     mutate(enr_results = map(data, ~ feat_enr_test(.x$ensembl_gene_id))) %>%
#     unnest(enr_results)
157 158 159 160 161 162


## remove to clumsy gene_id columns
enrResults %<>% select(- geneID)

write_tsv(enrResults, path = paste0(resultsBaseName, "enrResults.txt"))
163
# enrResults = read.delim(paste0(resultsBaseName, "enrResults.txt"))
164 165 166 167 168 169 170
#'  [Enrichment Results](`r paste0(resultsBaseName, "enrResults.txt")`)

load_pack(DT)
datatable(enrResults)

#enrResults %>% ggplot(aes(pvalue)) + geom_histogram() + scale_x_log10()
#enrResults %>% ggplot(aes(ontology)) + facet_wrap(~cluster) + geom_bar() + rot_x_lab()
171 172
#facetSpecs = paste("~", groups(geneLists) %>% ac %>% paste(collapse=" + "))
facetSpecs = paste("~", group_col %>% ac %>% paste(collapse = " + "))
173 174 175 176 177 178 179 180 181 182 183 184

#' Visualize term-pvalues per list

#http://stackoverflow.com/questions/11028353/passing-string-variable-facet-wrap-in-ggplot-using-r
enrResults %>% ggplot(aes(ontology)) +
    facet_wrap(as.formula(facetSpecs), ncol = 3) +
    geom_bar() +
    rot_x_lab() +
    ggtitle("enriched term counts by cluster")

enrResults %<>% mutate(num_term_genes = str_split_fixed(BgRatio, fixed("/"), 2)[, 1] %>% as.numeric)

185 186
#' Keep at max 20 terms for visualzation per group
erPlotData = enrResults %>%
187 188 189 190 191 192
    group_by_(.dots = c(group_col)) %>%
    arrange(qvalue) %>%
    slice(1 : 20) %>%
## regroup because otherwise dplyr complains about corrupt df (which looks like a bug)
    group_by_(.dots = c(group_col))

193

194 195 196 197 198 199 200
#+ error=TRUE, echo=FALSE
warning("dropping levels")
erPlotData %<>% mutate(ontology = ac(ontology)) ## drop unsused level to get consistent color palette

erPlotData %<>% rename(Term = Description)


201
term_category_colors = create_palette(unique(ac(erPlotData$ontology)))
202

203
figDir = "enr_charts"
204 205 206 207 208 209 210 211 212
dir.create(figDir)

## chop and pad category names
erPlotData %<>% mutate(fixed_width_term = str_sub(Term, 1, 70) %>% str_pad(70))

## evaluate gene ratios strings into actual proportions
erPlotData %<>% mutate(gene_ratio = map_dbl(GeneRatio, ~ eval(parse(text = .x))))

term_barplot_files = erPlotData %>% do({
213 214
    # DEBUG enrResultsGrp = erPlotData %>% head(30)
    enrResultsGrp = .
215 216 217 218 219 220

    label = subset(enrResultsGrp, select = group_col)[1, 1] %>%
        as.matrix %>%
        ac

    ## new version using dotplot https://bioconductor.org/packages/devel/bioc/vignettes/clusterProfiler/inst/doc/clusterProfiler.html#dotplot
221 222
    enrPlot = enrResultsGrp %>%
        ungroup %>%
223 224
    ## fix factor order
        mutate(fixed_width_term = reorder(fixed_width_term, gene_ratio)) %>%
225
        ggplot(aes(fixed_width_term, gene_ratio, size = Count, fill = p.adjust, color = ontology)) +
226 227 228 229 230 231 232 233
    # ggplot(aes(fixed_width_term, Count)) +
        geom_point(pch = 21) +
        scale_color_manual(values = term_category_colors, drop = F, name = "Ontology") +
        scale_fill_gradient(low = "red", high = "white", name = "q-value") +
        coord_flip() +
    # xlab("Enriched Terms") +
        ggtitle(label)
    # scale_y_log10()
234 235

    ## todo  use builtin method to create filesystem-compatible name
236
    fileNameLabel = label %>%
237 238 239 240 241 242 243 244 245 246 247
        str_replace_all("!=", "ne") %>%
        str_replace_all(">", "gt") %>%
        str_replace_all("<", "lt") %>%
        str_replace_all(fixed("&"), "AND") %>%
        str_replace_all(fixed("|"), "OR") %>%
        str_replace_all(" ", "_")

    #        ggsave(paste0("enrichmed_terms__", fileNameLabel, ".pdf"))
    #        print(enrPlot)

    stopifnot(str_length(fileNameLabel) > 0)
248
    tmpPng = paste0(figDir, "/enrterms__", fileNameLabel, ".png")
249 250 251 252 253 254 255 256 257 258 259 260 261 262 263
    ggsave(tmpPng, enrPlot, width = 10, height = 2 + round(nrow(enrResultsGrp) / 5), limitsize = FALSE)
    data.frame(file = tmpPng)
})

#+ results="asis"
walk(term_barplot_files$file, function(pngFile){ cat(paste0("<img src='", pngFile, "'><br>"))})

# install.packages("session")
session::save.session(".cp_enrichment.dat");
# session::restore.session(".cp_enrichment.dat");

#' System info:
#+

devtools::session_info()