Skip to content
Snippets Groups Projects
Commit e01703db authored by Melanie Schneider's avatar Melanie Schneider
Browse files

guess species and mart implemented and improved panther_enrichment

parent 506874b5
No related branches found
No related tags found
No related merge requests found
......@@ -7,39 +7,70 @@ devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v
require(reshape)
library(splitstackshape)
guess_species <- function(gene_id){
an_id <-gene_id[1]
if(str_detect(an_id, "ENSMUSG")){
return("mouse")
}else if(str_detect(an_id, "ENSDARG")){
return("zebrafish")
}else if(str_detect(an_id, "ENSG")){
return("human")
}else if(str_detect(an_id, "FBgn")){
return("fruit_fly")
}else{
stop(paste("could not guess species from ", an_id))
}
}
---------------------------------------------------------------
## load the data
#geneLists <- local(get(load("smb://fileserver.mpi-cbg.de/project-ste/rnaseq_isnm1/simone_reanalysis/results/human/diffex_genes.txt")))
geneLists <- read.delim("/home/mel/MPI-CBG_work/P6_enrichment/diffex_genes.txt") %>% select(ensembl_gene_id)
#geneLists <- select(geneLists, ensembl_gene_id)
## PANTHER10 terms + protein classes
PANTHER10_terms_more <- read.delim("http://data.pantherdb.org/PANTHER10.0/ontology/Protein_Class_7.0") #%>% select(PC00000, protein.class)
PANTHER10_terms <- select(PANTHER10_terms_more, PC00000, protein.class)
## PANTHER10 terms mapped to different genomes
## mouse
#PANTHER10_terms_to_genes_mouse <- read.delim("ftp://ftp.pantherdb.org/sequence_classifications/10.0/PANTHER_Sequence_Classification_files/PTHR10.0_mouse", header = F)
## human
PANTHER10_terms_to_genes_human <- read.delim("ftp://ftp.pantherdb.org/sequence_classifications/10.0/PANTHER_Sequence_Classification_files/PTHR10.0_human", header = F)
## zebrafish
#PANTHER10_terms_to_genes_zebrafish <- read.delim("ftp://ftp.pantherdb.org/sequence_classifications/10.0/PANTHER_Sequence_Classification_files/PTHR10.0_zebrafish", header = F)
## Other species can be found here: ftp://ftp.pantherdb.org/sequence_classifications/10.0/PANTHER_Sequence_Classification_files/
## guess mart, guess species
library('biomaRt')
mart <- biomaRt::useDataset(guess_mart(geneLists$ensembl_gene_id), mart = biomaRt::useMart("ensembl"))
species <- guess_species(geneLists$ensembl_gene_id)
## PANTHER10 terms mapped to genes
## Files can be found here: ftp://ftp.pantherdb.org/sequence_classifications/10.0/PANTHER_Sequence_Classification_files/
species_url <- sprintf("ftp://ftp.pantherdb.org/sequence_classifications/10.0/PANTHER_Sequence_Classification_files/PTHR10.0_%s", species)
PANTHER10_terms_to_genes <- read.delim(species_url, header = F)
## Protein_class_relationship for constructing trees - NOT USED
#PANTHER10_protein_class_relationship <- read.delim("http://data.pantherdb.org/PANTHER10.0/ontology/Protein_class_relationship")
## Data treatment
## remove not needed columns from data frame, unite columns, delete rows with no term information
PANTHER10_terms_to_genes <- within(PANTHER10_terms_to_genes_human, rm(V2, V4, V5)) %>% unite(united,V6,V7,V8,V9,V10, sep=";")
PANTHER10_terms_to_genes <- within(PANTHER10_terms_to_genes, rm(V2, V4, V5)) %>% unite(united,V6,V7,V8,V9,V10, sep=";")
PANTHER10_terms_to_genes <- PANTHER10_terms_to_genes[-grep("\\;;;;",PANTHER10_terms_to_genes$united),]
myPattern <- "PC[0-9]{5}"
PANTHER10_terms_to_genes$PCnum <- str_extract_all(PANTHER10_terms_to_genes$united, myPattern)
PANTHER10_terms_to_genes$uniprot_sptrembl <- str_extract(PANTHER10_terms_to_genes$V1, "......$")
PANTHER10_terms_to_genes$UniProtKB <- str_extract(PANTHER10_terms_to_genes$V1, "......$")
## mapping UniProtIDs to EnsembleIDs- doesn't find all IDs
library('biomaRt')
ensembl <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
ensembl <- useMart('ensembl', dataset="hsapiens_gene_ensembl")
annotation <- getBM(attributes=c("uniprot_sptrembl", "ensembl_gene_id"), filters="uniprot_sptrembl", values=as.vector(PANTHER10_terms_to_genes$uniprot_sptrembl), mart=ensembl)
PANTHER10_terms_to_genes <- left_join(PANTHER10_terms_to_genes, annotation, by = "uniprot_sptrembl")
## mapping EnsembleIDs to UniProtIDs and unnest
annotation <- getBM(attributes=c("uniprot_sptrembl", "ensembl_gene_id"), mart=mart) #, filters="uniprot_sptrembl", values=as.vector(PANTHER10_terms_to_genes$UniProtKB))
PANTHER10_terms_to_genes <- left_join(PANTHER10_terms_to_genes, annotation, by =c("UniProtKB"="uniprot_sptrembl"))
annotation2 <- getBM(attributes=c("uniprot_swissprot", "ensembl_gene_id"), mart=mart) #, filters="uniprot_swissprot", values=as.vector(PANTHER10_terms_to_genes$UniProtKB))
PANTHER10_terms_to_genes <- left_join(PANTHER10_terms_to_genes, annotation2, by = c("UniProtKB"="uniprot_swissprot")) %>%
unnest() %>% select(PCnum, ensembl_gene_id)
## Try for geneLists - doesn't find all IDs either
annotation_gene_list <- getBM(attributes=c("uniprot_sptrembl", "ensembl_gene_id"), filters="ensembl_gene_id", values=as.vector(geneLists$ensembl_gene_id), mart=ensembl)
## checking an ID
#PANTHER10_terms_to_genes %>% filter(uniprot_sptrembl=='P62805')
#annotation %>% filter(uniprot_sptrembl=='P62805')
##Test-grep
#grep("PC[0-9]", PANTHER10_terms_to_genes$united, value = TRUE)
......@@ -49,22 +80,6 @@ annotation_gene_list <- getBM(attributes=c("uniprot_sptrembl", "ensembl_gene_id"
#PANTHER10_terms_to_genes <- cSplit(PANTHER10_terms_to_genes, splitCols = "united", sep = ";", drop = TRUE)
## Protein_class_relationship for constructing trees
PANTHER10_protein_class_relationship <- read.delim("http://data.pantherdb.org/PANTHER10.0/ontology/Protein_class_relationship")
#geneLists <- local(get(load("smb://fileserver.mpi-cbg.de/project-ste/rnaseq_isnm1/simone_reanalysis/results/human/diffex_genes.txt")))
geneLists <- read.delim("/home/mel/MPI-CBG_work/P6_enrichment/diffex_genes.txt") %>% select(ensembl_gene_id)
#geneLists %<>% filter(cluster %in% c("cluster_1", "cluster_2"))
if(!is.null(opts$overlay_expr_data)){
overlayData <- read.delim(opts$overlay_expr_data)
}
resultsBaseName=basename(opts$grouped_gene_lists_rdata) %>% trim_ext(".RData") %>% paste0(".")
########################################################################################################################
#' ## Enrichment Analysis
......@@ -80,6 +95,11 @@ geneLists %>% inner_join(listLabels) %>%
ggtitle("gene list sizes to be tested for term enrichment") +
ylab("")
pantherResults <- enricher(gene = geneIds, organism = cpSpecies, pvalueCutoff = pCutoff, readable = TRUE, TERM2GENE = PANTHER10_ontology) %>% summary()
pantherResults <- enricher(gene = geneLists$ensembl_gene_id,
organism = species,
pvalueCutoff = pCutoff,
readable = TRUE,
TERM2GENE = PANTHER10_terms_to_genes,
TERM2NAME = PANTHER10_terms) %>% summary()
enrResults <- rbind_list(mutate(pantherResults, ontology="panther"))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment