Skip to content
Snippets Groups Projects
Commit 8e25fe32 authored by Holger Brandl's avatar Holger Brandl
Browse files

started generic dge report

parent 9c14c590
No related branches found
No related tags found
No related merge requests found
#+ include=FALSE
## cd ~/mnt/meninges/meninges/plus_minus
## source <(curl https://dl.dropboxusercontent.com/u/113630701/datautils/R/utils/spinr.sh 2>&1 2>/dev/null)
## spinr ~/mnt/meninges/from_holger/scripts/plusminus/DiffexPlusMinus.R
## cd ~/mnt/meninges/meninges/plus_minus_only
########################################################################################################################
#' # Mouse aRG +/- Differential gene Expression Analysis
#' Alignment done already in May 2014: `/Volumes/meninges/from_holger/mouse/mouse_aRGpm`
#' 2 Cuffdiff configratutions possilbe (just plus and minus or complete mouse data from paper)
########################################################################################################################
#+ setup2, cache=FALSE, message=FALSE, echo=FALSE
#source("http://bioconductor.org/biocLite.R")
#biocLite("RDAVIDWebService")
require(biomaRt)
require(RDAVIDWebService)
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R")
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R")
require(cummeRbund)
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/bio/cummerutils.R")
options(width=300) ## always overflow boxes to keep tables in shape with scrollbars
options(java.parameters = "-Xmx4g" ) # required by read.xlsx
require.auto(xlsx)
#dbDir="/home/brandl/mnt/meninges/from_holger/mouse/mouse_aRGpm/cuffdiff"
dbDir=dirname(getwd())
#setwd("~/mnt/meninges/meninges/plus_minus_only")
#dbDir="/home/brandl/mnt/meninges/from_holger/mouse/mouse_aRGpm/cuffdiff_pm"
#mcdir(file.path(dirname(dbDir), "dge_analysis"))
#mcdir(file.path(dbDir, "dge_analysis"))
#knitr::opts_knit$set(root.dir = getwd())
#######################################################################################################################
#' # Cuffdiff Report
#+ load_db, cache=FALSE
cuff <- readCufflinks(dir=dbDir)
cuff
runInfo(cuff)
replicates(cuff)
#######################################################################################################################
#' ## Quality Control
# create some global summary statistic plots
dispersionPlot(genes(cuff)) #+ aes(alpha=0.01)
csDensity(genes(cuff))
#ggsave2(csBoxplot(genes(cuff))+ ggtitle("fpkm boxplots"))
csDensity(genes(cuff),features=T)
csBoxplot(genes(cuff),replicates=T)
fpkmSCVPlot(genes(cuff))
#csScatter(genes(cuff), "a", "d", smooth=F) + aes(color=abs(log2(a/b))>2) + scale_colour_manual(values=c("darkgreen", "red"))
#MAplot(genes(cuff), "a", "d")+ aes(color=abs(log2(M))>2)+ ggtitle("MA-plot: a/d")
#MAplot(genes(cuff), "a", "b", useCount=T) #+ aes(color=abs(log2(M))>2)+ ggtitle("MA-plot: Luc/KD")
#csVolcano(genes(cuff),"aRGm", "N") + ggtitle("odd line-shaped fan-out in volcano plot")
#csVolcano(genes(cuff),"a", "b")
csVolcanoMatrix(genes(cuff))
#csVolcano(genes(cuff),"a", "b")
require.auto(edgeR)
plotMDS(repFpkmMatrix(genes(cuff)), top=500, main="top500 MDS")
MDSplot(genes(cuff)) + ggtitle("mds plot")
MDSplot(genes(cuff), replicates=T) + ggtitle("mds plot")
PCAplot(genes(cuff),"PC1","PC2",replicates=T)
noTextLayer <- function(gg){ gg$layers[[2]] <- NULL;; gg}
noTextLayer(csDistHeat(genes(cuff)) + ggtitle("mouse zone correlation"))
noTextLayer(csDistHeat(genes(cuff),replicates=T) + ggtitle("mouse replicate correlation")) #+ scale_fill_gradientn(colours=c(low='lightyellow',mid='orange',high='darkred'), limits=c(0,0.15))
csDendro(genes(cuff),replicates=T)
#pdf(paste0(basename(getwd()),"_rep_clustering.pdf"))
#csDendro(genes(cuff),replicates=T)
#dev.off()
#######################################################################################################################
#' ## Extract and save tables
#xls(runInfo(cuff))
#setwd("/Volumes/project-zeigerer/Rab5_NGS/results/")
geneFpkm<-fpkm(genes(cuff))
write.delim(geneFpkm, file="geneFpkm.txt")
# gene.fpkm <- read.delim("gene.fpkm.txt")
geneCounts<-count(genes(cuff))
write.delim(geneCounts, file="geneCounts.txt")
geneCgene.matrix<-fpkmMatrix(genes(cuff))
repGeneFPKMs <- repFpkmMatrix(genes(cuff))
save(repGeneFPKMs, file="repGeneFPKMs.RData")
write.delim(repGeneFPKMs, file="repGeneFPKMs.txt")
# repGeneFPKMs <- read.delim("repGeneFPKMs.txt")
# repGeneFPKMs <- local(get(load("repGeneFPKMs.RData")))
#######################################################################################################################
#' ## Extract lists of differentially expressed genes
coi <- c("aRGm", "aRGp")
allDiff <- diffData(genes(cuff)) %>%
filter(sample_1 %in% coi, sample_2 %in% coi) %>%
## discard all genes that are not expressed in any of the cell types (>1)
filter(gene_id %in% getExpressedGenes(cuff))
#degs <- subset(allDiff, q_value<0.01)
allDiff <- transform(allDiff, isHit=p_value<=0.01)
degs <- subset(allDiff, isHit)
#degs <- subset(allDiff, significant=="yes")
degs <- transform(degs, sample_1_overex=log2_fold_change<0)
with(degs, as.data.frame(table(sample_1, sample_2, sample_1_overex))) %>% filter(Freq >0)
ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DGE count summary") + coord_flip()
ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=sample_1_overex)) + geom_bar(position="dodge") + xlab(NULL) + ylab("# DGEs") + ggtitle("DGE counts by direction") + coord_flip()
## todo move up into table export section
## add gene description
if(!file.exists("geneInfo.RData")){
mousemart = useDataset("mmusculus_gene_ensembl", mart=useMart("ensembl"))
geneInfo <- getBM(attributes=c('ensembl_gene_id', 'external_gene_name', 'description', 'gene_biotype'), mart=mousemart);
save(geneInfo, file="geneInfo.RData")
}else{
geneInfo <- local(get(load("geneInfo.RData")))
}
gFpkmMatrix <- rownames2column(fpkmMatrix(genes(cuff)), "ensembl_gene_id")
#colnames(gFpkmMatrix) <- ifelse(is.na(sampleDic[colnames(gFpkmMatrix)]), colnames(gFpkmMatrix), sampleDic[colnames(gFpkmMatrix)])
geneFpkmWithInfo <- merge(geneInfo, gFpkmMatrix, by.x="ensembl_gene_id", all.y=T) %>% print_head()
write.delim(geneFpkmWithInfo, file="geneFpkmWithInfo.txt")
# geneFpkmWithInfo <- read.delim("geneFpkmWithInfo.txt")
#' [annotated expresssion table](geneFpkmWithInfo.txt)
degs <- merge(degs, geneInfo, by.x="gene_id", by.="ensembl_gene_id", all.x=T)
subset(degs, !duplicated(paste(sample_1, sample_2, sample_1_overex, external_gene_name))) %>% with(., as.data.frame(table(sample_1, sample_2, sample_1_overex))) %>% filter(Freq>0)
write.delim(degs, file="degs.txt")
# degs <- read.delim("degs.txt")
#' [dge table](degs.txt)
########################################################################################################################
#` ## Term enrichment
#+ echo=FALSE
geneLists <- degs %>% transmute(gene_id, list_id=paste(sample_1, "vs", sample_2, "ovex", sample_1_overex, sep="_"))
## http://www.bioconductor.org/packages/release/bioc/vignettes/RDAVIDWebService/inst/doc/RDavidWS-vignette.pdf
## e.g. getClusterReport --> plot2D
save(degs, file="degs.RData")
# degs <- local(get(load("degs.RData")))
davidAnnotationChart <- function(someGenes){ ## expexted to have a column with gene_id
echo("processing list with", length(someGenes), "genes")
# someGenes <- degs$gene_id
if(length(someGenes)>1500){
someGenes <- sample(someGenes) %>% head(1500)
}
david<-DAVIDWebService$new(email="brandl@mpi-cbg.de")
# getTimeOut(david)
setTimeOut(david, 80000) ## http://www.bioconductor.org/packages/release/bioc/vignettes/RDAVIDWebService/inst/doc/RDavidWS-vignette.pdf
result<-addList(david, someGenes, idType="ENSEMBL_GENE_ID", listName=paste0("list_", sample(10000)[1]), listType="Gene")
david %>% setAnnotationCategories(c(
"GOTERM_CC_FAT",
"GOTERM_MF_FAT",
"GOTERM_BP_FAT",
"PANTHER_PATHWAY",
"REACTOME_PATHWAY",
"KEGG_PATHWAY",
"GOTERM_CC_FAT",
"GOTERM_MF_FAT",
"GOTERM_BP_FAT"
))
annoChart <-getFunctionalAnnotationChart(david)
# clusterReport <-getClusterReport(david)
return(annoChart %>% subset(select=-Genes))
}
enrResults <- degs %>% group_by(sample_1, sample_2, sample_1_overex) %>% do(davidAnnotationChart(.$gene_id))
write.delim(enrResults, file="enrResults.txt")
# enrResults <- read.delim("enrResults.txt")
#' [Enrichment Results](enrResults.txt)
sigEnrResults <- subset(enrResults, Bonferroni <0.01)
write.delim(sigEnrResults, file="sigEnrResults.txt")
# sigEnrResults <- read.delim("sigEnrResults.txt")
#' [Very Significant Terms](sigEnrResults.txt)
## plot the enrichment results
#sigEnrResults %>% group_by(Category, add=T) %>% do({
# logPlot <- . %>% ggplot(aes(Term, PValue)) +
# geom_bar(stat="identity")+coord_flip() +
# xlab("Enriched Terms") +
# ggtitle(.$Category[1]) +
# scale_y_log10()
# print(logPlot)
#})
sigEnrResults %>% do({
enrResultsGrp <- .
label <- apply(enrResultsGrp[1,1:3], 1, paste, collapse="_")
echo("processing", label)
logPlot <- enrResultsGrp %>% ggplot(aes(reorder(Term, as.integer(Category)), PValue, fill=Category)) +
geom_bar(stat="identity")+
coord_flip() +
xlab("Enriched Terms") +
ggtitle(label) +
scale_y_log10()
print(logPlot)
})
#+ include=FALSE
#ggsave2(ggplot(sigEnrResults, aes(set)) + geom_bar() + ggtitle("enriched term frequencies")) # + facet_wrap(~site_type))
#sigEnrResults <- arrange(sigEnrResults, site_type, set, -Bonferroni)
#sigEnrResults$Term <-lockFactorOrder(sigEnrResults$Term)
#ggplot(sigEnrResults, aes(Term, -log10(Bonferroni))) +coord_flip() + geom_bar() + facet_wrap(~site_type + set)
#ggplot(sigEnrResults, aes(reorder(set, -Bonferroni), -log10(Bonferroni), label=Term)) + geom_text(alpha=0.6, size=3) + ggtitle("enriched terms by set") + scale_y_log10()
#ggsave2(width=14, height=12)
#write.table(sigEnrResults, file=concat("sigEnrTerms.txt"), row.names=FALSE, sep="\t")
########################################################################################################################
#' ## Enriched KEGG Pathways
#+ echo=FALSE, warning=FALSE
require(pathview)
require(png)
with(sigEnrResults, as.data.frame(table(Category)))
keggPathways <- sigEnrResults %>%
filter(Category=="KEGG_PATHWAY") %>%
transform(kegg = colsplit(Term, "\\:", names = c('pathway_id', 'description')))
if(nrow(keggPathways)==0){
echo("no enriched kegg pathways were found in the dataset")
}
keggPathways %>% rowwise() %>% do({
curKP <- .
curKP <- keggPathways[1,]
geneData <- merge(curKP, degs) %>% transmute(gene_id, log2_fold_change) %>% column2rownames("gene_id")
## prepare gene data
pv.out <- pathview(gene.data = geneData, pathway.id = curKP$kegg.pathway_id, species = "mmu", out.suffix = curKP$kegg.description, node.sum = "mean", limit = list(gene = 3, cpd = 3), gene.idtype="ensembl")
outfile <- paste(pathway_ids[i], ".", pathway_names[i], ".png", sep = "")
ima <- readPNG(outfile)
plot.new()
lim <- par()
rasterImage(ima, lim$usr[1], lim$usr[3], lim$usr[2], lim$usr[4])
})
#+ include=FALSE
# ########################################################################################################################
# #' ## Protein-Protein Interaction Clusters using String
#
# ## see http://www.bioconductor.org/packages/release/bioc/vignettes/STRINGdb/inst/doc/STRINGdb.pdf
#
# #' We can use the string-db web service to look for protein-protein interactions.
# #' This normally results in a large hairball and many unconnected nodes and so we can look for enriched clusters.
# #' Here we plot using (high confidence interactions) enriched clusters with a node size greater than 5 proteins for our list of candidate genes showing up in at least two replicates
#
# #+ PPI plots , fig.width=30, fig.height=30, fig.retina=2, warning=FALSE, tidy=TRUE
#
# #taxTable <- list(ENSG=9606, ENSMUSG=10090)
# #taxId <- taxTable[[degs$gene_id[1] %>% ac() %>% str_extract("^([A-z]+)")]]
# #
# #require.auto(STRINGdb)
# #
# #string_db <- STRINGdb$new(score_threshold=400, species=taxId, input_directory="/local2/user_data/brandl/data/stringr") ## score threshold default is 400
# #example1_mapped <- string_db$map( summary_allgids_df_sighit_gt1, "supplier_gene_symbol", species= removeUnmappedRows = TRUE )
# #clustersList <- string_db$get_clusters(example1_mapped$STRING_id)
# #for(network in clustersList)
# #{
# # if(length(network)>5)
# # {
# # string_db$plot_network(network, add_link=FALSE)
# # }
# #}
#
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment