started generic dge report

8e25fe32 · Holger Brandl · 9c14c590 · 8e25fe32
Commit 8e25fe32 authored 10 years ago by Holger Brandl
--- a/dge_workflow/dge_analysis.R
+++ b/dge_workflow/dge_analysis.R
+#+ include=FALSE
+## cd ~/mnt/meninges/meninges/plus_minus
+## source <(curl https://dl.dropboxusercontent.com/u/113630701/datautils/R/utils/spinr.sh 2>&1 2>/dev/null)
+## spinr ~/mnt/meninges/from_holger/scripts/plusminus/DiffexPlusMinus.R
+
+## cd ~/mnt/meninges/meninges/plus_minus_only
+
+
+########################################################################################################################
+#' # Mouse aRG +/- Differential gene Expression Analysis
+
+#' Alignment done already in May 2014: `/Volumes/meninges/from_holger/mouse/mouse_aRGpm`
+#' 2 Cuffdiff configratutions possilbe (just plus and minus or complete mouse data from paper)
+
+########################################################################################################################
+#+ setup2, cache=FALSE, message=FALSE, echo=FALSE
+
+
+#source("http://bioconductor.org/biocLite.R")
+#biocLite("RDAVIDWebService")
+
+require(biomaRt)
+require(RDAVIDWebService)
+
+
+devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R")
+devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R")
+
+require(cummeRbund)
+devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/bio/cummerutils.R")
+
+
+options(width=300) ## always overflow boxes to keep tables in shape with scrollbars
+options(java.parameters = "-Xmx4g" ) # required by read.xlsx
+
+require.auto(xlsx)
+
+
+#dbDir="/home/brandl/mnt/meninges/from_holger/mouse/mouse_aRGpm/cuffdiff"
+dbDir=dirname(getwd())
+
+#setwd("~/mnt/meninges/meninges/plus_minus_only")
+#dbDir="/home/brandl/mnt/meninges/from_holger/mouse/mouse_aRGpm/cuffdiff_pm"
+
+#mcdir(file.path(dirname(dbDir), "dge_analysis"))
+#mcdir(file.path(dbDir, "dge_analysis"))
+#knitr::opts_knit$set(root.dir = getwd())
+
+
+#######################################################################################################################
+#' # Cuffdiff Report
+#+ load_db, cache=FALSE
+
+
+cuff <- readCufflinks(dir=dbDir)
+
+cuff
+runInfo(cuff)
+replicates(cuff)
+
+
+
+#######################################################################################################################
+#' ## Quality Control
+
+
+# create some global summary statistic plots
+dispersionPlot(genes(cuff)) #+ aes(alpha=0.01)
+
+csDensity(genes(cuff))
+#ggsave2(csBoxplot(genes(cuff))+ ggtitle("fpkm boxplots"))
+
+csDensity(genes(cuff),features=T)
+csBoxplot(genes(cuff),replicates=T)
+
+fpkmSCVPlot(genes(cuff))
+
+
+#csScatter(genes(cuff), "a", "d", smooth=F) + aes(color=abs(log2(a/b))>2) + scale_colour_manual(values=c("darkgreen", "red"))
+
+#MAplot(genes(cuff), "a", "d")+ aes(color=abs(log2(M))>2)+ ggtitle("MA-plot: a/d")
+#MAplot(genes(cuff), "a", "b", useCount=T) #+ aes(color=abs(log2(M))>2)+ ggtitle("MA-plot: Luc/KD")
+
+
+#csVolcano(genes(cuff),"aRGm", "N") + ggtitle("odd line-shaped fan-out in volcano plot")
+
+
+#csVolcano(genes(cuff),"a", "b")
+
+csVolcanoMatrix(genes(cuff))
+#csVolcano(genes(cuff),"a", "b")
+
+
+
+require.auto(edgeR)
+plotMDS(repFpkmMatrix(genes(cuff)), top=500, main="top500 MDS")
+
+MDSplot(genes(cuff)) + ggtitle("mds plot")
+
+MDSplot(genes(cuff), replicates=T) + ggtitle("mds plot")
+
+PCAplot(genes(cuff),"PC1","PC2",replicates=T)
+
+noTextLayer <- function(gg){ gg$layers[[2]] <- NULL;; gg}
+
+noTextLayer(csDistHeat(genes(cuff)) +  ggtitle("mouse zone correlation"))
+
+noTextLayer(csDistHeat(genes(cuff),replicates=T) +  ggtitle("mouse replicate correlation")) #+ scale_fill_gradientn(colours=c(low='lightyellow',mid='orange',high='darkred'), limits=c(0,0.15))
+
+csDendro(genes(cuff),replicates=T)
+
+#pdf(paste0(basename(getwd()),"_rep_clustering.pdf"))
+#csDendro(genes(cuff),replicates=T)
+#dev.off()
+
+
+#######################################################################################################################
+#' ## Extract and save tables
+
+#xls(runInfo(cuff))
+#setwd("/Volumes/project-zeigerer/Rab5_NGS/results/")
+
+geneFpkm<-fpkm(genes(cuff))
+write.delim(geneFpkm, file="geneFpkm.txt")
+# gene.fpkm <- read.delim("gene.fpkm.txt")
+
+geneCounts<-count(genes(cuff))
+write.delim(geneCounts, file="geneCounts.txt")
+geneCgene.matrix<-fpkmMatrix(genes(cuff))
+
+repGeneFPKMs <- repFpkmMatrix(genes(cuff))
+save(repGeneFPKMs, file="repGeneFPKMs.RData")
+write.delim(repGeneFPKMs, file="repGeneFPKMs.txt")
+# repGeneFPKMs <- read.delim("repGeneFPKMs.txt")
+# repGeneFPKMs <- local(get(load("repGeneFPKMs.RData")))
+
+
+
+#######################################################################################################################
+#' ## Extract lists of differentially expressed genes
+
+
+coi <- c("aRGm", "aRGp")
+allDiff <- diffData(genes(cuff)) %>%
+    filter(sample_1 %in% coi, sample_2 %in% coi) %>%
+    ##  discard all genes that are not expressed in any of the cell types (>1)
+    filter(gene_id %in% getExpressedGenes(cuff))
+
+
+
+#degs <- subset(allDiff, q_value<0.01)
+allDiff <- transform(allDiff, isHit=p_value<=0.01)
+
+degs <- subset(allDiff, isHit)
+#degs <- subset(allDiff, significant=="yes")
+degs <- transform(degs, sample_1_overex=log2_fold_change<0)
+
+with(degs, as.data.frame(table(sample_1, sample_2, sample_1_overex))) %>% filter(Freq >0)
+
+ggplot(degs, aes(paste(sample_1, "vs",  sample_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DGE count summary") + coord_flip()
+
+ggplot(degs, aes(paste(sample_1, "vs",  sample_2), fill=sample_1_overex)) + geom_bar(position="dodge") + xlab(NULL) + ylab("# DGEs") + ggtitle("DGE counts by direction") + coord_flip()
+
+
+## todo move up into table export section
+## add gene description
+if(!file.exists("geneInfo.RData")){
+    mousemart = useDataset("mmusculus_gene_ensembl", mart=useMart("ensembl"))
+    geneInfo <- getBM(attributes=c('ensembl_gene_id', 'external_gene_name', 'description', 'gene_biotype'), mart=mousemart);
+    save(geneInfo, file="geneInfo.RData")
+}else{
+    geneInfo <- local(get(load("geneInfo.RData")))
+}
+
+gFpkmMatrix <- rownames2column(fpkmMatrix(genes(cuff)), "ensembl_gene_id")
+#colnames(gFpkmMatrix) <- ifelse(is.na(sampleDic[colnames(gFpkmMatrix)]), colnames(gFpkmMatrix), sampleDic[colnames(gFpkmMatrix)])
+geneFpkmWithInfo <- merge(geneInfo, gFpkmMatrix, by.x="ensembl_gene_id", all.y=T) %>% print_head()
+
+
+write.delim(geneFpkmWithInfo, file="geneFpkmWithInfo.txt")
+# geneFpkmWithInfo <- read.delim("geneFpkmWithInfo.txt")
+
+#' [annotated expresssion table](geneFpkmWithInfo.txt)
+
+
+degs <- merge(degs, geneInfo, by.x="gene_id", by.="ensembl_gene_id", all.x=T)
+
+subset(degs, !duplicated(paste(sample_1, sample_2, sample_1_overex, external_gene_name))) %>% with(., as.data.frame(table(sample_1, sample_2, sample_1_overex))) %>% filter(Freq>0)
+
+
+write.delim(degs, file="degs.txt")
+# degs <- read.delim("degs.txt")
+#' [dge table](degs.txt)
+
+
+########################################################################################################################
+#` ## Term enrichment
+#+ echo=FALSE
+
+
+geneLists <- degs %>%  transmute(gene_id, list_id=paste(sample_1, "vs", sample_2, "ovex", sample_1_overex, sep="_"))
+
+
+## http://www.bioconductor.org/packages/release/bioc/vignettes/RDAVIDWebService/inst/doc/RDavidWS-vignette.pdf
+## e.g. getClusterReport --> plot2D
+
+save(degs, file="degs.RData")
+# degs <- local(get(load("degs.RData")))
+
+
+davidAnnotationChart <- function(someGenes){ ## expexted to have a column with gene_id
+    echo("processing list with", length(someGenes), "genes")
+#    someGenes <- degs$gene_id
+
+
+    if(length(someGenes)>1500){
+        someGenes <- sample(someGenes) %>% head(1500)
+    }
+
+    david<-DAVIDWebService$new(email="brandl@mpi-cbg.de")
+
+#    getTimeOut(david)
+    setTimeOut(david, 80000) ## http://www.bioconductor.org/packages/release/bioc/vignettes/RDAVIDWebService/inst/doc/RDavidWS-vignette.pdf
+
+    result<-addList(david, someGenes, idType="ENSEMBL_GENE_ID", listName=paste0("list_", sample(10000)[1]), listType="Gene")
+
+    david %>% setAnnotationCategories(c(
+            "GOTERM_CC_FAT",
+            "GOTERM_MF_FAT",
+            "GOTERM_BP_FAT",
+
+            "PANTHER_PATHWAY",
+            "REACTOME_PATHWAY",
+            "KEGG_PATHWAY",
+
+            "GOTERM_CC_FAT",
+            "GOTERM_MF_FAT",
+            "GOTERM_BP_FAT"
+    ))
+
+    annoChart <-getFunctionalAnnotationChart(david)
+
+#    clusterReport <-getClusterReport(david)
+
+    return(annoChart %>% subset(select=-Genes))
+}
+
+
+enrResults <- degs %>% group_by(sample_1, sample_2, sample_1_overex) %>% do(davidAnnotationChart(.$gene_id))
+
+write.delim(enrResults, file="enrResults.txt")
+# enrResults <- read.delim("enrResults.txt")
+#'  [Enrichment Results](enrResults.txt)
+
+sigEnrResults <- subset(enrResults, Bonferroni <0.01)
+
+write.delim(sigEnrResults, file="sigEnrResults.txt")
+# sigEnrResults <- read.delim("sigEnrResults.txt")
+#'  [Very Significant Terms](sigEnrResults.txt)
+
+
+## plot the enrichment results
+#sigEnrResults %>% group_by(Category, add=T) %>% do({
+#    logPlot <- . %>% ggplot(aes(Term, PValue)) +
+#	    geom_bar(stat="identity")+coord_flip() +
+#	    xlab("Enriched Terms") +
+#	    ggtitle(.$Category[1]) +
+#	    scale_y_log10()
+#	    print(logPlot)
+#})
+
+sigEnrResults %>% do({
+    enrResultsGrp <- .
+    label <- apply(enrResultsGrp[1,1:3], 1, paste, collapse="_")
+    echo("processing", label)
+
+    logPlot <- enrResultsGrp %>% ggplot(aes(reorder(Term, as.integer(Category)), PValue, fill=Category)) +
+	    geom_bar(stat="identity")+
+	    coord_flip() +
+	    xlab("Enriched Terms") +
+	    ggtitle(label) +
+	    scale_y_log10()
+
+	    print(logPlot)
+})
+
+#+ include=FALSE
+#ggsave2(ggplot(sigEnrResults, aes(set)) + geom_bar() + ggtitle("enriched term frequencies")) # + facet_wrap(~site_type))
+
+#sigEnrResults <- arrange(sigEnrResults, site_type, set, -Bonferroni)
+#sigEnrResults$Term <-lockFactorOrder(sigEnrResults$Term)
+
+#ggplot(sigEnrResults, aes(Term, -log10(Bonferroni))) +coord_flip() + geom_bar() + facet_wrap(~site_type + set)
+#ggplot(sigEnrResults, aes(reorder(set, -Bonferroni), -log10(Bonferroni), label=Term)) + geom_text(alpha=0.6, size=3) + ggtitle("enriched terms by set") + scale_y_log10()
+#ggsave2(width=14, height=12)
+
+#write.table(sigEnrResults, file=concat("sigEnrTerms.txt"), row.names=FALSE,  sep="\t")
+
+
+########################################################################################################################
+#' ## Enriched KEGG Pathways
+#+ echo=FALSE, warning=FALSE
+
+require(pathview)
+require(png)
+
+
+with(sigEnrResults, as.data.frame(table(Category)))
+
+keggPathways <- sigEnrResults %>%
+    filter(Category=="KEGG_PATHWAY") %>%
+    transform(kegg = colsplit(Term, "\\:", names = c('pathway_id', 'description')))
+
+if(nrow(keggPathways)==0){
+    echo("no enriched kegg pathways were found in the dataset")
+}
+
+
+keggPathways %>% rowwise() %>% do({
+    curKP <- .
+    curKP <- keggPathways[1,]
+    geneData <- merge(curKP, degs) %>% transmute(gene_id, log2_fold_change) %>% column2rownames("gene_id")
+    ## prepare gene data
+
+    pv.out <- pathview(gene.data = geneData, pathway.id = curKP$kegg.pathway_id, species = "mmu", out.suffix = curKP$kegg.description, node.sum = "mean", limit = list(gene = 3, cpd = 3), gene.idtype="ensembl")
+        outfile <- paste(pathway_ids[i], ".", pathway_names[i], ".png", sep = "")
+        ima <- readPNG(outfile)
+
+    	plot.new()
+    	lim <- par()
+    	rasterImage(ima, lim$usr[1], lim$usr[3], lim$usr[2], lim$usr[4])
+
+})
+
+
+
+#+ include=FALSE
+# ########################################################################################################################
+# #' ## Protein-Protein Interaction Clusters using String
+#
+# ## see http://www.bioconductor.org/packages/release/bioc/vignettes/STRINGdb/inst/doc/STRINGdb.pdf
+#
+# #' We can use the string-db web service to look for protein-protein interactions.
+# #' This normally results in a large hairball and many unconnected nodes and so we can look for enriched clusters.
+# #' Here we plot using (high confidence interactions) enriched clusters with a node size greater than 5 proteins for our list of candidate genes showing up in at least two replicates
+#
+# #+ PPI plots , fig.width=30, fig.height=30, fig.retina=2, warning=FALSE, tidy=TRUE
+#
+# #taxTable <- list(ENSG=9606,  ENSMUSG=10090)
+# #taxId <- taxTable[[degs$gene_id[1] %>% ac() %>% str_extract("^([A-z]+)")]]
+# #
+# #require.auto(STRINGdb)
+# #
+# #string_db <- STRINGdb$new(score_threshold=400, species=taxId, input_directory="/local2/user_data/brandl/data/stringr") ## score threshold default is 400
+# #example1_mapped <- string_db$map( summary_allgids_df_sighit_gt1, "supplier_gene_symbol", species= removeUnmappedRows = TRUE )
+# #clustersList <- string_db$get_clusters(example1_mapped$STRING_id)
+# #for(network in clustersList)
+# #{
+# #	if(length(network)>5)
+# #	{
+# #		string_db$plot_network(network, add_link=FALSE)
+# #	}
+# #}
+#
\ No newline at end of file