#!/usr/bin/env Rscript #if(!file.exists(r_script)){ # stop(paste("file does not exist\n", doc)) #} ######################################################################################################################## #' Differential gene Expression Analysis #+ include=FALSE #' TBD: Notes ######################################################################################################################## #+ setup #source("http://bioconductor.org/biocLite.R") #biocLite("RDAVIDWebService", suppressUpdates=T) devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R") devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R") require(cummeRbund) devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/bio/cummerutils.R") #options(width=300) ## always overflow boxes to keep tables in shape with scrollbars #options(java.parameters = "-Xmx4g" ) # required by read.xlsx #require.auto(xlsx) suppressMessages(require.auto(docopt)) doc <- ' Convert a cuffdiff into a dge-report Usage: dge_analysis.R <report_name> Options: --cuffdir <DIR> Cache results [default: .] ' #opts <- docopt(doc) opts <- docopt(doc, "test_report") cuffdb_directory=normalizePath(opts$cuffdir) mcdir(opts$report_name) # print(opts) if(is.na(file.info(cuffdb_directory)$isdir)){ stop(paste("db directory does not exist", doc)) } #knitr::opts_knit$set(root.dir = getwd()) ####################################################################################################################### #' # Cuffdiff DB Overview #+ load_db, cache=FALSE #' Used cuffdiff database in `r normalizePath(opts$cuffdir)` ## workaround until cummerbund is fixed tmpdir <- tempfile("cuffdb_dir") system(paste("cp -r", cuffdb_directory, tmpdir)) cuff <- readCufflinks(dir=tmpdir) cuff runInfo(cuff) replicates(cuff) ####################################################################################################################### #' ## Score Distributions and correlation # create some global summary statistic plots dispersionPlot(genes(cuff)) #+ aes(alpha=0.01) csDensity(genes(cuff)) #ggsave2(csBoxplot(genes(cuff))+ ggtitle("fpkm boxplots")) csDensity(genes(cuff),features=T) csBoxplot(genes(cuff),replicates=T) fpkmSCVPlot(genes(cuff)) #csScatter(genes(cuff), "a", "d", smooth=F) + aes(color=abs(log2(a/b))>2) + scale_colour_manual(values=c("darkgreen", "red")) #MAplot(genes(cuff), "a", "d")+ aes(color=abs(log2(M))>2)+ ggtitle("MA-plot: a/d") #MAplot(genes(cuff), "a", "b", useCount=T) #+ aes(color=abs(log2(M))>2)+ ggtitle("MA-plot: Luc/KD") #csVolcano(genes(cuff),"aRGm", "N") + ggtitle("odd line-shaped fan-out in volcano plot") #csVolcano(genes(cuff),"a", "b") csVolcanoMatrix(genes(cuff)) #csVolcano(genes(cuff),"a", "b") require.auto(edgeR) plotMDS(repFpkmMatrix(genes(cuff)), top=500, main="top500 MDS") MDSplot(genes(cuff)) + ggtitle("mds plot") MDSplot(genes(cuff), replicates=T) + ggtitle("mds plot") PCAplot(genes(cuff),"PC1","PC2",replicates=T) noTextLayer <- function(gg){ gg$layers[[2]] <- NULL;; gg} noTextLayer(csDistHeat(genes(cuff)) + ggtitle("mouse zone correlation")) noTextLayer(csDistHeat(genes(cuff),replicates=T) + ggtitle("mouse replicate correlation")) #+ scale_fill_gradientn(colours=c(low='lightyellow',mid='orange',high='darkred'), limits=c(0,0.15)) csDendro(genes(cuff),replicates=T) #pdf(paste0(basename(getwd()),"_rep_clustering.pdf")) #csDendro(genes(cuff),replicates=T) #dev.off() ####################################################################################################################### #' ## Extract and save tables #xls(runInfo(cuff)) #setwd("/Volumes/project-zeigerer/Rab5_NGS/results/") geneFpkm<-fpkm(genes(cuff)) write.delim(geneFpkm, file="geneFpkm.txt") # gene.fpkm <- read.delim("gene.fpkm.txt") geneCounts<-count(genes(cuff)) write.delim(geneCounts, file="geneCounts.txt") geneCgene.matrix<-fpkmMatrix(genes(cuff)) repGeneFPKMs <- repFpkmMatrix(genes(cuff)) save(repGeneFPKMs, file="repGeneFPKMs.RData") write.delim(repGeneFPKMs, file="repGeneFPKMs.txt") # repGeneFPKMs <- read.delim("repGeneFPKMs.txt") # repGeneFPKMs <- local(get(load("repGeneFPKMs.RData"))) ####################################################################################################################### #' ## Extract lists of differentially expressed genes coi <- c("aRGm", "aRGp") allDiff <- diffData(genes(cuff)) %>% filter(sample_1 %in% coi, sample_2 %in% coi) %>% ## discard all genes that are not expressed in any of the cell types (>1) filter(gene_id %in% getExpressedGenes(cuff)) #degs <- subset(allDiff, q_value<0.01) allDiff <- transform(allDiff, isHit=p_value<=0.01) degs <- subset(allDiff, isHit) #degs <- subset(allDiff, significant=="yes") degs <- transform(degs, sample_1_overex=log2_fold_change<0) with(degs, as.data.frame(table(sample_1, sample_2, sample_1_overex))) %>% filter(Freq >0) ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DGE count summary") + coord_flip() ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=sample_1_overex)) + geom_bar(position="dodge") + xlab(NULL) + ylab("# DGEs") + ggtitle("DGE counts by direction") + coord_flip() ## todo move up into table export section ## add gene description if(!file.exists("geneInfo.RData")){ require(biomaRt) mousemart = useDataset("mmusculus_gene_ensembl", mart=useMart("ensembl")) geneInfo <- getBM(attributes=c('ensembl_gene_id', 'external_gene_name', 'description', 'gene_biotype'), mart=mousemart); save(geneInfo, file="geneInfo.RData") }else{ geneInfo <- local(get(load("geneInfo.RData"))) } gFpkmMatrix <- rownames2column(fpkmMatrix(genes(cuff)), "ensembl_gene_id") #colnames(gFpkmMatrix) <- ifelse(is.na(sampleDic[colnames(gFpkmMatrix)]), colnames(gFpkmMatrix), sampleDic[colnames(gFpkmMatrix)]) geneFpkmWithInfo <- merge(geneInfo, gFpkmMatrix, by.x="ensembl_gene_id", all.y=T) %>% print_head() write.delim(geneFpkmWithInfo, file="geneFpkmWithInfo.txt") # geneFpkmWithInfo <- read.delim("geneFpkmWithInfo.txt") #' [annotated expresssion table](geneFpkmWithInfo.txt) degs <- merge(degs, geneInfo, by.x="gene_id", by.="ensembl_gene_id", all.x=T) subset(degs, !duplicated(paste(sample_1, sample_2, sample_1_overex, external_gene_name))) %>% with(., as.data.frame(table(sample_1, sample_2, sample_1_overex))) %>% filter(Freq>0) write.delim(degs, file="degs.txt") # degs <- read.delim("degs.txt") #' [dge table](degs.txt) ######################################################################################################################## #` ## Term enrichment #+ echo=FALSE require(RDAVIDWebService) ## just works if installed on non-network-drive (e.g. /tmp/) geneLists <- degs %>% transmute(gene_id, list_id=paste(sample_1, "vs", sample_2, "ovex", sample_1_overex, sep="_")) ## http://www.bioconductor.org/packages/release/bioc/vignettes/RDAVIDWebService/inst/doc/RDavidWS-vignette.pdf ## e.g. getClusterReport --> plot2D save(degs, file="degs.RData") # degs <- local(get(load("degs.RData"))) davidAnnotationChart <- function(someGenes){ ## expexted to have a column with gene_id echo("processing list with", length(someGenes), "genes") # someGenes <- degs$gene_id if(length(someGenes)>1500){ someGenes <- sample(someGenes) %>% head(1500) } david<-DAVIDWebService$new(email="brandl@mpi-cbg.de") # getTimeOut(david) setTimeOut(david, 80000) ## http://www.bioconductor.org/packages/release/bioc/vignettes/RDAVIDWebService/inst/doc/RDavidWS-vignette.pdf result<-addList(david, someGenes, idType="ENSEMBL_GENE_ID", listName=paste0("list_", sample(10000)[1]), listType="Gene") david %>% setAnnotationCategories(c( "GOTERM_CC_FAT", "GOTERM_MF_FAT", "GOTERM_BP_FAT", "PANTHER_PATHWAY", "REACTOME_PATHWAY", "KEGG_PATHWAY", "GOTERM_CC_FAT", "GOTERM_MF_FAT", "GOTERM_BP_FAT" )) annoChart <-getFunctionalAnnotationChart(david) # clusterReport <-getClusterReport(david) return(annoChart %>% subset(select=-Genes)) } enrResults <- degs %>% group_by(sample_1, sample_2, sample_1_overex) %>% do(davidAnnotationChart(.$gene_id)) write.delim(enrResults, file="enrResults.txt") # enrResults <- read.delim("enrResults.txt") #' [Enrichment Results](enrResults.txt) sigEnrResults <- subset(enrResults, Bonferroni <0.01) write.delim(sigEnrResults, file="sigEnrResults.txt") # sigEnrResults <- read.delim("sigEnrResults.txt") #' [Very Significant Terms](sigEnrResults.txt) ## plot the enrichment results #sigEnrResults %>% group_by(Category, add=T) %>% do({ # logPlot <- . %>% ggplot(aes(Term, PValue)) + # geom_bar(stat="identity")+coord_flip() + # xlab("Enriched Terms") + # ggtitle(.$Category[1]) + # scale_y_log10() # print(logPlot) #}) sigEnrResults %>% do({ enrResultsGrp <- . label <- apply(enrResultsGrp[1,1:3], 1, paste, collapse="_") echo("processing", label) logPlot <- enrResultsGrp %>% ggplot(aes(reorder(Term, as.integer(Category)), PValue, fill=Category)) + geom_bar(stat="identity")+ coord_flip() + xlab("Enriched Terms") + ggtitle(label) + scale_y_log10() print(logPlot) }) #+ include=FALSE #ggsave2(ggplot(sigEnrResults, aes(set)) + geom_bar() + ggtitle("enriched term frequencies")) # + facet_wrap(~site_type)) #sigEnrResults <- arrange(sigEnrResults, site_type, set, -Bonferroni) #sigEnrResults$Term <-lockFactorOrder(sigEnrResults$Term) #ggplot(sigEnrResults, aes(Term, -log10(Bonferroni))) +coord_flip() + geom_bar() + facet_wrap(~site_type + set) #ggplot(sigEnrResults, aes(reorder(set, -Bonferroni), -log10(Bonferroni), label=Term)) + geom_text(alpha=0.6, size=3) + ggtitle("enriched terms by set") + scale_y_log10() #ggsave2(width=14, height=12) #write.table(sigEnrResults, file=concat("sigEnrTerms.txt"), row.names=FALSE, sep="\t") ######################################################################################################################## #' ## Enriched KEGG Pathways #+ echo=FALSE, warning=FALSE require(pathview) require(png) with(sigEnrResults, as.data.frame(table(Category))) keggPathways <- sigEnrResults %>% filter(Category=="KEGG_PATHWAY") %>% transform(kegg = colsplit(Term, "\\:", names = c('pathway_id', 'description'))) if(nrow(keggPathways)==0){ echo("no enriched kegg pathways were found in the dataset") } keggPathways %>% rowwise() %>% do({ curKP <- . curKP <- keggPathways[1,] geneData <- merge(curKP, degs) %>% transmute(gene_id, log2_fold_change) %>% column2rownames("gene_id") ## prepare gene data pv.out <- pathview(gene.data = geneData, pathway.id = curKP$kegg.pathway_id, species = "mmu", out.suffix = curKP$kegg.description, node.sum = "mean", limit = list(gene = 3, cpd = 3), gene.idtype="ensembl") outfile <- paste(pathway_ids[i], ".", pathway_names[i], ".png", sep = "") ima <- readPNG(outfile) plot.new() lim <- par() rasterImage(ima, lim$usr[1], lim$usr[3], lim$usr[2], lim$usr[4]) }) #+ include=FALSE # ######################################################################################################################## # #' ## Protein-Protein Interaction Clusters using String # # ## see http://www.bioconductor.org/packages/release/bioc/vignettes/STRINGdb/inst/doc/STRINGdb.pdf # # #' We can use the string-db web service to look for protein-protein interactions. # #' This normally results in a large hairball and many unconnected nodes and so we can look for enriched clusters. # #' Here we plot using (high confidence interactions) enriched clusters with a node size greater than 5 proteins for our list of candidate genes showing up in at least two replicates # # #+ PPI plots , fig.width=30, fig.height=30, fig.retina=2, warning=FALSE, tidy=TRUE # # #taxTable <- list(ENSG=9606, ENSMUSG=10090) # #taxId <- taxTable[[degs$gene_id[1] %>% ac() %>% str_extract("^([A-z]+)")]] # # # #require.auto(STRINGdb) # # # #string_db <- STRINGdb$new(score_threshold=400, species=taxId, input_directory="/local2/user_data/brandl/data/stringr") ## score threshold default is 400 # #example1_mapped <- string_db$map( summary_allgids_df_sighit_gt1, "supplier_gene_symbol", species= removeUnmappedRows = TRUE ) # #clustersList <- string_db$get_clusters(example1_mapped$STRING_id) # #for(network in clustersList) # #{ # # if(length(network)>5) # # { # # string_db$plot_network(network, add_link=FALSE) # # } # #} #