#!/usr/bin/env Rscript #+ setup, cache=FALSE suppressMessages(require(docopt)) doc <- ' Convert a cuffdiff into a dge-report Usage: dge_analysis.R [options] <cuffdb_directory> Options: --constrasts=<tab_delim_table> Table with sample pairs for which dge analysis should be performed --undirected Perform directed dge-steps in a directed manner --qcutoff <qcutoff> Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01] --pcutoff <pcutoff> Override q-value filter and filter by p-value instead -S Dont do general statistics and qc analysis ' #opts <- docopt(doc, commandArgs(TRUE)) ## does not work when spining # Stefania: opts <- docopt(doc, "--undirected --pcutoff 0.05 ..") # opts <- docopt(doc, "--undirected ..") # opts <- docopt(doc, paste(Sys.getenv("DGE_PARAMS"), "..")) opts <- docopt(doc, paste(Sys.getenv("DGE_PARAMS"), paste(commandArgs(TRUE), collapse=" "))) #opts skipQC <<- opts$S split_hit_list <- !opts$undirected constrasts_file <- opts$constrasts pcutoff <- if(is.null(opts$pcutoff)) NULL else as.numeric(opts$pcutoff) qcutoff <- if(is.numeric(pcutoff)) NULL else as.numeric(opts$qcutoff) stopifnot(is.numeric(qcutoff) | is.numeric(pcutoff)) cuffdb_directory <- normalizePath(opts$cuffdb_directory) if(is.na(file.info(cuffdb_directory)$isdir)){ stop(paste("db directory does not exist", doc)) } devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R") devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R") require(cummeRbund) devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/bio/diffex_commons.R") #knitr::opts_knit$set(root.dir = getwd()) ######################################################################################################################## #' # Differential Gene Expression Analysis ## todo Small overview about pipeline #' [CuffDiff](http://cufflinks.cbcb.umd.edu/manual.html) and [cummeRbund](http://compbio.mit.edu/cummeRbund/) were used to perform this analysis. For details about how the test for differntial expression works, refer to [Differential analysis of gene regulation at transcript resolution with RNA-seq](http://www.nature.com/nbt/journal/v31/n1/full/nbt.2450.html). #print(paste("db dir is", cuffdb_directory)) #' Used cuffdiff database in `r cuffdb_directory` #+ load_db, cache=FALSE ## workaround until sqlite is fixed on lustre tmpdir <- tempfile("cuffdb_dir") system(paste("cp -r", cuffdb_directory, tmpdir)) cuff <- readCufflinks(dir=tmpdir) cuff runInfo(cuff) replicateInfo <- replicates(cuff) %>% mutate(file=basename(file)) %>% select(-c(total_mass, norm_mass, internal_scale, external_scale)) replicateInfo write.delim(replicateInfo, file="replicateInfo.txt") # replicateInfo <- read.delim("replicateInfo.txt") #' [replicateInfo](replicateInfo.txt) #+ eval=skipQC #hello hidden field ####################################################################################################################### #' ## Count and Expression Score Tables ## todo merge in basic gene info into all tables ## add gene description martName <- guess_mart(fpkm(genes(cuff))$gene_id) cacheFile <- paste0("geneInfo.",martName, ".RData") if(!file.exists(cacheFile)){ require(biomaRt) mousemart = useDataset(martName, mart=useMart("ensembl")) geneInfo <- getBM(attributes=c('ensembl_gene_id', 'external_gene_name', 'description', 'gene_biotype'), mart=mousemart); save(geneInfo, file=cacheFile) unloadNamespace('biomaRt') }else{ geneInfo <- local(get(load(cacheFile))) } #colnames(gFpkmMatrix) <- ifelse(is.na(sampleDic[colnames(gFpkmMatrix)]), colnames(gFpkmMatrix), sampleDic[colnames(gFpkmMatrix)]) fpkmMatrix(genes(cuff)) %>% rownames2column("ensembl_gene_id") %>% merge(geneInfo, by="ensembl_gene_id", all.x=T) %>% print_head() %>% write.delim(file="gene_fpkms.txt") # gene_fpkms <- read.delim("gene_fpkms.txt") #' [Annotated Expresssion Table](gene_fpkms.txt) ## export the same but now including replicate information repFpkmMatrix(genes(cuff)) %>% rownames2column("ensembl_gene_id") %>% merge(geneInfo, by="ensembl_gene_id", all.x=T) %>% print_head() %>% write.delim(file="gene_fpkms_by_replicate.txt") # gene_fpkms <- read.delim("gene_fpkms_by_replicate.txt") #' [Annotated Expresssion Table](gene_fpkms_by_replicate.txt) fpkm(genes(cuff)) %>% dplyr::rename(ensembl_gene_id=gene_id) %>% merge(geneInfo, by="ensembl_gene_id", all.x=T) %>% write.delim(file="geneFpkm.txt") ## also export count tables count(genes(cuff)) %>% dplyr::rename(ensembl_gene_id=gene_id) %>% merge(geneInfo, by="ensembl_gene_id", all.x=T) %>% write.delim(file="gene_counts.txt") # gene_counts <- read.delim("gene_counts.txt") #' [Gene Fragment Counts](gene_counts.txt) countMatrix(genes(cuff)) %>% rownames2column("ensembl_gene_id") %>% merge(geneInfo, by="ensembl_gene_id", all.x=T) %>% write.delim(file="replicateCounts.txt") # repCounts <- read.delim("replicateCounts.txt") #' [Raw Counts by Replicate](replicateCounts.txt) ##ggplot(geneFpkm, aes(fpkm)) + geom_histogram() + scale_x_log10() ####################################################################################################################### #' ## Score Distributions #if(!as.logical(opts$S)){ # create some global summary statistic plots dispersionPlot(genes(cuff)) #+ aes(alpha=0.01) csDensity(genes(cuff)) #ggsave2(csBoxplot(genes(cuff))+ ggtitle("fpkm boxplots")) csDensity(genes(cuff),features=T) csBoxplot(genes(cuff),replicates=T) fpkmSCVPlot(genes(cuff)) #csScatter(genes(cuff), "a", "d", smooth=F) + aes(color=abs(log2(a/b))>2) + scale_colour_manual(values=c("darkgreen", "red")) #MAplot(genes(cuff), "a", "d")+ aes(color=abs(log2(M))>2)+ ggtitle("MA-plot: a/d") #MAplot(genes(cuff), "a", "b", useCount=T) #+ aes(color=abs(log2(M))>2)+ ggtitle("MA-plot: Luc/KD") #csVolcano(genes(cuff),"aRGm", "N") + ggtitle("odd line-shaped fan-out in volcano plot") #csVolcano(genes(cuff),"a", "b") ## todo replace or complement with marta plot csVolcanoMatrix(genes(cuff)) #csVolcano(genes(cuff),"a", "b") ####################################################################################################################### #' ## Replicate Correlation and Clustering require.auto(edgeR) plotMDS(repFpkmMatrix(genes(cuff)), top=500, main="top500 MDS") if(unlen(replicates(cuff)$sample)>2){ ## >2 only, because MDS will fail with just 2 samples MDSplot(genes(cuff)) + ggtitle("mds plot") } MDSplot(genes(cuff), replicates=T) + ggtitle("mds plot") ## todo replace with marta plot #PCAplot(genes(cuff),"PC1","PC2",replicates=T) noTextLayer <- function(gg){ gg$layers[[2]] <- NULL;; gg} ## todo use fix scaled here noTextLayer(csDistHeat(genes(cuff)) + ggtitle("Sample correlation")) noTextLayer(csDistHeat(genes(cuff),replicates=T) + ggtitle("Replicate Correlation")) #+ scale_fill_gradientn(colours=c(low='lightyellow',mid='orange',high='darkred'), limits=c(0,0.15)) csDendro(genes(cuff),replicates=T) #pdf(paste0(basename(getwd()),"_rep_clustering.pdf")) #csDendro(genes(cuff),replicates=T) #dev.off() ####################################################################################################################### #' ## Extract lists of differentially expressed genes #compPairsUniDir <- data.frame(sample_1="big_cyst", sample_2="small_cyst") if(!is.null(constrasts_file)){ # if(file.exists(constrasts_file)) echo("using contrasts matrix from: ", basename(constrasts_file)) compPairsUniDir <- read.csv(constrasts_file, header=F) %>% set_names(c("sample_1", "sample_2")) ## add other direction for completeness compPairs <- rbind_list(compPairsUniDir, compPairsUniDir %>% select(sample_1=sample_2, sample_2=sample_1)) }else{ echo("comparing all samples against each other") compPairs <- diffData(genes(cuff)) %>% select(sample_1, sample_2) %>% distinct() } allDiff <- diffData(genes(cuff)) %>% ## just keep pairs of interest merge(compPairs) %>% ## discard all genes that are not expressed in any of the cell types (>1) filter(gene_id %in% getExpressedGenes(cuff)) %>% dplyr::rename(ensembl_gene_id=gene_id) %>% mutate(sample_1_overex=log2_fold_change<0) ## save as reference merge(allDiff, geneInfo, by="ensembl_gene_id", all.x=T) %>% write.delim(file="diffData.txt") # diffData <- read.delim("diffData.txt") #' [diffData](diffData.txt) ## ' Used cutoff criterion was `r if(use_pcutoff) echo("p_value<0.01") else echo("q_value<=0.01")` #+ results='asis' if(!is.null(qcutoff)){ echo("Using q-value cutoff of ", qcutoff) allDiff <- transform(allDiff, isHit=q_value<=qcutoff) }else{ echo("Using p-value cutoff of ", pcutoff) allDiff <- transform(allDiff, isHit=p_value<=pcutoff) } #+ # #' Used cutoff criterion was: q_value<0.01 # allDiff <- transform(allDiff, isHit=q_value<0.01) degs <- subset(allDiff, isHit) #degs <- subset(allDiff, significant=="yes") with(degs, as.data.frame(table(sample_1, sample_2, sample_1_overex))) %>% filter(Freq >0) ## add gene info degs <- merge(degs, geneInfo, by="ensembl_gene_id", all.x=T) write.delim(degs, file="degs.txt") # degs <- read.delim("degs.txt") #' [dge table](degs.txt) ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DGE count summary") + coord_flip() ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=sample_1_overex)) + geom_bar(position="dodge") + xlab(NULL) + ylab("# DGEs") + ggtitle("DGE counts by direction") + coord_flip() ## just needed to restore environment easily save(degs, file=".degs.RData") # degs <- local(get(load(".degs.RData"))) ######################################################################################################################## #' ## Term enrichment #+ echo=FALSE #' This analysis was performed using [David](http://david.abcc.ncifcrf.gov/). The following ontologies were tested: `r paste(ontologies, collapse=', ')` geneLists <- degs %>% # transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2, "ovex", sample_1_overex, sep="_")) transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2)) grpdDegs <- if(split_hit_list){ degs %>% group_by(sample_1, sample_2, sample_1_overex) }else{ degs %>% group_by(sample_1, sample_2) } enrResults <- grpdDegs %>% do(davidAnnotationChart(.$ensembl_gene_id)) write.delim(enrResults, file="enrResults.txt") # enrResults <- read.delim("enrResults.txt") #' [Enrichment Results](enrResults.txt) sigEnrResults <- subset(enrResults, Bonferroni <0.01) write.delim(sigEnrResults, file="sigEnrResults.txt") # sigEnrResults <- read.delim("sigEnrResults.txt") #' [Very Significant Terms](sigEnrResults.txt) ## plot the enrichment results #sigEnrResults %>% group_by(Category, add=T) %>% do({ # logPlot <- . %>% ggplot(aes(Term, PValue)) + # geom_bar(stat="identity")+coord_flip() + # xlab("Enriched Terms") + # ggtitle(.$Category[1]) + # scale_y_log10() # print(logPlot) #}) sigEnrResults %>% do({ enrResultsGrp <- . label <- apply(enrResultsGrp[1,1:3], 1, paste, collapse="_") echo("processing", label) logPlot <- enrResultsGrp %>% ggplot(aes(reorder(Term, as.integer(Category)), PValue, fill=Category)) + geom_bar(stat="identity")+ coord_flip() + xlab("Enriched Terms") + ggtitle(label) + scale_y_log10() print(logPlot) }) #+ include=FALSE #ggsave2(ggplot(sigEnrResults, aes(set)) + geom_bar() + ggtitle("enriched term frequencies")) # + facet_wrap(~site_type)) #sigEnrResults <- arrange(sigEnrResults, site_type, set, -Bonferroni) #sigEnrResults$Term <-lockFactorOrder(sigEnrResults$Term) #ggplot(sigEnrResults, aes(Term, -log10(Bonferroni))) +coord_flip() + geom_bar() + facet_wrap(~site_type + set) #ggplot(sigEnrResults, aes(reorder(set, -Bonferroni), -log10(Bonferroni), label=Term)) + geom_text(alpha=0.6, size=3) + ggtitle("enriched terms by set") + scale_y_log10() #ggsave2(width=14, height=12) #write.table(sigEnrResults, file=concat("sigEnrTerms.txt"), row.names=FALSE, sep="\t") #+ include=FALSE # ######################################################################################################################## # #' ## Enriched KEGG Pathways # #require(pathview) #require(png) # # #with(sigEnrResults, as.data.frame(table(Category))) # #keggPathways <- sigEnrResults %>% # filter(Category=="KEGG_PATHWAY") %>% # transform(kegg = colsplit(Term, "\\:", names = c('pathway_id', 'description'))) # #if(nrow(keggPathways)==0){ # echo("no enriched kegg pathways were found in the dataset") #} # # #keggPathways %>% rowwise() %>% do({ # curKP <- . # curKP <- keggPathways[1,] # geneData <- merge(curKP, degs) %>% transmute(ensembl_gene_id, log2_fold_change) %>% column2rownames("ensembl_gene_id") # ## prepare gene data # # pv.out <- pathview(gene.data = geneData, pathway.id = curKP$kegg.pathway_id, species = "mmu", out.suffix = curKP$kegg.description, node.sum = "mean", limit = list(gene = 3, cpd = 3), gene.idtype="ensembl") # outfile <- paste(pathway_ids[i], ".", pathway_names[i], ".png", sep = "") # ima <- readPNG(outfile) # # plot.new() # lim <- par() # rasterImage(ima, lim$usr[1], lim$usr[3], lim$usr[2], lim$usr[4]) # #}) #+ include=FALSE # ######################################################################################################################## # #' ## Protein-Protein Interaction Clusters using String # # ## see http://www.bioconductor.org/packages/release/bioc/vignettes/STRINGdb/inst/doc/STRINGdb.pdf # # #' We can use the string-db web service to look for protein-protein interactions. # #' This normally results in a large hairball and many unconnected nodes and so we can look for enriched clusters. # #' Here we plot using (high confidence interactions) enriched clusters with a node size greater than 5 proteins for our list of candidate genes showing up in at least two replicates # # #+ PPI plots , fig.width=30, fig.height=30, fig.retina=2, warning=FALSE, tidy=TRUE # # #taxTable <- list(ENSG=9606, ENSMUSG=10090) # #taxId <- taxTable[[degs$gene_id[1] %>% ac() %>% str_extract("^([A-z]+)")]] # # # #require.auto(STRINGdb) # # # #string_db <- STRINGdb$new(score_threshold=400, species=taxId, input_directory="/local2/user_data/brandl/data/stringr") ## score threshold default is 400 # #example1_mapped <- string_db$map( summary_allgids_df_sighit_gt1, "supplier_gene_symbol", species= removeUnmappedRows = TRUE ) # #clustersList <- string_db$get_clusters(example1_mapped$STRING_id) # #for(network in clustersList) # #{ # # if(length(network)>5) # # { # # string_db$plot_network(network, add_link=FALSE) # # } # #} # #' Created `r format(Sys.Date(), format="%B-%d-%Y")` by `r readLines(pipe('whoami'))`