Newer
Older
--constrasts=<tab_delim_table> Table with sample pairs for which dge analysis should be performed
--undirected Perform directed dge-steps in a directed manner
#opts <- docopt(doc, commandArgs(TRUE)) ## does not work when spining
opts <- docopt(doc, paste(Sys.getenv("DGE_PARAMS"), paste(commandArgs(TRUE), collapse=" ")))
#opts
skipQC <<- opts$S
split_hit_list <- !opts$undirected
constrasts_file <- opts$constrasts
cuffdb_directory <- normalizePath(opts$cuffdb_directory)
if(is.na(file.info(cuffdb_directory)$isdir)){
stop(paste("db directory does not exist", doc))
}
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R")
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R")
require(cummeRbund)
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/bio/cummerutils.R")
########################################################################################################################
#' # Differential Gene Expression Analysis
#' [CuffDiff](http://cufflinks.cbcb.umd.edu/manual.html) and [cummeRbund](http://compbio.mit.edu/cummeRbund/) were used to perform this analysis. For details about how the test for differntial expression works, refer to [Differential analysis of gene regulation at transcript resolution with RNA-seq](http://www.nature.com/nbt/journal/v31/n1/full/nbt.2450.html).
#print(paste("db dir is", cuffdb_directory))
#+ load_db, cache=FALSE
## workaround until sqlite is fixed on lustre
tmpdir <- tempfile("cuffdb_dir")
system(paste("cp -r", cuffdb_directory, tmpdir))
cuff <- readCufflinks(dir=tmpdir)
replicates(cuff) %>% mutate(file=basename(file)) %>% select(-c(total_mass, norm_mass, internal_scale, external_scale))
#######################################################################################################################
#' ## Count and Expression Score Tables
## todo merge in basic gene info into all tables
martName <- guess_mart(fpkm(genes(cuff))$gene_id)
cacheFile <- paste0("geneInfo.",martName, ".RData")
if(!file.exists(cacheFile)){
require(biomaRt)
mousemart = useDataset(martName, mart=useMart("ensembl"))
geneInfo <- getBM(attributes=c('ensembl_gene_id', 'external_gene_name', 'description', 'gene_biotype'), mart=mousemart);
save(geneInfo, file=cacheFile)
}else{
geneInfo <- local(get(load(cacheFile)))
}
#colnames(gFpkmMatrix) <- ifelse(is.na(sampleDic[colnames(gFpkmMatrix)]), colnames(gFpkmMatrix), sampleDic[colnames(gFpkmMatrix)])
fpkmMatrix(genes(cuff)) %>%
rownames2column("ensembl_gene_id") %>%
merge(geneInfo, by.x="ensembl_gene_id", all.x=T) %>%
write.delim(file="geneFpkmWithInfo.txt")
# geneFpkmWithInfo <- read.delim("geneFpkmWithInfo.txt")
#' [Annotated Expresssion Table](geneFpkmWithInfo.txt)
## export the same but now including replicate information
repFpkmMatrix(genes(cuff)) %>%
rownames2column("ensembl_gene_id") %>%
merge(geneInfo, by.x="ensembl_gene_id", all.x=T) %>%
write.delim(file="geneReplicateFpkmWithInfo.txt")
# geneFpkmWithInfo <- read.delim("geneReplicateFpkmWithInfo.txt")
#' [Annotated Expresssion Table](geneReplicateFpkmWithInfo.txt)
#+ eval=FALSE, echo=FALSE
## DEBUG start
if(F){
read.delim("geneReplicateFpkmWithInfo.txt") %>%
filter(external_gene_name=="Insm1") %>%
melt() %>%
ggplot(aes(variable, value)) + geom_bar(stat="identity")
}
## DEBUG end
geneFpkm<-fpkm(genes(cuff))
#ggplot(geneFpkm, aes(fpkm)) + geom_histogram() + scale_x_log10()
write.delim(geneFpkm, file="geneFpkm.txt")
# gene.fpkm <- read.delim("gene.fpkm.txt")
write.delim(repGeneFPKMs, file="repGeneFPKMs.txt")
# repGeneFPKMs <- read.delim("repGeneFPKMs.txt")
# repGeneFPKMs <- local(get(load("repGeneFPKMs.RData")))
#' [FPKMs by Replicate](repGeneFPKMs.txt)
geneCounts<-count(genes(cuff))
#max(geneCounts$count)
write.delim(geneCounts, file="geneCounts.txt")
geneCgene.matrix<-fpkmMatrix(genes(cuff))
#' [Raw Counts](geneCounts.txt)
repCounts <- countMatrix(genes(cuff))
write.delim(repCounts, file="repCounts.txt")
# repCounts <- read.delim("repCounts.txt")
#' [Raw Counts by Replicate](repCounts.txt)
## tt
#######################################################################################################################
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# create some global summary statistic plots
dispersionPlot(genes(cuff)) #+ aes(alpha=0.01)
csDensity(genes(cuff))
#ggsave2(csBoxplot(genes(cuff))+ ggtitle("fpkm boxplots"))
csDensity(genes(cuff),features=T)
csBoxplot(genes(cuff),replicates=T)
fpkmSCVPlot(genes(cuff))
#csScatter(genes(cuff), "a", "d", smooth=F) + aes(color=abs(log2(a/b))>2) + scale_colour_manual(values=c("darkgreen", "red"))
#MAplot(genes(cuff), "a", "d")+ aes(color=abs(log2(M))>2)+ ggtitle("MA-plot: a/d")
#MAplot(genes(cuff), "a", "b", useCount=T) #+ aes(color=abs(log2(M))>2)+ ggtitle("MA-plot: Luc/KD")
#csVolcano(genes(cuff),"aRGm", "N") + ggtitle("odd line-shaped fan-out in volcano plot")
#csVolcano(genes(cuff),"a", "b")
csVolcanoMatrix(genes(cuff))
#csVolcano(genes(cuff),"a", "b")
#######################################################################################################################
#' ## Replicate Correlation and Clustering
require.auto(edgeR)
plotMDS(repFpkmMatrix(genes(cuff)), top=500, main="top500 MDS")
if(unlen(replicates(cuff)$sample)>2){ ## >2 only, because MDS will fail with just 2 samples
MDSplot(genes(cuff)) + ggtitle("mds plot")
}
MDSplot(genes(cuff), replicates=T) + ggtitle("mds plot")
PCAplot(genes(cuff),"PC1","PC2",replicates=T)
noTextLayer <- function(gg){ gg$layers[[2]] <- NULL;; gg}
noTextLayer(csDistHeat(genes(cuff)) + ggtitle("mouse zone correlation"))
noTextLayer(csDistHeat(genes(cuff),replicates=T) + ggtitle("mouse replicate correlation")) #+ scale_fill_gradientn(colours=c(low='lightyellow',mid='orange',high='darkred'), limits=c(0,0.15))
csDendro(genes(cuff),replicates=T)
#pdf(paste0(basename(getwd()),"_rep_clustering.pdf"))
#csDendro(genes(cuff),replicates=T)
#dev.off()
#######################################################################################################################
#' ## Extract lists of differentially expressed genes
#compPairsUniDir <- data.frame(sample_1="big_cyst", sample_2="small_cyst")
if(!is.null(constrasts_file)){
# if(file.exists(constrasts_file))
echo("using contrasts matrix from: ", basename(constrasts_file))
compPairsUniDir <- read.delim(opts$sample_pairs) %>% set_names("sample_1", "sample_2")
## add other direction for completeness
compPairs <- rbind_list(compPairsUniDir, compPairsUniDir %>% select(sample_1=sample_2, sample_2=sample_1))
}else{
echo("comparing all samples against each other")
compPairs <- diffData(genes(cuff)) %>% select(sample_1, sample_2) %>% distinct()
## just keep pairs of interest
merge(compPairs) %>%
## discard all genes that are not expressed in any of the cell types (>1)
filter(gene_id %in% getExpressedGenes(cuff))
#' Used cutoff criterion was: p_value<0.01
allDiff <- transform(allDiff, isHit=p_value<=0.01)
##' Used cutoff criterion was: q_value<0.01
#allDiff <- transform(allDiff, isHit=q_value<0.01)
degs <- subset(allDiff, isHit)
#degs <- subset(allDiff, significant=="yes")
degs <- transform(degs, sample_1_overex=log2_fold_change<0)
with(degs, as.data.frame(table(sample_1, sample_2, sample_1_overex))) %>% filter(Freq >0)
degs <- merge(degs, geneInfo, by.x="gene_id", by.="ensembl_gene_id", all.x=T)
write.delim(degs, file="degs.txt")
# degs <- read.delim("degs.txt")
#' [dge table](degs.txt)
ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DGE count summary") + coord_flip()
ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=sample_1_overex)) + geom_bar(position="dodge") + xlab(NULL) + ylab("# DGEs") + ggtitle("DGE counts by direction") + coord_flip()
## just needed to restore environment easily
save(degs, file=".degs.RData")
# degs <- local(get(load(".degs.RData")))
########################################################################################################################
ontologies <- c(
"GOTERM_CC_FAT",
"GOTERM_MF_FAT",
"GOTERM_BP_FAT",
"PANTHER_PATHWAY",
"REACTOME_PATHWAY",
"KEGG_PATHWAY",
"GOTERM_CC_FAT",
"GOTERM_MF_FAT",
"GOTERM_BP_FAT"
)
#' This analysis was performed using [David](http://david.abcc.ncifcrf.gov/). The following ontologies were tested: `r paste(ontologies, collapse=',')`
require(RDAVIDWebService) ## just works if installed on non-network-drive (e.g. /tmp/)
geneLists <- degs %>%
transmute(gene_id, list_id=paste(sample_1, "vs", sample_2, "ovex", sample_1_overex, sep="_"))
## http://www.bioconductor.org/packages/release/bioc/vignettes/RDAVIDWebService/inst/doc/RDavidWS-vignette.pdf
## e.g. getClusterReport --> plot2D
davidAnnotationChart <- function(someGenes){ ## expexted to have a column with gene_id
# echo("processing list with", length(someGenes), "genes")
# someGenes <- degs$gene_id
if(length(someGenes)>1500){
someGenes <- sample(someGenes) %>% head(1500)
}
david<-DAVIDWebService$new(email="brandl@mpi-cbg.de")
# getTimeOut(david)
setTimeOut(david, 80000) ## http://www.bioconductor.org/packages/release/bioc/vignettes/RDAVIDWebService/inst/doc/RDavidWS-vignette.pdf
result<-addList(david, someGenes, idType="ENSEMBL_GENE_ID", listName=paste0("list_", sample(10000)[1]), listType="Gene")
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
annoChart <-getFunctionalAnnotationChart(david)
# clusterReport <-getClusterReport(david)
return(annoChart %>% subset(select=-Genes))
}
enrResults <- degs %>% group_by(sample_1, sample_2, sample_1_overex) %>% do(davidAnnotationChart(.$gene_id))
write.delim(enrResults, file="enrResults.txt")
# enrResults <- read.delim("enrResults.txt")
#' [Enrichment Results](enrResults.txt)
sigEnrResults <- subset(enrResults, Bonferroni <0.01)
write.delim(sigEnrResults, file="sigEnrResults.txt")
# sigEnrResults <- read.delim("sigEnrResults.txt")
#' [Very Significant Terms](sigEnrResults.txt)
## plot the enrichment results
#sigEnrResults %>% group_by(Category, add=T) %>% do({
# logPlot <- . %>% ggplot(aes(Term, PValue)) +
# geom_bar(stat="identity")+coord_flip() +
# xlab("Enriched Terms") +
# ggtitle(.$Category[1]) +
# scale_y_log10()
# print(logPlot)
#})
sigEnrResults %>% do({
enrResultsGrp <- .
label <- apply(enrResultsGrp[1,1:3], 1, paste, collapse="_")
echo("processing", label)
logPlot <- enrResultsGrp %>% ggplot(aes(reorder(Term, as.integer(Category)), PValue, fill=Category)) +
geom_bar(stat="identity")+
coord_flip() +
xlab("Enriched Terms") +
ggtitle(label) +
scale_y_log10()
print(logPlot)
})
#+ include=FALSE
#ggsave2(ggplot(sigEnrResults, aes(set)) + geom_bar() + ggtitle("enriched term frequencies")) # + facet_wrap(~site_type))
#sigEnrResults <- arrange(sigEnrResults, site_type, set, -Bonferroni)
#sigEnrResults$Term <-lockFactorOrder(sigEnrResults$Term)
#ggplot(sigEnrResults, aes(Term, -log10(Bonferroni))) +coord_flip() + geom_bar() + facet_wrap(~site_type + set)
#ggplot(sigEnrResults, aes(reorder(set, -Bonferroni), -log10(Bonferroni), label=Term)) + geom_text(alpha=0.6, size=3) + ggtitle("enriched terms by set") + scale_y_log10()
#ggsave2(width=14, height=12)
#write.table(sigEnrResults, file=concat("sigEnrTerms.txt"), row.names=FALSE, sep="\t")
#+ include=FALSE
# ########################################################################################################################
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
#
#require(pathview)
#require(png)
#
#
#with(sigEnrResults, as.data.frame(table(Category)))
#
#keggPathways <- sigEnrResults %>%
# filter(Category=="KEGG_PATHWAY") %>%
# transform(kegg = colsplit(Term, "\\:", names = c('pathway_id', 'description')))
#
#if(nrow(keggPathways)==0){
# echo("no enriched kegg pathways were found in the dataset")
#}
#
#
#keggPathways %>% rowwise() %>% do({
# curKP <- .
# curKP <- keggPathways[1,]
# geneData <- merge(curKP, degs) %>% transmute(gene_id, log2_fold_change) %>% column2rownames("gene_id")
# ## prepare gene data
#
# pv.out <- pathview(gene.data = geneData, pathway.id = curKP$kegg.pathway_id, species = "mmu", out.suffix = curKP$kegg.description, node.sum = "mean", limit = list(gene = 3, cpd = 3), gene.idtype="ensembl")
# outfile <- paste(pathway_ids[i], ".", pathway_names[i], ".png", sep = "")
# ima <- readPNG(outfile)
#
# plot.new()
# lim <- par()
# rasterImage(ima, lim$usr[1], lim$usr[3], lim$usr[2], lim$usr[4])
#
#})
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
#+ include=FALSE
# ########################################################################################################################
# #' ## Protein-Protein Interaction Clusters using String
#
# ## see http://www.bioconductor.org/packages/release/bioc/vignettes/STRINGdb/inst/doc/STRINGdb.pdf
#
# #' We can use the string-db web service to look for protein-protein interactions.
# #' This normally results in a large hairball and many unconnected nodes and so we can look for enriched clusters.
# #' Here we plot using (high confidence interactions) enriched clusters with a node size greater than 5 proteins for our list of candidate genes showing up in at least two replicates
#
# #+ PPI plots , fig.width=30, fig.height=30, fig.retina=2, warning=FALSE, tidy=TRUE
#
# #taxTable <- list(ENSG=9606, ENSMUSG=10090)
# #taxId <- taxTable[[degs$gene_id[1] %>% ac() %>% str_extract("^([A-z]+)")]]
# #
# #require.auto(STRINGdb)
# #
# #string_db <- STRINGdb$new(score_threshold=400, species=taxId, input_directory="/local2/user_data/brandl/data/stringr") ## score threshold default is 400
# #example1_mapped <- string_db$map( summary_allgids_df_sighit_gt1, "supplier_gene_symbol", species= removeUnmappedRows = TRUE )
# #clustersList <- string_db$get_clusters(example1_mapped$STRING_id)
# #for(network in clustersList)
# #{
# # if(length(network)>5)
# # {
# # string_db$plot_network(network, add_link=FALSE)
# # }
# #}
#
#' Created `r format(Sys.Date(), format="%B-%d-%Y")` by `r readLines(pipe('whoami'))`