Newer
Older
#if(!file.exists(r_script)){
# stop(paste("file does not exist\n", doc))
#}
########################################################################################################################
#' Differential gene Expression Analysis
#+ include=FALSE
########################################################################################################################
--compare <sample_pairs> Txt files with sample pairs for which dge analysis should be performed
print(paste("args are ", commandArgs(TRUE)))
opts <- docopt(doc, commandArgs(TRUE))
# opts <- docopt(doc, "..")
cuffdb_directory <- normalizePath(opts$cuffdb_directory)
if(is.na(file.info(cuffdb_directory)$isdir)){
stop(paste("db directory does not exist", doc))
}
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R")
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R")
require(cummeRbund)
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/bio/cummerutils.R")
#######################################################################################################################
## workaround until cummerbund is fixed
tmpdir <- tempfile("cuffdb_dir")
system(paste("cp -r", cuffdb_directory, tmpdir))
cuff <- readCufflinks(dir=tmpdir)
replicates(cuff) %>% mutate(file=basename(file)) %>% select(-c(total_mass, norm_mass, internal_scale, external_scale))
#######################################################################################################################
# create some global summary statistic plots
dispersionPlot(genes(cuff)) #+ aes(alpha=0.01)
csDensity(genes(cuff))
#ggsave2(csBoxplot(genes(cuff))+ ggtitle("fpkm boxplots"))
csDensity(genes(cuff),features=T)
csBoxplot(genes(cuff),replicates=T)
fpkmSCVPlot(genes(cuff))
#csScatter(genes(cuff), "a", "d", smooth=F) + aes(color=abs(log2(a/b))>2) + scale_colour_manual(values=c("darkgreen", "red"))
#MAplot(genes(cuff), "a", "d")+ aes(color=abs(log2(M))>2)+ ggtitle("MA-plot: a/d")
#MAplot(genes(cuff), "a", "b", useCount=T) #+ aes(color=abs(log2(M))>2)+ ggtitle("MA-plot: Luc/KD")
#csVolcano(genes(cuff),"aRGm", "N") + ggtitle("odd line-shaped fan-out in volcano plot")
#csVolcano(genes(cuff),"a", "b")
csVolcanoMatrix(genes(cuff))
#csVolcano(genes(cuff),"a", "b")
require.auto(edgeR)
plotMDS(repFpkmMatrix(genes(cuff)), top=500, main="top500 MDS")
if(unlen(replicates(cuff)$sample)>2){ ## >2 only, because MDS will fail with just 2 samples
MDSplot(genes(cuff)) + ggtitle("mds plot")
}
MDSplot(genes(cuff), replicates=T) + ggtitle("mds plot")
PCAplot(genes(cuff),"PC1","PC2",replicates=T)
noTextLayer <- function(gg){ gg$layers[[2]] <- NULL;; gg}
noTextLayer(csDistHeat(genes(cuff)) + ggtitle("mouse zone correlation"))
noTextLayer(csDistHeat(genes(cuff),replicates=T) + ggtitle("mouse replicate correlation")) #+ scale_fill_gradientn(colours=c(low='lightyellow',mid='orange',high='darkred'), limits=c(0,0.15))
csDendro(genes(cuff),replicates=T)
#pdf(paste0(basename(getwd()),"_rep_clustering.pdf"))
#csDendro(genes(cuff),replicates=T)
#dev.off()
#######################################################################################################################
#' ## Raw data tables
## merge in basic gene info
#xls(runInfo(cuff))
#setwd("/Volumes/project-zeigerer/Rab5_NGS/results/")
geneFpkm<-fpkm(genes(cuff))
#ggplot(geneFpkm, aes(fpkm)) + geom_histogram() + scale_x_log10()
write.delim(geneFpkm, file="geneFpkm.txt")
# gene.fpkm <- read.delim("gene.fpkm.txt")
repGeneFPKMs <- repFpkmMatrix(genes(cuff))
write.delim(repGeneFPKMs, file="repGeneFPKMs.txt")
# repGeneFPKMs <- read.delim("repGeneFPKMs.txt")
# repGeneFPKMs <- local(get(load("repGeneFPKMs.RData")))
#' [FPKMs by Replicate](repGeneFPKMs.txt)
geneCounts<-count(genes(cuff))
max(geneCounts$count)
write.delim(geneCounts, file="geneCounts.txt")
geneCgene.matrix<-fpkmMatrix(genes(cuff))
#' [Raw Counts](geneCounts.txt)
repCounts <- countMatrix(genes(cuff))
write.delim(repCounts, file="repCounts.txt")
# repCounts <- read.delim("repCounts.txt")
#' [Raw Counts by Replicate](repCounts.txt)
#######################################################################################################################
#' ## Extract lists of differentially expressed genes
#compPairsUniDir <- data.frame(sample_1="big_cyst", sample_2="small_cyst")
if(is.null(opts$sample_pairs)){
compPairs <- diffData(genes(cuff)) %>% select(sample_1, sample_2) %>% distinct()
}else{
compPairsUniDir <- read.delim(opts$sample_pairs) %>% set_names("sample_1", "sample_2")
## add other direction for completeness
compPairs <- rbind_list(compPairsUniDir, compPairsUniDir %>% select(sample_1=sample_2, sample_2=sample_1))
}
## just keep pairs of interest
merge(compPairs) %>%
## discard all genes that are not expressed in any of the cell types (>1)
filter(gene_id %in% getExpressedGenes(cuff))
#allDiff <- transform(allDiff, isHit=p_value<=0.01)
#' Used cutoff criterion was: q_value<0.01
allDiff <- transform(allDiff, isHit=q_value<0.01)
degs <- subset(allDiff, isHit)
#degs <- subset(allDiff, significant=="yes")
degs <- transform(degs, sample_1_overex=log2_fold_change<0)
with(degs, as.data.frame(table(sample_1, sample_2, sample_1_overex))) %>% filter(Freq >0)
ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DGE count summary") + coord_flip()
ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=sample_1_overex)) + geom_bar(position="dodge") + xlab(NULL) + ylab("# DGEs") + ggtitle("DGE counts by direction") + coord_flip()
## todo move up into table export section
## add gene description
martName <- guess_mart(degs$gene_id)
cacheFile <- paste0("geneInfo.",martName, ".RData")
if(!file.exists(cacheFile)){
mousemart = useDataset(martName, mart=useMart("ensembl"))
geneInfo <- getBM(attributes=c('ensembl_gene_id', 'external_gene_name', 'description', 'gene_biotype'), mart=mousemart);
}
gFpkmMatrix <- rownames2column(fpkmMatrix(genes(cuff)), "ensembl_gene_id")
#colnames(gFpkmMatrix) <- ifelse(is.na(sampleDic[colnames(gFpkmMatrix)]), colnames(gFpkmMatrix), sampleDic[colnames(gFpkmMatrix)])
geneFpkmWithInfo <- merge(geneInfo, gFpkmMatrix, by.x="ensembl_gene_id", all.y=T) %>% print_head()
write.delim(geneFpkmWithInfo, file="geneFpkmWithInfo.txt")
# geneFpkmWithInfo <- read.delim("geneFpkmWithInfo.txt")
#' [Annotated Expresssion Table](geneFpkmWithInfo.txt)
degs <- merge(degs, geneInfo, by.x="gene_id", by.="ensembl_gene_id", all.x=T)
#' differentially expressed genes
with(degs, as.data.frame(table(sample_1, sample_2, sample_1_overex)))
#subset(degs, !duplicated(paste(sample_1, sample_2, sample_1_overex, external_gene_name))) %>%
# with(as.data.frame(table(sample_1, sample_2, sample_1_overex))) %>%
# filter(Freq>0)
write.delim(degs, file="degs.txt")
# degs <- read.delim("degs.txt")
#' [dge table](degs.txt)
save(degs, file=".degs.RData")
# degs <- local(get(load(".degs.RData")))
########################################################################################################################
#` ## Term enrichment
#+ echo=FALSE
require(RDAVIDWebService) ## just works if installed on non-network-drive (e.g. /tmp/)
geneLists <- degs %>%
transmute(gene_id, list_id=paste(sample_1, "vs", sample_2, "ovex", sample_1_overex, sep="_"))
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
## http://www.bioconductor.org/packages/release/bioc/vignettes/RDAVIDWebService/inst/doc/RDavidWS-vignette.pdf
## e.g. getClusterReport --> plot2D
davidAnnotationChart <- function(someGenes){ ## expexted to have a column with gene_id
echo("processing list with", length(someGenes), "genes")
# someGenes <- degs$gene_id
if(length(someGenes)>1500){
someGenes <- sample(someGenes) %>% head(1500)
}
david<-DAVIDWebService$new(email="brandl@mpi-cbg.de")
# getTimeOut(david)
setTimeOut(david, 80000) ## http://www.bioconductor.org/packages/release/bioc/vignettes/RDAVIDWebService/inst/doc/RDavidWS-vignette.pdf
result<-addList(david, someGenes, idType="ENSEMBL_GENE_ID", listName=paste0("list_", sample(10000)[1]), listType="Gene")
david %>% setAnnotationCategories(c(
"GOTERM_CC_FAT",
"GOTERM_MF_FAT",
"GOTERM_BP_FAT",
"PANTHER_PATHWAY",
"REACTOME_PATHWAY",
"KEGG_PATHWAY",
"GOTERM_CC_FAT",
"GOTERM_MF_FAT",
"GOTERM_BP_FAT"
))
annoChart <-getFunctionalAnnotationChart(david)
# clusterReport <-getClusterReport(david)
return(annoChart %>% subset(select=-Genes))
}
enrResults <- degs %>% group_by(sample_1, sample_2, sample_1_overex) %>% do(davidAnnotationChart(.$gene_id))
write.delim(enrResults, file="enrResults.txt")
# enrResults <- read.delim("enrResults.txt")
#' [Enrichment Results](enrResults.txt)
sigEnrResults <- subset(enrResults, Bonferroni <0.01)
write.delim(sigEnrResults, file="sigEnrResults.txt")
# sigEnrResults <- read.delim("sigEnrResults.txt")
#' [Very Significant Terms](sigEnrResults.txt)
## plot the enrichment results
#sigEnrResults %>% group_by(Category, add=T) %>% do({
# logPlot <- . %>% ggplot(aes(Term, PValue)) +
# geom_bar(stat="identity")+coord_flip() +
# xlab("Enriched Terms") +
# ggtitle(.$Category[1]) +
# scale_y_log10()
# print(logPlot)
#})
sigEnrResults %>% do({
enrResultsGrp <- .
label <- apply(enrResultsGrp[1,1:3], 1, paste, collapse="_")
echo("processing", label)
logPlot <- enrResultsGrp %>% ggplot(aes(reorder(Term, as.integer(Category)), PValue, fill=Category)) +
geom_bar(stat="identity")+
coord_flip() +
xlab("Enriched Terms") +
ggtitle(label) +
scale_y_log10()
print(logPlot)
})
#+ include=FALSE
#ggsave2(ggplot(sigEnrResults, aes(set)) + geom_bar() + ggtitle("enriched term frequencies")) # + facet_wrap(~site_type))
#sigEnrResults <- arrange(sigEnrResults, site_type, set, -Bonferroni)
#sigEnrResults$Term <-lockFactorOrder(sigEnrResults$Term)
#ggplot(sigEnrResults, aes(Term, -log10(Bonferroni))) +coord_flip() + geom_bar() + facet_wrap(~site_type + set)
#ggplot(sigEnrResults, aes(reorder(set, -Bonferroni), -log10(Bonferroni), label=Term)) + geom_text(alpha=0.6, size=3) + ggtitle("enriched terms by set") + scale_y_log10()
#ggsave2(width=14, height=12)
#write.table(sigEnrResults, file=concat("sigEnrTerms.txt"), row.names=FALSE, sep="\t")
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
#########################################################################################################################
##' ## Enriched KEGG Pathways
##+ echo=FALSE, warning=FALSE
#
#require(pathview)
#require(png)
#
#
#with(sigEnrResults, as.data.frame(table(Category)))
#
#keggPathways <- sigEnrResults %>%
# filter(Category=="KEGG_PATHWAY") %>%
# transform(kegg = colsplit(Term, "\\:", names = c('pathway_id', 'description')))
#
#if(nrow(keggPathways)==0){
# echo("no enriched kegg pathways were found in the dataset")
#}
#
#
#keggPathways %>% rowwise() %>% do({
# curKP <- .
# curKP <- keggPathways[1,]
# geneData <- merge(curKP, degs) %>% transmute(gene_id, log2_fold_change) %>% column2rownames("gene_id")
# ## prepare gene data
#
# pv.out <- pathview(gene.data = geneData, pathway.id = curKP$kegg.pathway_id, species = "mmu", out.suffix = curKP$kegg.description, node.sum = "mean", limit = list(gene = 3, cpd = 3), gene.idtype="ensembl")
# outfile <- paste(pathway_ids[i], ".", pathway_names[i], ".png", sep = "")
# ima <- readPNG(outfile)
#
# plot.new()
# lim <- par()
# rasterImage(ima, lim$usr[1], lim$usr[3], lim$usr[2], lim$usr[4])
#
#})
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
#+ include=FALSE
# ########################################################################################################################
# #' ## Protein-Protein Interaction Clusters using String
#
# ## see http://www.bioconductor.org/packages/release/bioc/vignettes/STRINGdb/inst/doc/STRINGdb.pdf
#
# #' We can use the string-db web service to look for protein-protein interactions.
# #' This normally results in a large hairball and many unconnected nodes and so we can look for enriched clusters.
# #' Here we plot using (high confidence interactions) enriched clusters with a node size greater than 5 proteins for our list of candidate genes showing up in at least two replicates
#
# #+ PPI plots , fig.width=30, fig.height=30, fig.retina=2, warning=FALSE, tidy=TRUE
#
# #taxTable <- list(ENSG=9606, ENSMUSG=10090)
# #taxId <- taxTable[[degs$gene_id[1] %>% ac() %>% str_extract("^([A-z]+)")]]
# #
# #require.auto(STRINGdb)
# #
# #string_db <- STRINGdb$new(score_threshold=400, species=taxId, input_directory="/local2/user_data/brandl/data/stringr") ## score threshold default is 400
# #example1_mapped <- string_db$map( summary_allgids_df_sighit_gt1, "supplier_gene_symbol", species= removeUnmappedRows = TRUE )
# #clustersList <- string_db$get_clusters(example1_mapped$STRING_id)
# #for(network in clustersList)
# #{
# # if(length(network)>5)
# # {
# # string_db$plot_network(network, add_link=FALSE)
# # }
# #}
#