Newer
Older
--constrasts=<tab_delim_table> Table with sample pairs for which dge analysis should be performed
--undirected Perform directed dge-steps in a directed manner
--qcutoff <qcutoff> Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01]
--pcutoff <pcutoff> Override q-value filter and filter by p-value instead
#opts <- docopt(doc, commandArgs(TRUE)) ## does not work when spining
# Stefania: opts <- docopt(doc, "--undirected --pcutoff 0.05 ..")
# opts <- docopt(doc, "--undirected ..")
# opts <- docopt(doc, paste(Sys.getenv("DGE_PARAMS"), ".."))
opts <- docopt(doc, paste(Sys.getenv("DGE_PARAMS"), paste(commandArgs(TRUE), collapse=" ")))
split_hit_list <- !opts$undirected
constrasts_file <- opts$constrasts
pcutoff <- if(is.null(opts$pcutoff)) NULL else as.numeric(opts$pcutoff)
qcutoff <- if(is.numeric(pcutoff)) NULL else as.numeric(opts$qcutoff)
stopifnot(is.numeric(qcutoff) | is.numeric(pcutoff))
cuffdb_directory <- normalizePath(opts$cuffdb_directory)
if(is.na(file.info(cuffdb_directory)$isdir)){
stop(paste("db directory does not exist", doc))
}
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R")
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R")
require(cummeRbund)
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/bio/cummerutils.R")
########################################################################################################################
#' # Differential Gene Expression Analysis
#' [CuffDiff](http://cufflinks.cbcb.umd.edu/manual.html) and [cummeRbund](http://compbio.mit.edu/cummeRbund/) were used to perform this analysis. For details about how the test for differntial expression works, refer to [Differential analysis of gene regulation at transcript resolution with RNA-seq](http://www.nature.com/nbt/journal/v31/n1/full/nbt.2450.html).
#print(paste("db dir is", cuffdb_directory))
#+ load_db, cache=FALSE
## workaround until sqlite is fixed on lustre
tmpdir <- tempfile("cuffdb_dir")
system(paste("cp -r", cuffdb_directory, tmpdir))
cuff <- readCufflinks(dir=tmpdir)
replicateInfo <- replicates(cuff) %>% mutate(file=basename(file)) %>% select(-c(total_mass, norm_mass, internal_scale, external_scale))
write.delim(replicateInfo, file="replicateInfo.txt")
# replicateInfo <- read.delim("replicateInfo.txt")
#' [replicateInfo](replicateInfo.txt)
#######################################################################################################################
#' ## Count and Expression Score Tables
## todo merge in basic gene info into all tables
martName <- guess_mart(fpkm(genes(cuff))$gene_id)
cacheFile <- paste0("geneInfo.",martName, ".RData")
if(!file.exists(cacheFile)){
require(biomaRt)
mousemart = useDataset(martName, mart=useMart("ensembl"))
geneInfo <- getBM(attributes=c('ensembl_gene_id', 'external_gene_name', 'description', 'gene_biotype'), mart=mousemart);
save(geneInfo, file=cacheFile)
}else{
geneInfo <- local(get(load(cacheFile)))
}
#colnames(gFpkmMatrix) <- ifelse(is.na(sampleDic[colnames(gFpkmMatrix)]), colnames(gFpkmMatrix), sampleDic[colnames(gFpkmMatrix)])
fpkmMatrix(genes(cuff)) %>%
rownames2column("ensembl_gene_id") %>%
merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
print_head() %>%
write.delim(file="gene_fpkms.txt")
# gene_fpkms <- read.delim("gene_fpkms.txt")
#' [Annotated Expresssion Table](gene_fpkms.txt)
## export the same but now including replicate information
repFpkmMatrix(genes(cuff)) %>%
rownames2column("ensembl_gene_id") %>%
merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
print_head() %>%
write.delim(file="gene_fpkms_by_replicate.txt")
# gene_fpkms <- read.delim("gene_fpkms_by_replicate.txt")
#' [Annotated Expresssion Table](gene_fpkms_by_replicate.txt)
fpkm(genes(cuff)) %>%
dplyr::rename(ensembl_gene_id=gene_id) %>%
merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
write.delim(file="geneFpkm.txt")
count(genes(cuff)) %>%
dplyr::rename(ensembl_gene_id=gene_id) %>%
merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
write.delim(file="gene_counts.txt")
# gene_counts <- read.delim("gene_counts.txt")
#' [Gene Fragment Counts](gene_counts.txt)
rownames2column("ensembl_gene_id") %>%
merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
write.delim(file="replicateCounts.txt")
# repCounts <- read.delim("replicateCounts.txt")
#' [Raw Counts by Replicate](replicateCounts.txt)
##ggplot(geneFpkm, aes(fpkm)) + geom_histogram() + scale_x_log10()
#######################################################################################################################
# create some global summary statistic plots
dispersionPlot(genes(cuff)) #+ aes(alpha=0.01)
csDensity(genes(cuff))
#ggsave2(csBoxplot(genes(cuff))+ ggtitle("fpkm boxplots"))
csDensity(genes(cuff),features=T)
csBoxplot(genes(cuff),replicates=T)
fpkmSCVPlot(genes(cuff))
#csScatter(genes(cuff), "a", "d", smooth=F) + aes(color=abs(log2(a/b))>2) + scale_colour_manual(values=c("darkgreen", "red"))
#MAplot(genes(cuff), "a", "d")+ aes(color=abs(log2(M))>2)+ ggtitle("MA-plot: a/d")
#MAplot(genes(cuff), "a", "b", useCount=T) #+ aes(color=abs(log2(M))>2)+ ggtitle("MA-plot: Luc/KD")
#csVolcano(genes(cuff),"aRGm", "N") + ggtitle("odd line-shaped fan-out in volcano plot")
#csVolcano(genes(cuff),"a", "b")
## todo replace or complement with marta plot
csVolcanoMatrix(genes(cuff))
#csVolcano(genes(cuff),"a", "b")
#######################################################################################################################
#' ## Replicate Correlation and Clustering
require.auto(edgeR)
plotMDS(repFpkmMatrix(genes(cuff)), top=500, main="top500 MDS")
if(unlen(replicates(cuff)$sample)>2){ ## >2 only, because MDS will fail with just 2 samples
MDSplot(genes(cuff)) + ggtitle("mds plot")
}
MDSplot(genes(cuff), replicates=T) + ggtitle("mds plot")
## todo replace with marta plot
#PCAplot(genes(cuff),"PC1","PC2",replicates=T)
noTextLayer <- function(gg){ gg$layers[[2]] <- NULL;; gg}
noTextLayer(csDistHeat(genes(cuff)) + ggtitle("Sample correlation"))
noTextLayer(csDistHeat(genes(cuff),replicates=T) + ggtitle("Replicate Correlation")) #+ scale_fill_gradientn(colours=c(low='lightyellow',mid='orange',high='darkred'), limits=c(0,0.15))
csDendro(genes(cuff),replicates=T)
#pdf(paste0(basename(getwd()),"_rep_clustering.pdf"))
#csDendro(genes(cuff),replicates=T)
#dev.off()
#######################################################################################################################
#' ## Extract lists of differentially expressed genes
#compPairsUniDir <- data.frame(sample_1="big_cyst", sample_2="small_cyst")
if(!is.null(constrasts_file)){
# if(file.exists(constrasts_file))
echo("using contrasts matrix from: ", basename(constrasts_file))
compPairsUniDir <- read.csv(constrasts_file, header=F) %>% set_names(c("sample_1", "sample_2"))
## add other direction for completeness
compPairs <- rbind_list(compPairsUniDir, compPairsUniDir %>% select(sample_1=sample_2, sample_2=sample_1))
}else{
echo("comparing all samples against each other")
compPairs <- diffData(genes(cuff)) %>% select(sample_1, sample_2) %>% distinct()
## just keep pairs of interest
merge(compPairs) %>%
## discard all genes that are not expressed in any of the cell types (>1)
filter(gene_id %in% getExpressedGenes(cuff)) %>%
dplyr::rename(ensembl_gene_id=gene_id) %>%
mutate(sample_1_overex=log2_fold_change<0)
## ' Used cutoff criterion was `r if(use_pcutoff) echo("p_value<0.01") else echo("q_value<=0.01")`
#+ results='asis'
if(!is.null(qcutoff)){
echo("Using q-value cutoff of ", qcutoff)
allDiff <- transform(allDiff, isHit=q_value<=qcutoff)
}else{
echo("Using p-value cutoff of ", pcutoff)
allDiff <- transform(allDiff, isHit=p_value<=pcutoff)
}
#+
# #' Used cutoff criterion was: q_value<0.01
# allDiff <- transform(allDiff, isHit=q_value<0.01)
degs <- subset(allDiff, isHit)
#degs <- subset(allDiff, significant=="yes")
with(degs, as.data.frame(table(sample_1, sample_2, sample_1_overex))) %>% filter(Freq >0)
## add gene info
degs <- merge(degs, geneInfo, by="ensembl_gene_id", all.x=T)
write.delim(degs, file="degs.txt")
# degs <- read.delim("degs.txt")
#' [dge table](degs.txt)
ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DGE count summary") + coord_flip()
ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=sample_1_overex)) + geom_bar(position="dodge") + xlab(NULL) + ylab("# DGEs") + ggtitle("DGE counts by direction") + coord_flip()
## just needed to restore environment easily
save(degs, file=".degs.RData")
# degs <- local(get(load(".degs.RData")))
########################################################################################################################
ontologies <- c(
"GOTERM_CC_FAT",
"GOTERM_MF_FAT",
"GOTERM_BP_FAT",
"PANTHER_PATHWAY",
"REACTOME_PATHWAY",
"KEGG_PATHWAY",
"GOTERM_CC_FAT",
"GOTERM_MF_FAT",
"GOTERM_BP_FAT"
)
#' This analysis was performed using [David](http://david.abcc.ncifcrf.gov/). The following ontologies were tested: `r paste(ontologies, collapse=', ')`
require(RDAVIDWebService) ## just works if installed on non-network-drive (e.g. /tmp/)
# transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2, "ovex", sample_1_overex, sep="_"))
transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2))
## http://www.bioconductor.org/packages/release/bioc/vignettes/RDAVIDWebService/inst/doc/RDavidWS-vignette.pdf
## e.g. getClusterReport --> plot2D
davidAnnotationChart <- function(someGenes){ ## expexted to have a column with gene_id
# echo("processing list with", length(someGenes), "genes")
if(length(someGenes)>1500){
someGenes <- sample(someGenes) %>% head(1500)
}
david<-DAVIDWebService$new(email="brandl@mpi-cbg.de")
# getTimeOut(david)
setTimeOut(david, 80000) ## http://www.bioconductor.org/packages/release/bioc/vignettes/RDAVIDWebService/inst/doc/RDavidWS-vignette.pdf
result<-addList(david, someGenes, idType="ENSEMBL_GENE_ID", listName=paste0("list_", sample(10000)[1]), listType="Gene")
annoChart <-getFunctionalAnnotationChart(david)
# clusterReport <-getClusterReport(david)
return(annoChart %>% subset(select=-Genes))
}
grpdDegs <- if(split_hit_list){
degs %>% group_by(sample_1, sample_2, sample_1_overex)
}else{
degs %>% group_by(sample_1, sample_2)
}
enrResults <- grpdDegs %>% do(davidAnnotationChart(.$ensembl_gene_id))
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
write.delim(enrResults, file="enrResults.txt")
# enrResults <- read.delim("enrResults.txt")
#' [Enrichment Results](enrResults.txt)
sigEnrResults <- subset(enrResults, Bonferroni <0.01)
write.delim(sigEnrResults, file="sigEnrResults.txt")
# sigEnrResults <- read.delim("sigEnrResults.txt")
#' [Very Significant Terms](sigEnrResults.txt)
## plot the enrichment results
#sigEnrResults %>% group_by(Category, add=T) %>% do({
# logPlot <- . %>% ggplot(aes(Term, PValue)) +
# geom_bar(stat="identity")+coord_flip() +
# xlab("Enriched Terms") +
# ggtitle(.$Category[1]) +
# scale_y_log10()
# print(logPlot)
#})
sigEnrResults %>% do({
enrResultsGrp <- .
label <- apply(enrResultsGrp[1,1:3], 1, paste, collapse="_")
echo("processing", label)
logPlot <- enrResultsGrp %>% ggplot(aes(reorder(Term, as.integer(Category)), PValue, fill=Category)) +
geom_bar(stat="identity")+
coord_flip() +
xlab("Enriched Terms") +
ggtitle(label) +
scale_y_log10()
print(logPlot)
})
#+ include=FALSE
#ggsave2(ggplot(sigEnrResults, aes(set)) + geom_bar() + ggtitle("enriched term frequencies")) # + facet_wrap(~site_type))
#sigEnrResults <- arrange(sigEnrResults, site_type, set, -Bonferroni)
#sigEnrResults$Term <-lockFactorOrder(sigEnrResults$Term)
#ggplot(sigEnrResults, aes(Term, -log10(Bonferroni))) +coord_flip() + geom_bar() + facet_wrap(~site_type + set)
#ggplot(sigEnrResults, aes(reorder(set, -Bonferroni), -log10(Bonferroni), label=Term)) + geom_text(alpha=0.6, size=3) + ggtitle("enriched terms by set") + scale_y_log10()
#ggsave2(width=14, height=12)
#write.table(sigEnrResults, file=concat("sigEnrTerms.txt"), row.names=FALSE, sep="\t")
#+ include=FALSE
# ########################################################################################################################
#
#require(pathview)
#require(png)
#
#
#with(sigEnrResults, as.data.frame(table(Category)))
#
#keggPathways <- sigEnrResults %>%
# filter(Category=="KEGG_PATHWAY") %>%
# transform(kegg = colsplit(Term, "\\:", names = c('pathway_id', 'description')))
#
#if(nrow(keggPathways)==0){
# echo("no enriched kegg pathways were found in the dataset")
#}
#
#
#keggPathways %>% rowwise() %>% do({
# curKP <- .
# curKP <- keggPathways[1,]
# geneData <- merge(curKP, degs) %>% transmute(ensembl_gene_id, log2_fold_change) %>% column2rownames("ensembl_gene_id")
# ## prepare gene data
#
# pv.out <- pathview(gene.data = geneData, pathway.id = curKP$kegg.pathway_id, species = "mmu", out.suffix = curKP$kegg.description, node.sum = "mean", limit = list(gene = 3, cpd = 3), gene.idtype="ensembl")
# outfile <- paste(pathway_ids[i], ".", pathway_names[i], ".png", sep = "")
# ima <- readPNG(outfile)
#
# plot.new()
# lim <- par()
# rasterImage(ima, lim$usr[1], lim$usr[3], lim$usr[2], lim$usr[4])
#
#})
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
#+ include=FALSE
# ########################################################################################################################
# #' ## Protein-Protein Interaction Clusters using String
#
# ## see http://www.bioconductor.org/packages/release/bioc/vignettes/STRINGdb/inst/doc/STRINGdb.pdf
#
# #' We can use the string-db web service to look for protein-protein interactions.
# #' This normally results in a large hairball and many unconnected nodes and so we can look for enriched clusters.
# #' Here we plot using (high confidence interactions) enriched clusters with a node size greater than 5 proteins for our list of candidate genes showing up in at least two replicates
#
# #+ PPI plots , fig.width=30, fig.height=30, fig.retina=2, warning=FALSE, tidy=TRUE
#
# #taxTable <- list(ENSG=9606, ENSMUSG=10090)
# #taxId <- taxTable[[degs$gene_id[1] %>% ac() %>% str_extract("^([A-z]+)")]]
# #
# #require.auto(STRINGdb)
# #
# #string_db <- STRINGdb$new(score_threshold=400, species=taxId, input_directory="/local2/user_data/brandl/data/stringr") ## score threshold default is 400
# #example1_mapped <- string_db$map( summary_allgids_df_sighit_gt1, "supplier_gene_symbol", species= removeUnmappedRows = TRUE )
# #clustersList <- string_db$get_clusters(example1_mapped$STRING_id)
# #for(network in clustersList)
# #{
# # if(length(network)>5)
# # {
# # string_db$plot_network(network, add_link=FALSE)
# # }
# #}
#
#' Created `r format(Sys.Date(), format="%B-%d-%Y")` by `r readLines(pipe('whoami'))`