Newer
Older
--constrasts=<tab_delim_table> Table with sample pairs for which dge analysis should be performed
--undirected Perform directed dge-steps in a directed manner
--qcutoff <qcutoff> Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01]
--pcutoff <pcutoff> Override q-value filter and filter by p-value instead
#opts <- docopt(doc, commandArgs(TRUE)) ## does not work when spining
# Stefania: opts <- docopt(doc, "--undirected --pcutoff 0.05 ..")
# opts <- docopt(doc, "--undirected ..")
# opts <- docopt(doc, paste(Sys.getenv("DGE_PARAMS"), ".."))
opts <- docopt(doc, paste(Sys.getenv("DGE_PARAMS"), paste(commandArgs(TRUE), collapse=" ")))
split_hit_list <- !opts$undirected
constrasts_file <- opts$constrasts
pcutoff <- if(is.null(opts$pcutoff)) NULL else as.numeric(opts$pcutoff)
qcutoff <- if(is.numeric(pcutoff)) NULL else as.numeric(opts$qcutoff)
stopifnot(is.numeric(qcutoff) | is.numeric(pcutoff))
cuffdb_directory <- normalizePath(opts$cuffdb_directory)
if(is.na(file.info(cuffdb_directory)$isdir)){
stop(paste("db directory does not exist", doc))
}
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R")
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R")
require(cummeRbund)
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/bio/diffex_commons.R")
########################################################################################################################
#' # Differential Gene Expression Analysis
#' [CuffDiff](http://cufflinks.cbcb.umd.edu/manual.html) and [cummeRbund](http://compbio.mit.edu/cummeRbund/) were used to perform this analysis. For details about how the test for differntial expression works, refer to [Differential analysis of gene regulation at transcript resolution with RNA-seq](http://www.nature.com/nbt/journal/v31/n1/full/nbt.2450.html).
#print(paste("db dir is", cuffdb_directory))
#+ load_db, cache=FALSE
## workaround until sqlite is fixed on lustre
tmpdir <- tempfile("cuffdb_dir")
system(paste("cp -r", cuffdb_directory, tmpdir))
cuff <- readCufflinks(dir=tmpdir)
replicateInfo <- replicates(cuff) %>% mutate(file=basename(file)) %>% select(-c(total_mass, norm_mass, internal_scale, external_scale))
write.delim(replicateInfo, file="replicateInfo.txt")
# replicateInfo <- read.delim("replicateInfo.txt")
#' [replicateInfo](replicateInfo.txt)
#######################################################################################################################
#' ## Count and Expression Score Tables
## todo merge in basic gene info into all tables
martName <- guess_mart(fpkm(genes(cuff))$gene_id)
cacheFile <- paste0("geneInfo.",martName, ".RData")
if(!file.exists(cacheFile)){
require(biomaRt)
mousemart = useDataset(martName, mart=useMart("ensembl"))
geneInfo <- getBM(attributes=c('ensembl_gene_id', 'external_gene_name', 'description', 'gene_biotype'), mart=mousemart);
save(geneInfo, file=cacheFile)
}else{
geneInfo <- local(get(load(cacheFile)))
}
#colnames(gFpkmMatrix) <- ifelse(is.na(sampleDic[colnames(gFpkmMatrix)]), colnames(gFpkmMatrix), sampleDic[colnames(gFpkmMatrix)])
fpkmMatrix(genes(cuff)) %>%
rownames2column("ensembl_gene_id") %>%
merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
print_head() %>%
write.delim(file="gene_fpkms.txt")
# gene_fpkms <- read.delim("gene_fpkms.txt")
#' [Annotated Expresssion Table](gene_fpkms.txt)
## export the same but now including replicate information
repFpkmMatrix(genes(cuff)) %>%
rownames2column("ensembl_gene_id") %>%
merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
print_head() %>%
write.delim(file="gene_fpkms_by_replicate.txt")
# gene_fpkms <- read.delim("gene_fpkms_by_replicate.txt")
#' [Annotated Expresssion Table](gene_fpkms_by_replicate.txt)
fpkm(genes(cuff)) %>%
dplyr::rename(ensembl_gene_id=gene_id) %>%
merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
write.delim(file="geneFpkm.txt")
count(genes(cuff)) %>%
dplyr::rename(ensembl_gene_id=gene_id) %>%
merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
write.delim(file="gene_counts.txt")
# gene_counts <- read.delim("gene_counts.txt")
#' [Gene Fragment Counts](gene_counts.txt)
rownames2column("ensembl_gene_id") %>%
merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
write.delim(file="replicateCounts.txt")
# repCounts <- read.delim("replicateCounts.txt")
#' [Raw Counts by Replicate](replicateCounts.txt)
##ggplot(geneFpkm, aes(fpkm)) + geom_histogram() + scale_x_log10()
#######################################################################################################################
# create some global summary statistic plots
dispersionPlot(genes(cuff)) #+ aes(alpha=0.01)
csDensity(genes(cuff))
#ggsave2(csBoxplot(genes(cuff))+ ggtitle("fpkm boxplots"))
csDensity(genes(cuff),features=T)
csBoxplot(genes(cuff),replicates=T)
fpkmSCVPlot(genes(cuff))
#csScatter(genes(cuff), "a", "d", smooth=F) + aes(color=abs(log2(a/b))>2) + scale_colour_manual(values=c("darkgreen", "red"))
#MAplot(genes(cuff), "a", "d")+ aes(color=abs(log2(M))>2)+ ggtitle("MA-plot: a/d")
#MAplot(genes(cuff), "a", "b", useCount=T) #+ aes(color=abs(log2(M))>2)+ ggtitle("MA-plot: Luc/KD")
#csVolcano(genes(cuff),"aRGm", "N") + ggtitle("odd line-shaped fan-out in volcano plot")
#csVolcano(genes(cuff),"a", "b")
## todo replace or complement with marta plot
csVolcanoMatrix(genes(cuff))
#csVolcano(genes(cuff),"a", "b")
#######################################################################################################################
#' ## Replicate Correlation and Clustering
require.auto(edgeR)
plotMDS(repFpkmMatrix(genes(cuff)), top=500, main="top500 MDS")
if(unlen(replicates(cuff)$sample)>2){ ## >2 only, because MDS will fail with just 2 samples
MDSplot(genes(cuff)) + ggtitle("mds plot")
}
MDSplot(genes(cuff), replicates=T) + ggtitle("mds plot")
## todo replace with marta plot
#PCAplot(genes(cuff),"PC1","PC2",replicates=T)
noTextLayer <- function(gg){ gg$layers[[2]] <- NULL;; gg}
noTextLayer(csDistHeat(genes(cuff)) + ggtitle("Sample correlation"))
noTextLayer(csDistHeat(genes(cuff),replicates=T) + ggtitle("Replicate Correlation")) #+ scale_fill_gradientn(colours=c(low='lightyellow',mid='orange',high='darkred'), limits=c(0,0.15))
csDendro(genes(cuff),replicates=T)
#pdf(paste0(basename(getwd()),"_rep_clustering.pdf"))
#csDendro(genes(cuff),replicates=T)
#dev.off()
#######################################################################################################################
#' ## Extract lists of differentially expressed genes
#compPairsUniDir <- data.frame(sample_1="big_cyst", sample_2="small_cyst")
if(!is.null(constrasts_file)){
# if(file.exists(constrasts_file))
echo("using contrasts matrix from: ", basename(constrasts_file))
compPairsUniDir <- read.csv(constrasts_file, header=F) %>% set_names(c("sample_1", "sample_2"))
## add other direction for completeness
compPairs <- rbind_list(compPairsUniDir, compPairsUniDir %>% select(sample_1=sample_2, sample_2=sample_1))
}else{
echo("comparing all samples against each other")
compPairs <- diffData(genes(cuff)) %>% select(sample_1, sample_2) %>% distinct()
## just keep pairs of interest
merge(compPairs) %>%
## discard all genes that are not expressed in any of the cell types (>1)
filter(gene_id %in% getExpressedGenes(cuff)) %>%
dplyr::rename(ensembl_gene_id=gene_id) %>%
mutate(sample_1_overex=log2_fold_change<0)
## save as reference
merge(allDiff, geneInfo, by="ensembl_gene_id", all.x=T) %>% write.delim(file="diffData.txt")
# diffData <- read.delim("diffData.txt")
#' [diffData](diffData.txt)
## ' Used cutoff criterion was `r if(use_pcutoff) echo("p_value<0.01") else echo("q_value<=0.01")`
#+ results='asis'
if(!is.null(qcutoff)){
echo("Using q-value cutoff of ", qcutoff)
allDiff <- transform(allDiff, isHit=q_value<=qcutoff)
}else{
echo("Using p-value cutoff of ", pcutoff)
allDiff <- transform(allDiff, isHit=p_value<=pcutoff)
}
#+
# #' Used cutoff criterion was: q_value<0.01
# allDiff <- transform(allDiff, isHit=q_value<0.01)
degs <- subset(allDiff, isHit)
#degs <- subset(allDiff, significant=="yes")
with(degs, as.data.frame(table(sample_1, sample_2, sample_1_overex))) %>% filter(Freq >0)
## add gene info
degs <- merge(degs, geneInfo, by="ensembl_gene_id", all.x=T)
write.delim(degs, file="degs.txt")
# degs <- read.delim("degs.txt")
#' [dge table](degs.txt)
ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DGE count summary") + coord_flip()
ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=sample_1_overex)) + geom_bar(position="dodge") + xlab(NULL) + ylab("# DGEs") + ggtitle("DGE counts by direction") + coord_flip()
## just needed to restore environment easily
save(degs, file=".degs.RData")
# degs <- local(get(load(".degs.RData")))
########################################################################################################################
#' This analysis was performed using [David](http://david.abcc.ncifcrf.gov/). The following ontologies were tested: `r paste(ontologies, collapse=', ')`
# transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2, "ovex", sample_1_overex, sep="_"))
transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2))
grpdDegs <- if(split_hit_list){
degs %>% group_by(sample_1, sample_2, sample_1_overex)
}else{
degs %>% group_by(sample_1, sample_2)
}
enrResults <- grpdDegs %>% do(davidAnnotationChart(.$ensembl_gene_id))
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
write.delim(enrResults, file="enrResults.txt")
# enrResults <- read.delim("enrResults.txt")
#' [Enrichment Results](enrResults.txt)
sigEnrResults <- subset(enrResults, Bonferroni <0.01)
write.delim(sigEnrResults, file="sigEnrResults.txt")
# sigEnrResults <- read.delim("sigEnrResults.txt")
#' [Very Significant Terms](sigEnrResults.txt)
## plot the enrichment results
#sigEnrResults %>% group_by(Category, add=T) %>% do({
# logPlot <- . %>% ggplot(aes(Term, PValue)) +
# geom_bar(stat="identity")+coord_flip() +
# xlab("Enriched Terms") +
# ggtitle(.$Category[1]) +
# scale_y_log10()
# print(logPlot)
#})
sigEnrResults %>% do({
enrResultsGrp <- .
label <- apply(enrResultsGrp[1,1:3], 1, paste, collapse="_")
echo("processing", label)
logPlot <- enrResultsGrp %>% ggplot(aes(reorder(Term, as.integer(Category)), PValue, fill=Category)) +
geom_bar(stat="identity")+
coord_flip() +
xlab("Enriched Terms") +
ggtitle(label) +
scale_y_log10()
print(logPlot)
})
#+ include=FALSE
#ggsave2(ggplot(sigEnrResults, aes(set)) + geom_bar() + ggtitle("enriched term frequencies")) # + facet_wrap(~site_type))
#sigEnrResults <- arrange(sigEnrResults, site_type, set, -Bonferroni)
#sigEnrResults$Term <-lockFactorOrder(sigEnrResults$Term)
#ggplot(sigEnrResults, aes(Term, -log10(Bonferroni))) +coord_flip() + geom_bar() + facet_wrap(~site_type + set)
#ggplot(sigEnrResults, aes(reorder(set, -Bonferroni), -log10(Bonferroni), label=Term)) + geom_text(alpha=0.6, size=3) + ggtitle("enriched terms by set") + scale_y_log10()
#ggsave2(width=14, height=12)
#write.table(sigEnrResults, file=concat("sigEnrTerms.txt"), row.names=FALSE, sep="\t")
#+ include=FALSE
# ########################################################################################################################
#
#require(pathview)
#require(png)
#
#
#with(sigEnrResults, as.data.frame(table(Category)))
#
#keggPathways <- sigEnrResults %>%
# filter(Category=="KEGG_PATHWAY") %>%
# transform(kegg = colsplit(Term, "\\:", names = c('pathway_id', 'description')))
#
#if(nrow(keggPathways)==0){
# echo("no enriched kegg pathways were found in the dataset")
#}
#
#
#keggPathways %>% rowwise() %>% do({
# curKP <- .
# curKP <- keggPathways[1,]
# geneData <- merge(curKP, degs) %>% transmute(ensembl_gene_id, log2_fold_change) %>% column2rownames("ensembl_gene_id")
# ## prepare gene data
#
# pv.out <- pathview(gene.data = geneData, pathway.id = curKP$kegg.pathway_id, species = "mmu", out.suffix = curKP$kegg.description, node.sum = "mean", limit = list(gene = 3, cpd = 3), gene.idtype="ensembl")
# outfile <- paste(pathway_ids[i], ".", pathway_names[i], ".png", sep = "")
# ima <- readPNG(outfile)
#
# plot.new()
# lim <- par()
# rasterImage(ima, lim$usr[1], lim$usr[3], lim$usr[2], lim$usr[4])
#
#})
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
#+ include=FALSE
# ########################################################################################################################
# #' ## Protein-Protein Interaction Clusters using String
#
# ## see http://www.bioconductor.org/packages/release/bioc/vignettes/STRINGdb/inst/doc/STRINGdb.pdf
#
# #' We can use the string-db web service to look for protein-protein interactions.
# #' This normally results in a large hairball and many unconnected nodes and so we can look for enriched clusters.
# #' Here we plot using (high confidence interactions) enriched clusters with a node size greater than 5 proteins for our list of candidate genes showing up in at least two replicates
#
# #+ PPI plots , fig.width=30, fig.height=30, fig.retina=2, warning=FALSE, tidy=TRUE
#
# #taxTable <- list(ENSG=9606, ENSMUSG=10090)
# #taxId <- taxTable[[degs$gene_id[1] %>% ac() %>% str_extract("^([A-z]+)")]]
# #
# #require.auto(STRINGdb)
# #
# #string_db <- STRINGdb$new(score_threshold=400, species=taxId, input_directory="/local2/user_data/brandl/data/stringr") ## score threshold default is 400
# #example1_mapped <- string_db$map( summary_allgids_df_sighit_gt1, "supplier_gene_symbol", species= removeUnmappedRows = TRUE )
# #clustersList <- string_db$get_clusters(example1_mapped$STRING_id)
# #for(network in clustersList)
# #{
# # if(length(network)>5)
# # {
# # string_db$plot_network(network, add_link=FALSE)
# # }
# #}
#
#' Created `r format(Sys.Date(), format="%B-%d-%Y")` by `r readLines(pipe('whoami'))`