Skip to content
Snippets Groups Projects
Commit bfbb3ba6 authored by Holger Brandl's avatar Holger Brandl
Browse files

started improved dge report

parent 79277551
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env Rscript
#+ setup, cache=FALSE
suppressMessages(require(docopt))
doc <- '
Convert a cuffdiff into a dge-report
Usage: dge_analysis.R [options] <cuffdb_directory>
Options:
--constrasts=<tab_delim_table> Table with sample pairs for which dge analysis should be performed
--undirected Perform directed dge-steps in a directed manner
--qcutoff <qcutoff> Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01]
--pcutoff <pcutoff> Override q-value filter and filter by p-value instead
--minfpkm <minfpkm> Minimal expression in any of the sample to be included in the final result list [default: 1]
-S Dont do general statistics and qc analysis
'
opts <- docopt(doc, commandArgs(TRUE)) ## does not work when spining
# opts <- docopt(doc, "--undirected ..")
# opts <- docopt(doc, paste(Sys.getenv("DGE_PARAMS"), ".."))
## todo use commandArgs here!!
#opts <- docopt(doc, paste(Sys.getenv("DGE_PARAMS"), paste(commandArgs(TRUE), collapse=" ")))
#opts
skipQC <<- opts$S
split_hit_list <- !opts$undirected
constrasts_file <- opts$constrasts
pcutoff <- if(is.null(opts$pcutoff)) NULL else as.numeric(opts$pcutoff)
qcutoff <- if(is.numeric(pcutoff)) NULL else as.numeric(opts$qcutoff)
minFPKM <- as.numeric(opts$minfpkm)
stopifnot(is.numeric(qcutoff) | is.numeric(pcutoff))
cuffdb_directory <- normalizePath(opts$cuffdb_directory)
if(is.na(file.info(cuffdb_directory)$isdir)){
stop(paste("db directory does not exist", doc))
}
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.5/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.5/R/ggplot_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.5/R/bio/diffex_commons.R")
require(cummeRbund)
require(knitr)
#knitr::opts_knit$set(root.dir = getwd())
########################################################################################################################
#' # Differential Gene Expression Analysis
print("Configuration")
vec2df(unlist(opts)) %>% filter(!str_detect(name, "^[<-]")) %>% kable()
#' [CuffDiff](http://cufflinks.cbcb.umd.edu/manual.html) and [cummeRbund](http://compbio.mit.edu/cummeRbund/) were used to perform this analysis. For details about how the test for differntial expression works, refer to [Differential analysis of gene regulation at transcript resolution with RNA-seq](http://www.nature.com/nbt/journal/v31/n1/full/nbt.2450.html).
#print(paste("db dir is", cuffdb_directory))
#' Used cuffdiff database in `r cuffdb_directory`
#+ load_db, cache=FALSE
isCluster=Sys.getenv("LSF_SERVERDIR")!=""
dbTempCopyRequired=isCluster | str_detect(normalizePath(cuffdb_directory), fixed("/Volumes/projects"))
if(dbTempCopyRequired){
## workaround until sqlite is fixed on lustre
tmpdir <- tempfile("cuffdb_dir")
system(paste("cp -r", cuffdb_directory, tmpdir))
cuff <- readCufflinks(dir=tmpdir)
}else{
cuff <- readCufflinks(dir="..")
}
cuff
runInfo(cuff)
replicateInfo <- replicates(cuff) %>% mutate(file=basename(file)) %>% select(-c(total_mass, norm_mass, internal_scale, external_scale))
replicateInfo
write.delim(replicateInfo, file="replicate_info.txt")
# replicateInfo <- read.delim("replicate_info.txt")
#' [replicateInfo](replicate_info.txt)
## add gene description
martName <- guess_mart(fpkm(genes(cuff))$gene_id)
cacheFile <- paste0("geneInfo.",martName, ".RData")
if(!file.exists(cacheFile)){
require(biomaRt)
mousemart = useDataset(martName, mart=useMart("ensembl"))
geneInfo <- getBM(attributes=c('ensembl_gene_id', 'external_gene_name', 'description', 'gene_biotype'), mart=mousemart);
save(geneInfo, file=cacheFile)
unloadNamespace('biomaRt')
}else{
geneInfo <- local(get(load(cacheFile)))
}
#+ eval=skipQC
#hello hidden field
#######################################################################################################################
#' ## Count and Expression Score Tables
#colnames(gFpkmMatrix) <- ifelse(is.na(sampleDic[colnames(gFpkmMatrix)]), colnames(gFpkmMatrix), sampleDic[colnames(gFpkmMatrix)])
fpkmMatrix(genes(cuff)) %>%
rownames2column("ensembl_gene_id") %>%
merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
print_head() %>%
write.delim(file="gene_fpkms.txt")
# gene_fpkms <- read.delim("gene_fpkms.txt")
#' [Annotated Expresssion Table](gene_fpkms.txt)
## export the same but now including replicate information
repFpkmMatrix(genes(cuff)) %>%
rownames2column("ensembl_gene_id") %>%
merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
print_head() %>%
write.delim(file="gene_fpkms_by_replicate.txt")
# gene_fpkms_by_replicate <- read.delim("gene_fpkms_by_replicate.txt")
#' [Annotated Expresssion Table](gene_fpkms_by_replicate.txt)
## also export count tables
countMatrix(genes(cuff)) %>%
rownames2column("ensembl_gene_id") %>%
merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
write.delim(file="gene_counts.txt")
#' [Gene Counts](gene_counts.txt)
repCountMatrix(genes(cuff)) %>%
rownames2column("ensembl_gene_id") %>%
merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
write.delim(file="gene_counts_by_replicate.txt")
# repCounts <- read.delim("replicateCounts.txt")
#' [Gene Counts by Replicate](replicateCounts.txt)
##ggplot(geneFpkm, aes(fpkm)) + geom_histogram() + scale_x_log10()
#######################################################################################################################
#' ## Score Distributions
#if(!as.logical(opts$S)){
# create some global summary statistic plots
dispersionPlot(genes(cuff)) #+ aes(alpha=0.01)
csDensity(genes(cuff))
#ggsave2(csBoxplot(genes(cuff))+ ggtitle("fpkm boxplots"))
csDensity(genes(cuff),features=T)
csBoxplot(genes(cuff),replicates=T)
fpkmSCVPlot(genes(cuff))
#csScatter(genes(cuff), "a", "d", smooth=F) + aes(color=abs(log2(a/b))>2) + scale_colour_manual(values=c("darkgreen", "red"))
#MAplot(genes(cuff), "a", "d")+ aes(color=abs(log2(M))>2)+ ggtitle("MA-plot: a/d")
#MAplot(genes(cuff), "a", "b", useCount=T) #+ aes(color=abs(log2(M))>2)+ ggtitle("MA-plot: Luc/KD")
#csVolcano(genes(cuff),"aRGm", "N") + ggtitle("odd line-shaped fan-out in volcano plot")
## todo consider to add more pretty volcano plots (see stefania's extra plots
#csVolcano(genes(cuff),"a", "b")
## todo replace or complement with marta plot
csVolcanoMatrix(genes(cuff))
#csVolcano(genes(cuff),"a", "b")
#######################################################################################################################
#' ## Replicate Correlation and Clustering
require.auto(edgeR)
plotMDS(repFpkmMatrix(genes(cuff)), top=500, main="top500 MDS")
if(unlen(replicates(cuff)$sample)>2){ ## >2 only, because MDS will fail with just 2 samples
MDSplot(genes(cuff)) + ggtitle("mds plot")
}
MDSplot(genes(cuff), replicates=T) + ggtitle("mds plot")
## todo replace with marta plot
#PCAplot(genes(cuff),"PC1","PC2",replicates=T)
noTextLayer <- function(gg){ gg$layers[[2]] <- NULL;; gg}
## todo use fix scaled here
noTextLayer(csDistHeat(genes(cuff)) + ggtitle("Sample correlation"))
noTextLayer(csDistHeat(genes(cuff),replicates=T) + ggtitle("Replicate Correlation")) #+ scale_fill_gradientn(colours=c(low='lightyellow',mid='orange',high='darkred'), limits=c(0,0.15))
csDendro(genes(cuff),replicates=T)
#pdf(paste0(basename(getwd()),"_rep_clustering.pdf"))
#csDendro(genes(cuff),replicates=T)
#dev.off()
#' Also cluster expression filtered data
exprGenes <- repFpkmMatrix(genes(cuff)) %>%
rownames2column("ensembl_gene_id") %>%
melt(value.name="fpkm") %>%
group_by(ensembl_gene_id) %>%
## apply log trafo
mutate(log_fpkm=log10(fpkm+1)) %>%
## apply expression filter
filter(max(log_fpkm)>1) %>%
dcast( ensembl_gene_id ~ variable, value.var="log_fpkm") %>%
column2rownames("ensembl_gene_id")
res<-JSdist(makeprobs(exprGenes)) %>% hclust() %>% as.dendrogram()
par(mar=c(10,4,4,4))
plot(res, main="Replicate Clustering of Expressed Genes")
#######################################################################################################################
#' ## Extract lists of differentially expressed genes
if(!is.null(constrasts_file)){
echo("using contrasts matrix from: ", basename(constrasts_file))
compPairsUniDir <- read.csv(constrasts_file, header=F) %>% set_names(c("sample_1", "sample_2"))
## add other direction for completeness
compPairs <- rbind_list(compPairsUniDir, compPairsUniDir %>% select(sample_1=sample_2, sample_2=sample_1))
}else{
echo("comparing all samples against each other")
compPairs <- diffData(genes(cuff)) %>% select(sample_1, sample_2) %>% distinct()
}
allDiff <- diffData(genes(cuff)) %>%
## just keep pairs of interest
merge(compPairs) %>%
## discard all genes that are not expressed in any of the cell types (>1)
filter(gene_id %in% getExpressedGenes(cuff, minFPKM=minFPKM)) %>%
dplyr::rename(ensembl_gene_id=gene_id) %>%
mutate(sample_1_overex=log2_fold_change<0)
#getExpressedGenes(cuff, minFPKM=minFPKM) %>% writeLines(paste0("genes_expressed_gt_", minFPKM[1], "_in_at_least_one_sample.txt"))
# diffData <- read.delim("diffData.txt")
#' [diffData](diffData.txt)
## ' Used cutoff criterion was `r if(use_pcutoff) echo("p_value<0.01") else echo("q_value<=0.01")`
#+ results='asis'
if(!is.null(qcutoff)){
echo("Using q-value cutoff of", qcutoff)
allDiff <- transform(allDiff, is_hit=q_value<=qcutoff)
}else{
echo("Using p-value cutoff of", pcutoff)
allDiff <- transform(allDiff, is_hit=p_value<=pcutoff)
}
## save as reference
merge(allDiff, geneInfo, by="ensembl_gene_id", all.x=T) %>% write.delim(file="diffData.txt")
#+
# #' Used cutoff criterion was: q_value<0.01
# allDiff <- transform(allDiff, is_hit=q_value<0.01)
degs <- subset(allDiff, is_hit)
#degs <- subset(allDiff, significant=="yes")
## todo embedd as paginated DataTable
with(degs, as.data.frame(table(sample_1, sample_2, sample_1_overex))) %>% filter(Freq >0)
## add gene info
degs <- merge(degs, geneInfo, by="ensembl_gene_id", all.x=T)
write.delim(degs, file="degs.txt")
# degs <- read.delim("degs.txt")
#' [dge table](degs.txt)
ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DGE count summary") + coord_flip()
ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=sample_1_overex)) + geom_bar(position="dodge") + xlab(NULL) + ylab("# DGEs") + ggtitle("DGE counts by direction") + coord_flip()
## just needed to restore environment easily
save(degs, file=".degs.RData")
# degs <- local(get(load(".degs.RData")))
########################################################################################################################
#' ## Term enrichment
#+ echo=FALSE
#' This analysis was performed using [David](http://david.abcc.ncifcrf.gov/). The following ontologies were tested: `r paste(ontologies, collapse=', ')`
geneLists <- degs %>%
# transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2, "ovex", sample_1_overex, sep="_"))
transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2))
grpdDegs <- if(split_hit_list){
degs %>% group_by(sample_1, sample_2, sample_1_overex)
}else{
degs %>% group_by(sample_1, sample_2)
}
enrResults <- grpdDegs %>% do(davidAnnotationChart(.$ensembl_gene_id))
write.delim(enrResults, file="enrResults.txt")
# enrResults <- read.delim("enrResults.txt")
#' [Enrichment Results](enrResults.txt)
sigEnrResults <- subset(enrResults, Bonferroni <0.01)
write.delim(sigEnrResults, file="sigEnrResults.txt")
# sigEnrResults <- read.delim("sigEnrResults.txt")
#' [Very Significant Terms](sigEnrResults.txt)
## plot the enrichment results
#sigEnrResults %>% group_by(Category, add=T) %>% do({
# logPlot <- . %>% ggplot(aes(Term, PValue)) +
# geom_bar(stat="identity")+coord_flip() +
# xlab("Enriched Terms") +
# ggtitle(.$Category[1]) +
# scale_y_log10()
# print(logPlot)
#})
sigEnrResults %>% do({
enrResultsGrp <- .
## DEBUG enrResultsGrp <- sigEnrResults
label <-
dplyr::select(enrResultsGrp, matches(ifelse(split_hit_list, "sample_1|sample_2|sample_1_overex", "sample_1|sample_2"))) %>%
head(1) %>% apply(1, paste, collapse="_vs_")
echo("processing", label)
logPlot <- enrResultsGrp %>%
## fix factor order
mutate(Term=reorder(Term, -PValue) %>% reorder(as.integer(Category))) %>%
ggplot(aes(Term, PValue, fill=Category)) +
geom_bar(stat="identity")+
coord_flip() +
xlab("Enriched Terms") +
ggtitle(label) +
scale_y_log10()
ggsave(paste0(label, " enrichmed terms.pdf"))
print(logPlot)
})
#ggsave2()
#ggsave2(ggplot(sigEnrResults, aes(set)) + geom_bar() + ggtitle("enriched term frequencies")) # + facet_wrap(~site_type))
#sigEnrResults <- arrange(sigEnrResults, site_type, set, -Bonferroni)
#sigEnrResults$Term <-lockFactorOrder(sigEnrResults$Term)
#ggplot(sigEnrResults, aes(Term, -log10(Bonferroni))) +coord_flip() + geom_bar() + facet_wrap(~site_type + set)
#ggplot(sigEnrResults, aes(reorder(set, -Bonferroni), -log10(Bonferroni), label=Term)) + geom_text(alpha=0.6, size=3) + ggtitle("enriched terms by set") + scale_y_log10()
#ggsave2(width=14, height=12)
#write.table(sigEnrResults, file=paste0("sigEnrTerms.txt"), row.names=FALSE, sep="\t")
#with(sigEnrResults, as.data.frame(table(Category)))
keggPathways <- sigEnrResults %>%
filter(Category=="KEGG_PATHWAY") %>%
separate(Term, c('kegg_pathway_id', 'pathway_description'), sep="\\:", remove=F) %>%
with(kegg_pathway_id) %>% ac()
########################################################################################################################
#' ## Enriched KEGG Pathways
require(pathview)
require(png)
#if(nrow(keggPathways)==0){
# echo("no enriched kegg pathways were found in the dataset")
#}else{
keggOrCode <- "mmu"
## prepare p-value data
sliceData <- allDiff %>%
mutate(
## define slice labels (needed for casting)
comparison=paste0(sample_1, "__vs__", sample_2),
## create normalized linear score for slice plotting
sample_1_overex=log2_fold_change<0,
plot_score=-log10(q_value)*ifelse(sample_1_overex, 1, -1)
) %>%
dcast(ensembl_gene_id ~ comparison, value.var="plot_score") %>%
column2rownames("ensembl_gene_id")
#sliceData %>% head %>% kable()
data.frame(set=names(sliceData)) %>% mutate(slice_index=row_number()) %>% kable()
plot_pathway <- function(pathwayID, overlayData){
# pathwayID="mmu04015"
# browser()
echo("processing pathway", pathwayID)
pv.out <- pathview(
gene.data = overlayData,
pathway.id = pathwayID,
species = keggOrCode,
# out.suffix = pathwayID$kegg.description,
# out.suffix = pathwayID,
multi.state = ncol(overlayData)>1,
# kegg.native=F,
# node.sum = "mean", # the method name to calculate node summary given that multiple genes or compounds are mapped to it
limit = list(gene = c(-4,4)),
gene.idtype="ensembl"
)
outfile <- paste0(pathwayID, ".pathview", ifelse(ncol(overlayData)>1, ".multi", ""), ".png")
## interactive plotting
# ima <- readPNG(outfile)
# plot.new()
# lim <- par()
# rasterImage(ima, lim$usr[1], lim$usr[3], lim$usr[2], lim$usr[4])
pv.out$plotfile=outfile
pv.out$pathway_id=pathwayID
return(pv.out)
}
#keggPathways <- c("mmu04976", "mmu04972", "mmu04810", "mmu04520", "mmu04530", "mmu04270", "mmu04015")
#keggPathways <- c("mmu04015")
pathwayPlots <- keggPathways %>% llply(function(pathwayID){
plot_pathway(pathwayID, sliceData)
})
save(pathwayPlots, file=".pathwayPlots.RData")
# pathwayPlots <- local(get(load("pathwayPlots.RData")))
#+ results="asis"
## simple non-clickable plots
## http://stackoverflow.com/questions/12588323/r-how-to-extract-values-for-the-same-named-elements-across-multiple-objects-of
#unlist(lapply(pathwayPlots, "[[", "plotfile"))
# unlist(lapply(pathwayPlots, "[[", "plotfile")) %>% l_ply(function(pngFile){ cat(paste0("<img src='", pngFile, "'><br>"))})
#cat("
#<style type='text/css'>
# img {
# max-width: 100%;
# }
#</style>
#")
## todo add tooltips with additinal info
## http://stackoverflow.com/questions/5478940/how-to-create-a-html-tooltip-on-top-of-image-map
## extended version with clickable links
pathwayPlots %>% l_ply(function(plotData){
#pngFile="mmu04015.pathview.png"
# pathway_id=(basename(pngFile) %>% str_split_fixed("[.]", 2))[,1]
pngFile <- plotData$plotfile
pathway_id=plotData$pathway_id
keggNodes <- plotData$plot.data.gene %>%
## remove box offset
mutate(x=x-22, y=y-8) %>% ## todo does not work for non-gene elements
mutate(link=paste0("http://www.kegg.jp/dbget-bin/www_bget?", keggOrCode, ":", kegg.names ))
# "http://www.kegg.jp/dbget-bin/www_bget?mmu:16412
## first the image itself
# cat(paste0("<img usemap='#", pathway_id,"' src='", pngFile, "'><br>"))
cat(paste0("<div style='width: 2000px'><img style='float: left' usemap='#", pathway_id,"' src='", pngFile, "'></div><br>"))
cat(paste0("<map name='", pathway_id,"'>"))
#keggNodes %>% rowwise() %>% {curNode=.; cat(curNode$name)}
#see http://www.html-world.de/180/image-map/
keggNodes %>% a_ply(1, function(curNode){
rectDef=with(curNode, paste(x, y, x+width, y+height, sep=","))
paste0("<area href='", curNode$link,"' alt='Text' coords='",rectDef , "' shape='rect'>") %>% cat()
})
cat("</map>")
})
#}
#+ include=FALSE
# ########################################################################################################################
# #' ## Protein-Protein Interaction Clusters using String
#
# ## see http://www.bioconductor.org/packages/release/bioc/vignettes/STRINGdb/inst/doc/STRINGdb.pdf
#
# #' We can use the string-db web service to look for protein-protein interactions.
# #' This normally results in a large hairball and many unconnected nodes and so we can look for enriched clusters.
# #' Here we plot using (high confidence interactions) enriched clusters with a node size greater than 5 proteins for our list of candidate genes showing up in at least two replicates
#
# #+ PPI plots , fig.width=30, fig.height=30, fig.retina=2, warning=FALSE, tidy=TRUE
#
# #taxTable <- list(ENSG=9606, ENSMUSG=10090)
# #taxId <- taxTable[[degs$gene_id[1] %>% ac() %>% str_extract("^([A-z]+)")]]
# #
# #require.auto(STRINGdb)
# #
# #string_db <- STRINGdb$new(score_threshold=400, species=taxId, input_directory="/local2/user_data/brandl/data/stringr") ## score threshold default is 400
# #example1_mapped <- string_db$map( summary_allgids_df_sighit_gt1, "supplier_gene_symbol", species= removeUnmappedRows = TRUE )
# #clustersList <- string_db$get_clusters(example1_mapped$STRING_id)
# #for(network in clustersList)
# #{
# # if(length(network)>5)
# # {
# # string_db$plot_network(network, add_link=FALSE)
# # }
# #}
#
#' Created `r format(Sys.Date(), format="%B-%d-%Y")` by `r readLines(pipe('whoami'))`
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment