dge_analysis.R

#!/usr/bin/env Rscript
#+ setup, cache=FALSE

suppressMessages(require(docopt))

doc <- '
Convert a cuffdiff into a dge-report
Usage: dge_analysis.R [options] <cuffdb_directory>

Options:
--constrasts=<tab_delim_table> Table with sample pairs for which dge analysis should be performed
--undirected           Perform directed dge-steps in a directed manner
--qcutoff <qcutoff>    Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01]
--pcutoff <pcutoff>    Override q-value filter and filter by p-value instead
-S                     Dont do general statistics and qc analysis
'

#opts <- docopt(doc, commandArgs(TRUE)) ## does not work when spining
# Stefania: opts <- docopt(doc, "--undirected --pcutoff 0.05  ..")
# opts <- docopt(doc, "--undirected  ..")
# opts <- docopt(doc, paste(Sys.getenv("DGE_PARAMS"), ".."))

opts <- docopt(doc, paste(Sys.getenv("DGE_PARAMS"), paste(commandArgs(TRUE), collapse=" ")))
#opts

skipQC <<- opts$S
split_hit_list <- !opts$undirected
constrasts_file <- opts$constrasts
pcutoff <- if(is.null(opts$pcutoff)) NULL else as.numeric(opts$pcutoff)
qcutoff <- if(is.numeric(pcutoff)) NULL else as.numeric(opts$qcutoff)

stopifnot(is.numeric(qcutoff) | is.numeric(pcutoff))

cuffdb_directory <- normalizePath(opts$cuffdb_directory)

if(is.na(file.info(cuffdb_directory)$isdir)){
    stop(paste("db directory does not exist", doc))
}


devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R")
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R")

require(cummeRbund)
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/bio/cummerutils.R")

#knitr::opts_knit$set(root.dir = getwd())

########################################################################################################################
#' # Differential Gene Expression Analysis

## todo Small overview about pipeline

#' [CuffDiff](http://cufflinks.cbcb.umd.edu/manual.html) and [cummeRbund](http://compbio.mit.edu/cummeRbund/) were used to perform this analysis. For details about how the test for differntial expression works, refer to [Differential analysis of gene regulation at transcript resolution with RNA-seq](http://www.nature.com/nbt/journal/v31/n1/full/nbt.2450.html).

#print(paste("db dir is", cuffdb_directory))

#' Used cuffdiff database in `r cuffdb_directory`

#+ load_db, cache=FALSE

## workaround until sqlite  is fixed on lustre
tmpdir <- tempfile("cuffdb_dir")
system(paste("cp -r", cuffdb_directory, tmpdir))

cuff <- readCufflinks(dir=tmpdir)

cuff
runInfo(cuff)
replicateInfo <- replicates(cuff) %>% mutate(file=basename(file)) %>% select(-c(total_mass, norm_mass, internal_scale, external_scale))
replicateInfo

write.delim(replicateInfo, file="replicateInfo.txt")
# replicateInfo <- read.delim("replicateInfo.txt")
#'  [replicateInfo](replicateInfo.txt)

#+ eval=skipQC
#hello hidden field

#######################################################################################################################
#' ## Count and Expression Score Tables

## todo merge in basic gene info into all tables

## add gene description
martName <- guess_mart(fpkm(genes(cuff))$gene_id)
cacheFile <- paste0("geneInfo.",martName, ".RData")
if(!file.exists(cacheFile)){
    require(biomaRt)

    mousemart = useDataset(martName, mart=useMart("ensembl"))
    geneInfo <- getBM(attributes=c('ensembl_gene_id', 'external_gene_name', 'description', 'gene_biotype'), mart=mousemart);
    save(geneInfo, file=cacheFile)
    unloadNameSpace(biomaRt)
}else{
    geneInfo <- local(get(load(cacheFile)))
}


#colnames(gFpkmMatrix) <- ifelse(is.na(sampleDic[colnames(gFpkmMatrix)]), colnames(gFpkmMatrix), sampleDic[colnames(gFpkmMatrix)])
fpkmMatrix(genes(cuff)) %>%
    rownames2column("ensembl_gene_id") %>%
    merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
    print_head() %>%
    write.delim(file="gene_fpkms.txt")
# gene_fpkms <- read.delim("gene_fpkms.txt")
#' [Annotated Expresssion Table](gene_fpkms.txt)

## export the same but now including replicate information

repFpkmMatrix(genes(cuff)) %>%
    rownames2column("ensembl_gene_id") %>%
    merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
    print_head() %>%
    write.delim(file="gene_fpkms_by_replicate.txt")
# gene_fpkms <- read.delim("gene_fpkms_by_replicate.txt")
#' [Annotated Expresssion Table](gene_fpkms_by_replicate.txt)

fpkm(genes(cuff)) %>%
    dplyr::rename(ensembl_gene_id=gene_id) %>%
    merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
    write.delim(file="geneFpkm.txt")


## also export count tables

count(genes(cuff)) %>%
    dplyr::rename(ensembl_gene_id=gene_id) %>%
    merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
    write.delim(file="gene_counts.txt")
# gene_counts <- read.delim("gene_counts.txt")
#'  [Gene Fragment Counts](gene_counts.txt)

countMatrix(genes(cuff)) %>%
    rownames2column("ensembl_gene_id") %>%
    merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
    write.delim(file="replicateCounts.txt")
# repCounts <- read.delim("replicateCounts.txt")
#'  [Raw Counts by Replicate](replicateCounts.txt)

##ggplot(geneFpkm, aes(fpkm)) + geom_histogram() + scale_x_log10()


#######################################################################################################################
#' ## Score Distributions


#if(!as.logical(opts$S)){

# create some global summary statistic plots
dispersionPlot(genes(cuff)) #+ aes(alpha=0.01)

csDensity(genes(cuff))
#ggsave2(csBoxplot(genes(cuff))+ ggtitle("fpkm boxplots"))

csDensity(genes(cuff),features=T)
csBoxplot(genes(cuff),replicates=T)

fpkmSCVPlot(genes(cuff))


#csScatter(genes(cuff), "a", "d", smooth=F) + aes(color=abs(log2(a/b))>2) + scale_colour_manual(values=c("darkgreen", "red"))

#MAplot(genes(cuff), "a", "d")+ aes(color=abs(log2(M))>2)+ ggtitle("MA-plot: a/d")
#MAplot(genes(cuff), "a", "b", useCount=T) #+ aes(color=abs(log2(M))>2)+ ggtitle("MA-plot: Luc/KD")


#csVolcano(genes(cuff),"aRGm", "N") + ggtitle("odd line-shaped fan-out in volcano plot")


#csVolcano(genes(cuff),"a", "b")
## todo replace or complement with marta plot
csVolcanoMatrix(genes(cuff))
#csVolcano(genes(cuff),"a", "b")

#######################################################################################################################
#' ## Replicate Correlation and Clustering

require.auto(edgeR)
plotMDS(repFpkmMatrix(genes(cuff)), top=500, main="top500 MDS")


if(unlen(replicates(cuff)$sample)>2){  ## >2 only, because MDS will fail with just 2 samples
    MDSplot(genes(cuff)) + ggtitle("mds plot")
}

MDSplot(genes(cuff), replicates=T) + ggtitle("mds plot")

## todo replace with marta plot
#PCAplot(genes(cuff),"PC1","PC2",replicates=T)

noTextLayer <- function(gg){ gg$layers[[2]] <- NULL;; gg}


## todo use fix scaled here
noTextLayer(csDistHeat(genes(cuff)) +  ggtitle("Sample correlation"))

noTextLayer(csDistHeat(genes(cuff),replicates=T) +  ggtitle("Replicate Correlation")) #+ scale_fill_gradientn(colours=c(low='lightyellow',mid='orange',high='darkred'), limits=c(0,0.15))

csDendro(genes(cuff),replicates=T)

#pdf(paste0(basename(getwd()),"_rep_clustering.pdf"))
#csDendro(genes(cuff),replicates=T)
#dev.off()


#######################################################################################################################
#' ## Extract lists of differentially expressed genes


#compPairsUniDir <- data.frame(sample_1="big_cyst", sample_2="small_cyst")

if(!is.null(constrasts_file)){
#    if(file.exists(constrasts_file))
    echo("using contrasts matrix from: ", basename(constrasts_file))

    compPairsUniDir <- read.csv(constrasts_file, header=F) %>% set_names(c("sample_1", "sample_2"))

    ## add other direction for completeness
    compPairs <- rbind_list(compPairsUniDir, compPairsUniDir %>% select(sample_1=sample_2, sample_2=sample_1))
}else{
    echo("comparing all samples against each other")
    compPairs <- diffData(genes(cuff)) %>% select(sample_1, sample_2) %>% distinct()
}


allDiff <- diffData(genes(cuff)) %>%
    ## just keep pairs of interest
    merge(compPairs) %>%

    ##  discard all genes that are not expressed in any of the cell types (>1)
    filter(gene_id %in% getExpressedGenes(cuff)) %>%
    dplyr::rename(ensembl_gene_id=gene_id) %>%
    mutate(sample_1_overex=log2_fold_change<0)


## ' Used cutoff criterion was `r if(use_pcutoff) echo("p_value<0.01") else echo("q_value<=0.01")`

#+ results='asis'
if(!is.null(qcutoff)){
    echo("Using q-value cutoff of ", qcutoff)
    allDiff <- transform(allDiff, isHit=q_value<=qcutoff)
}else{
    echo("Using p-value cutoff of ", pcutoff)
    allDiff <- transform(allDiff, isHit=p_value<=pcutoff)
}


#+
# #' Used cutoff criterion was: q_value<0.01
# allDiff <- transform(allDiff, isHit=q_value<0.01)


degs <- subset(allDiff, isHit)
#degs <- subset(allDiff, significant=="yes")


with(degs, as.data.frame(table(sample_1, sample_2, sample_1_overex))) %>% filter(Freq >0)

## add gene info
degs <- merge(degs, geneInfo, by="ensembl_gene_id", all.x=T)


write.delim(degs, file="degs.txt")
# degs <- read.delim("degs.txt")
#' [dge table](degs.txt)

ggplot(degs, aes(paste(sample_1, "vs",  sample_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DGE count summary") + coord_flip()

ggplot(degs, aes(paste(sample_1, "vs",  sample_2), fill=sample_1_overex)) + geom_bar(position="dodge") + xlab(NULL) + ylab("# DGEs") + ggtitle("DGE counts by direction") + coord_flip()

## just needed to restore environment easily
save(degs, file=".degs.RData")
# degs <- local(get(load(".degs.RData")))


########################################################################################################################
#' ## Term enrichment
#+ echo=FALSE

#fpkmMatrix(genes(cuff)) %>% head()

ontologies <- c(
    "GOTERM_CC_FAT",
    "GOTERM_MF_FAT",
    "GOTERM_BP_FAT",

    "PANTHER_PATHWAY",
    "REACTOME_PATHWAY",
    "KEGG_PATHWAY",

    "GOTERM_CC_FAT",
    "GOTERM_MF_FAT",
    "GOTERM_BP_FAT"
)

#' This analysis was performed using [David](http://david.abcc.ncifcrf.gov/). The following ontologies were tested: `r paste(ontologies, collapse=', ')`

require(RDAVIDWebService) ## just works if installed on non-network-drive (e.g. /tmp/)

geneLists <- degs %>%
#    transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2, "ovex", sample_1_overex, sep="_"))
    transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2))


## http://www.bioconductor.org/packages/release/bioc/vignettes/RDAVIDWebService/inst/doc/RDavidWS-vignette.pdf
## e.g. getClusterReport --> plot2D


davidAnnotationChart <- function(someGenes){ ## expexted to have a column with gene_id
#    echo("processing list with", length(someGenes), "genes")
#    someGenes <- degs$ensembl_gene_id


    if(length(someGenes)>1500){
        someGenes <- sample(someGenes) %>% head(1500)
    }

    david<-DAVIDWebService$new(email="brandl@mpi-cbg.de")

#    getTimeOut(david)
    setTimeOut(david, 80000) ## http://www.bioconductor.org/packages/release/bioc/vignettes/RDAVIDWebService/inst/doc/RDavidWS-vignette.pdf

    result<-addList(david, someGenes, idType="ENSEMBL_GENE_ID", listName=paste0("list_", sample(10000)[1]), listType="Gene")

    david %>% setAnnotationCategories(ontologies)

    annoChart <-getFunctionalAnnotationChart(david)

#    clusterReport <-getClusterReport(david)

    return(annoChart %>% subset(select=-Genes))
}


grpdDegs <- if(split_hit_list){
    degs %>% group_by(sample_1, sample_2, sample_1_overex)
}else{
    degs %>% group_by(sample_1, sample_2)
}

enrResults <- grpdDegs %>% do(davidAnnotationChart(.$ensembl_gene_id))

write.delim(enrResults, file="enrResults.txt")
# enrResults <- read.delim("enrResults.txt")
#'  [Enrichment Results](enrResults.txt)

sigEnrResults <- subset(enrResults, Bonferroni <0.01)

write.delim(sigEnrResults, file="sigEnrResults.txt")
# sigEnrResults <- read.delim("sigEnrResults.txt")
#'  [Very Significant Terms](sigEnrResults.txt)


## plot the enrichment results
#sigEnrResults %>% group_by(Category, add=T) %>% do({
#    logPlot <- . %>% ggplot(aes(Term, PValue)) +
#	    geom_bar(stat="identity")+coord_flip() +
#	    xlab("Enriched Terms") +
#	    ggtitle(.$Category[1]) +
#	    scale_y_log10()
#	    print(logPlot)
#})

sigEnrResults %>% do({
    enrResultsGrp <- .
    label <- apply(enrResultsGrp[1,1:3], 1, paste, collapse="_")
    echo("processing", label)

    logPlot <- enrResultsGrp %>% ggplot(aes(reorder(Term, as.integer(Category)), PValue, fill=Category)) +
	    geom_bar(stat="identity")+
	    coord_flip() +
	    xlab("Enriched Terms") +
	    ggtitle(label) +
	    scale_y_log10()

	    print(logPlot)
})

#+ include=FALSE
#ggsave2(ggplot(sigEnrResults, aes(set)) + geom_bar() + ggtitle("enriched term frequencies")) # + facet_wrap(~site_type))

#sigEnrResults <- arrange(sigEnrResults, site_type, set, -Bonferroni)
#sigEnrResults$Term <-lockFactorOrder(sigEnrResults$Term)

#ggplot(sigEnrResults, aes(Term, -log10(Bonferroni))) +coord_flip() + geom_bar() + facet_wrap(~site_type + set)
#ggplot(sigEnrResults, aes(reorder(set, -Bonferroni), -log10(Bonferroni), label=Term)) + geom_text(alpha=0.6, size=3) + ggtitle("enriched terms by set") + scale_y_log10()
#ggsave2(width=14, height=12)

#write.table(sigEnrResults, file=concat("sigEnrTerms.txt"), row.names=FALSE,  sep="\t")


#+ include=FALSE
# ########################################################################################################################
# #' ## Enriched KEGG Pathways
#
#require(pathview)
#require(png)
#
#
#with(sigEnrResults, as.data.frame(table(Category)))
#
#keggPathways <- sigEnrResults %>%
#    filter(Category=="KEGG_PATHWAY") %>%
#    transform(kegg = colsplit(Term, "\\:", names = c('pathway_id', 'description')))
#
#if(nrow(keggPathways)==0){
#    echo("no enriched kegg pathways were found in the dataset")
#}
#
#
#keggPathways %>% rowwise() %>% do({
#    curKP <- .
#    curKP <- keggPathways[1,]
#    geneData <- merge(curKP, degs) %>% transmute(ensembl_gene_id, log2_fold_change) %>% column2rownames("ensembl_gene_id")
#    ## prepare gene data
#
#    pv.out <- pathview(gene.data = geneData, pathway.id = curKP$kegg.pathway_id, species = "mmu", out.suffix = curKP$kegg.description, node.sum = "mean", limit = list(gene = 3, cpd = 3), gene.idtype="ensembl")
#        outfile <- paste(pathway_ids[i], ".", pathway_names[i], ".png", sep = "")
#        ima <- readPNG(outfile)
#
#    	plot.new()
#    	lim <- par()
#    	rasterImage(ima, lim$usr[1], lim$usr[3], lim$usr[2], lim$usr[4])
#
#})


#+ include=FALSE
# ########################################################################################################################
# #' ## Protein-Protein Interaction Clusters using String
#
# ## see http://www.bioconductor.org/packages/release/bioc/vignettes/STRINGdb/inst/doc/STRINGdb.pdf
#
# #' We can use the string-db web service to look for protein-protein interactions.
# #' This normally results in a large hairball and many unconnected nodes and so we can look for enriched clusters.
# #' Here we plot using (high confidence interactions) enriched clusters with a node size greater than 5 proteins for our list of candidate genes showing up in at least two replicates
#
# #+ PPI plots , fig.width=30, fig.height=30, fig.retina=2, warning=FALSE, tidy=TRUE
#
# #taxTable <- list(ENSG=9606,  ENSMUSG=10090)
# #taxId <- taxTable[[degs$gene_id[1] %>% ac() %>% str_extract("^([A-z]+)")]]
# #
# #require.auto(STRINGdb)
# #
# #string_db <- STRINGdb$new(score_threshold=400, species=taxId, input_directory="/local2/user_data/brandl/data/stringr") ## score threshold default is 400
# #example1_mapped <- string_db$map( summary_allgids_df_sighit_gt1, "supplier_gene_symbol", species= removeUnmappedRows = TRUE )
# #clustersList <- string_db$get_clusters(example1_mapped$STRING_id)
# #for(network in clustersList)
# #{
# #	if(length(network)>5)
# #	{
# #		string_db$plot_network(network, add_link=FALSE)
# #	}
# #}
#

#' Created `r format(Sys.Date(), format="%B-%d-%Y")` by `r readLines(pipe('whoami'))`