Skip to content
Snippets Groups Projects
Commit a34ab359 authored by Holger Brandl's avatar Holger Brandl
Browse files

cont. dge workflow

parent 6d816ada
No related branches found
No related tags found
No related merge requests found
......@@ -79,6 +79,6 @@ ggplot(algnSummary, aes(condition, mapped_reads)) +
#' ## Alignment Correlation
#' Without using any transcriptome as reference, the genome can be binned and alignment counts per bin can be used to perform a correlation analysis.
#' Used tool (deepTools)[https://github.com/fidelram/deepTools]
#' Used tool [deepTools](https://github.com/fidelram/deepTools)
### todo integrate bam correlation analysis
\ No newline at end of file
#!/usr/bin/env Rscript
#if(!file.exists(r_script)){
# stop(paste("file does not exist\n", doc))
#}
########################################################################################################################
#+ include=FALSE
##' # Differential Gene Expression Analysis
##' TBD: Small overview about pipeline
########################################################################################################################
#+ setup
#+ setup, cache=FALSE
suppressMessages(require(docopt))
doc <- '
Convert a cuffdiff into a dge-report
Usage: dge_analysis.R [-S] <cuffdb_directory>
Usage: dge_analysis.R [options] <cuffdb_directory>
Options:
--compare <sample_pairs> Txt files with sample pairs for which dge analysis should be performed
--constrasts=<tab_delim_table> Table with sample pairs for which dge analysis should be performed
--undirected Perform directed dge-steps in a directed manner
-S Dont do general statistics and qc analysis
--undirected Perform directed dge-steps in a directed manner'
'
#opts <- docopt(doc, commandArgs(TRUE)) ## does not work when spining
......@@ -31,9 +22,8 @@ opts <- docopt(doc, paste(Sys.getenv("DGE_PARAMS"), paste(commandArgs(TRUE), col
#opts
skipQC <<- opts$S
directedDgeSets <- !opts$undirected
#print("options were:")
#print(opts)
split_hit_list <- !opts$undirected
constrasts_file <- opts$constrasts
cuffdb_directory <- normalizePath(opts$cuffdb_directory)
......@@ -48,19 +38,22 @@ devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/
require(cummeRbund)
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/bio/cummerutils.R")
#knitr::opts_knit$set(root.dir = getwd())
#######################################################################################################################
#' # Cuffdiff DB
#+ load_db, cache=FALSE
########################################################################################################################
#' # Differential Gene Expression Analysis
print(paste("db dir is", cuffdb_directory))
## todo Small overview about pipeline
#' [CuffDiff](http://cufflinks.cbcb.umd.edu/manual.html) and [cummeRbund](http://compbio.mit.edu/cummeRbund/) were used to perform this analysis. For details about how the test for differntial expression works, refer to [Differential analysis of gene regulation at transcript resolution with RNA-seq](http://www.nature.com/nbt/journal/v31/n1/full/nbt.2450.html).
#print(paste("db dir is", cuffdb_directory))
#' Used cuffdiff database in `r cuffdb_directory`
## workaround until cummerbund is fixed
#+ load_db, cache=FALSE
## workaround until sqlite is fixed on lustre
tmpdir <- tempfile("cuffdb_dir")
system(paste("cp -r", cuffdb_directory, tmpdir))
......@@ -73,6 +66,49 @@ replicates(cuff) %>% mutate(file=basename(file)) %>% select(-c(total_mass, norm_
#+ eval=skipQC
#hello hidden field
#######################################################################################################################
#' ## Count and Expression Score Tables
## todo merge in basic gene info
fpkmMatrix(genes(cuff)) %>% head()
gFpkmMatrix <- rownames2column(fpkmMatrix(genes(cuff)), "ensembl_gene_id")
#colnames(gFpkmMatrix) <- ifelse(is.na(sampleDic[colnames(gFpkmMatrix)]), colnames(gFpkmMatrix), sampleDic[colnames(gFpkmMatrix)])
geneFpkmWithInfo <- merge(geneInfo, gFpkmMatrix, by.x="ensembl_gene_id", all.y=T) #%>% print_head()
write.delim(geneFpkmWithInfo, file="geneFpkmWithInfo.txt")
# geneFpkmWithInfo <- read.delim("geneFpkmWithInfo.txt")
#' [Annotated Expresssion Table](geneFpkmWithInfo.txt)
geneFpkm<-fpkm(genes(cuff))
#ggplot(geneFpkm, aes(fpkm)) + geom_histogram() + scale_x_log10()
write.delim(geneFpkm, file="geneFpkm.txt")
# gene.fpkm <- read.delim("gene.fpkm.txt")
#' [FPKMs](geneFpkm.txt)
repGeneFPKMs <- repFpkmMatrix(genes(cuff))
write.delim(repGeneFPKMs, file="repGeneFPKMs.txt")
# repGeneFPKMs <- read.delim("repGeneFPKMs.txt")
# repGeneFPKMs <- local(get(load("repGeneFPKMs.RData")))
#' [FPKMs by Replicate](repGeneFPKMs.txt)
geneCounts<-count(genes(cuff))
#max(geneCounts$count)
write.delim(geneCounts, file="geneCounts.txt")
geneCgene.matrix<-fpkmMatrix(genes(cuff))
#' [Raw Counts](geneCounts.txt)
repCounts <- countMatrix(genes(cuff))
write.delim(repCounts, file="repCounts.txt")
# repCounts <- read.delim("repCounts.txt")
#' [Raw Counts by Replicate](repCounts.txt)
## tt
#######################################################################################################################
#' ## Score Distributions
......@@ -133,36 +169,6 @@ csDendro(genes(cuff),replicates=T)
#dev.off()
#######################################################################################################################
#' ## Raw data tables
## todo merge in basic gene info
geneFpkm<-fpkm(genes(cuff))
#ggplot(geneFpkm, aes(fpkm)) + geom_histogram() + scale_x_log10()
write.delim(geneFpkm, file="geneFpkm.txt")
# gene.fpkm <- read.delim("gene.fpkm.txt")
#' [FPKMs](geneFpkm.txt)
repGeneFPKMs <- repFpkmMatrix(genes(cuff))
write.delim(repGeneFPKMs, file="repGeneFPKMs.txt")
# repGeneFPKMs <- read.delim("repGeneFPKMs.txt")
# repGeneFPKMs <- local(get(load("repGeneFPKMs.RData")))
#' [FPKMs by Replicate](repGeneFPKMs.txt)
geneCounts<-count(genes(cuff))
#max(geneCounts$count)
write.delim(geneCounts, file="geneCounts.txt")
geneCgene.matrix<-fpkmMatrix(genes(cuff))
#' [Raw Counts](geneCounts.txt)
repCounts <- countMatrix(genes(cuff))
write.delim(repCounts, file="repCounts.txt")
# repCounts <- read.delim("repCounts.txt")
#' [Raw Counts by Replicate](repCounts.txt)
#######################################################################################################################
#' ## Extract lists of differentially expressed genes
......@@ -170,13 +176,17 @@ write.delim(repCounts, file="repCounts.txt")
#compPairsUniDir <- data.frame(sample_1="big_cyst", sample_2="small_cyst")
if(is.null(opts$sample_pairs)){
compPairs <- diffData(genes(cuff)) %>% select(sample_1, sample_2) %>% distinct()
}else{
if(!is.null(constrasts_file)){
# if(file.exists(constrasts_file))
echo("using contrasts matrix from: ", basename(constrasts_file))
compPairsUniDir <- read.delim(opts$sample_pairs) %>% set_names("sample_1", "sample_2")
## add other direction for completeness
compPairs <- rbind_list(compPairsUniDir, compPairsUniDir %>% select(sample_1=sample_2, sample_2=sample_1))
}else{
echo("comparing all samples against each other")
compPairs <- diffData(genes(cuff)) %>% select(sample_1, sample_2) %>% distinct()
}
......@@ -200,11 +210,6 @@ degs <- transform(degs, sample_1_overex=log2_fold_change<0)
with(degs, as.data.frame(table(sample_1, sample_2, sample_1_overex))) %>% filter(Freq >0)
ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DGE count summary") + coord_flip()
ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=sample_1_overex)) + geom_bar(position="dodge") + xlab(NULL) + ylab("# DGEs") + ggtitle("DGE counts by direction") + coord_flip()
## todo move up into table export section
## add gene description
......@@ -220,37 +225,45 @@ if(!file.exists(cacheFile)){
geneInfo <- local(get(load(cacheFile)))
}
gFpkmMatrix <- rownames2column(fpkmMatrix(genes(cuff)), "ensembl_gene_id")
#colnames(gFpkmMatrix) <- ifelse(is.na(sampleDic[colnames(gFpkmMatrix)]), colnames(gFpkmMatrix), sampleDic[colnames(gFpkmMatrix)])
geneFpkmWithInfo <- merge(geneInfo, gFpkmMatrix, by.x="ensembl_gene_id", all.y=T) %>% print_head()
write.delim(geneFpkmWithInfo, file="geneFpkmWithInfo.txt")
# geneFpkmWithInfo <- read.delim("geneFpkmWithInfo.txt")
#' [Annotated Expresssion Table](geneFpkmWithInfo.txt)
degs <- merge(degs, geneInfo, by.x="gene_id", by.="ensembl_gene_id", all.x=T)
#' differentially expressed genes
with(degs, as.data.frame(table(sample_1, sample_2, sample_1_overex)))
#subset(degs, !duplicated(paste(sample_1, sample_2, sample_1_overex, external_gene_name))) %>%
# with(as.data.frame(table(sample_1, sample_2, sample_1_overex))) %>%
# filter(Freq>0)
write.delim(degs, file="degs.txt")
# degs <- read.delim("degs.txt")
#' [dge table](degs.txt)
ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DGE count summary") + coord_flip()
ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=sample_1_overex)) + geom_bar(position="dodge") + xlab(NULL) + ylab("# DGEs") + ggtitle("DGE counts by direction") + coord_flip()
## just needed to restore environment easily
save(degs, file=".degs.RData")
# degs <- local(get(load(".degs.RData")))
########################################################################################################################
#` ## Term enrichment
#' ## Term enrichment
#+ echo=FALSE
fpkmMatrix(genes(cuff)) %>% head()
ontologies <- c(
"GOTERM_CC_FAT",
"GOTERM_MF_FAT",
"GOTERM_BP_FAT",
"PANTHER_PATHWAY",
"REACTOME_PATHWAY",
"KEGG_PATHWAY",
"GOTERM_CC_FAT",
"GOTERM_MF_FAT",
"GOTERM_BP_FAT"
)
#' This analysis was performed using [David](http://david.abcc.ncifcrf.gov/). The following ontologies were tested: `r paste(ontologies, collapse=',')`
require(RDAVIDWebService) ## just works if installed on non-network-drive (e.g. /tmp/)
geneLists <- degs %>%
......@@ -262,7 +275,7 @@ geneLists <- degs %>%
davidAnnotationChart <- function(someGenes){ ## expexted to have a column with gene_id
echo("processing list with", length(someGenes), "genes")
# echo("processing list with", length(someGenes), "genes")
# someGenes <- degs$gene_id
......@@ -277,19 +290,7 @@ davidAnnotationChart <- function(someGenes){ ## expexted to have a column with g
result<-addList(david, someGenes, idType="ENSEMBL_GENE_ID", listName=paste0("list_", sample(10000)[1]), listType="Gene")
david %>% setAnnotationCategories(c(
"GOTERM_CC_FAT",
"GOTERM_MF_FAT",
"GOTERM_BP_FAT",
"PANTHER_PATHWAY",
"REACTOME_PATHWAY",
"KEGG_PATHWAY",
"GOTERM_CC_FAT",
"GOTERM_MF_FAT",
"GOTERM_BP_FAT"
))
david %>% setAnnotationCategories(ontologies)
annoChart <-getFunctionalAnnotationChart(david)
......@@ -350,9 +351,11 @@ sigEnrResults %>% do({
#write.table(sigEnrResults, file=concat("sigEnrTerms.txt"), row.names=FALSE, sep="\t")
#+ include=FALSE
# ########################################################################################################################
##' ## Enriched KEGG Pathways
# #' ## Enriched KEGG Pathways
#
#require(pathview)
#require(png)
......@@ -414,4 +417,6 @@ sigEnrResults %>% do({
# # string_db$plot_network(network, add_link=FALSE)
# # }
# #}
#
\ No newline at end of file
#
#' Created `r format(Sys.Date(), format="%B-%d-%Y")` by `r readLines(pipe('whoami'))`
\ No newline at end of file
......@@ -24,12 +24,18 @@ Options:
-m Show Messages
--keep Keep generated Rmd and md files
'
#!docopt(doc, "-w test a b c ")$keep
#docopt(doc, "-w test a b c ")$"-w"
spin_opts <- docopt(doc)
#print(spin_opts)
r_script <- spin_opts$r_script
keep_markdown_files <- as.logical(spin_opts$keep)
if(keep_markdown_files){
print("keeping markdown files")
}
if(!file.exists(r_script)){
stop(paste("file does not exist\n", doc))
......@@ -79,7 +85,7 @@ opts_chunk$set(
knit2html(basename(rmdScript), header=cssHeader)
## also remove the .md and the .Rmd files
if(!spin_opts$keep){
if(is.logical(keep_markdown_files) & !keep_markdown_files){
file.remove(basename(rmdScript))
file.remove(basename(str_replace(r_script, "[.]R$", ".md")))
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment