Commit a5fd244b authored by brandl's avatar brandl

cleanup and prepared for handover

parent f26f0d50
Copyright 2017 Max Planck Institute of Molecular Cell Biology and Genetics, Dresden, Germany Copyright 2018 Max Planck Institute of Molecular Cell Biology and Genetics, Dresden, Germany
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
......
#!/usr/bin/env Rscript
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.4/R/core_commons.R")
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.4/R/ggplot_commons.R")
## todo replace with ngs_tools/dge_workflow/diffex_commons.R
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.40/R/bio/diffex_commons.R")
require(tidyr)
require(knitr)
require.auto(digest)
#' # Region Count Analysis
## spin.R
geneInfo <- quote({
mart <- biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl"))
c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>%
biomaRt::getBM(mart=mart)
}) %>%
cache_it()
## todo use docopt here
countData <- read.delim("replicate_counts.tss_2kb.txt")
names(countData) <- names(countData) %>% str_replace("[.]1", "")
countMatrix <- countData %>% column2rownames("ensembl_gene_id") %>% as.matrix()
########################################################################################################################
#' # Differential binding analysis
hasContastsFile=F
if(hasContastsFile){
contrasts <- read.csv(constrasts_file, header=F) %>% set_names(c("sample_1", "sample_2"))
}else{
contrasts <- data.frame(sample=colnames(countMatrix)) %>%
merge(.,., suffixes=c("_1", "_2"), by=NULL) %>%
filter(ac(sample_1)>ac(sample_2)) %>%
# filter(ac(sample_1)!=ac(sample_2)) %>%
fac2char
write.delim(contrasts.txt, "contrasts.txt")
}
#+ results='asis'
#' Used contrasts model is
contrasts %>% kable()
# See deseq [docs](http://master.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf) for details
require.auto(DESeq2)
require.auto(gplots)
## todo make sure that control comes first to get fold-changes right
#Note: In order to benefit from the default settings of the package, you should put the variable of interest
#at the end of the formula and make sure the control level is the first level.
colData <- data.frame(condition=colnames(countMatrix))
dds <- DESeqDataSetFromMatrix(countMatrix, colData, formula(~ condition))
#dds <- DESeq(dds)
dds <- DESeq(dds, fitType='local')
res <- results(dds)
resultsNames(dds)
summary(res)
deResults <- adply(contrasts, 1, splat(function(sample_1, sample_2){
# browser()
results(dds, contrast=c("condition",sample_1,sample_2)) %>%
rownames2column("ensembl_gene_id") %>%
as.data.frame() %>%
mutate(
sample_1=sample_1,
sample_2=sample_2,
sample_1_overex=log2FoldChange<0
)
}), .progress="text")
deResults %>% with(as.data.frame(table(sample_1, sample_2)))
#+ fig.width=20, fig.height=18
deResults %>% ggplot(aes(log2FoldChange)) +
geom_histogram(binwidth=0.1) +
facet_grid(sample_1 ~ sample_2) + geom_vline(xintercept=0, color="blue") +
xlim(-4,4)
deResults %>% ggplot(aes(pvalue)) +
geom_histogram() +
facet_grid(sample_1 ~ sample_2) + geom_vline(xintercept=0.01, slope=1, color="blue")
### see https://www.biostars.org/p/80448/ for Pairwise Comparison
#res <- results(dds, contrast=c(c("condition","H1M_Dome","H1M_Shield")))
#res <- results(dds, contrast=c(c("condition","H3K4_Shield","H1M_Shield")))
#res <- results(dds, contrast=c(c("condition","H1M_Shield", "H3K4_Shield")))
deResults %<>% mutate(is_hit=pvalue<0.01)
write.delim(deResults, file="deResults.txt")
# deResults <- read.delim("deResults.txt")
#' [deResults](deResults.txt)
degs <- deResults %>% filter(is_hit)
ggplot(degs, aes(paste(sample_1, "vs", sample_2))) + geom_bar() +xlab(NULL) + ylab("# DBGs") +ggtitle("DBG count summary") + coord_flip()
ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=sample_1_overex)) + geom_bar(position="dodge") +xlab(NULL) + ylab("# DBGs") +ggtitle("DBG count summary by direction of expression") + coord_flip()
## just needed to restore environment easily
save(degs, file=".degs.RData")
# degs <- local(get(load(".degs.RData")))
#res %>% as.df() %>% ggplot(aes(pvalue)) + geom_histogram()
#res %>% as.df() %>% ggplot(aes(padj)) + geom_histogram()
## todo add gene info
degs %>% inner_join(geneInfo) %>%write.delim(file="degs.txt")
# degs <- read.delim("degs.txt")
#' [degs](degs.txt)
plotMA(res, main="DESeq2", ylim=c(-2,2))
## note see ?DESeq section on 'Experiments without replicates'
## hwo to specify which conditions to run the test on
## http://seqanswers.com/forums/showthread.php?t=28350
#For making comparisons of multiple conditions (not only against the base level of a condition), we have recently implemented contrasts in the development branch. This allows one to fit a single model, then generate log2 fold change estimates, standard errors and tests of null hypotheses for other comparisons.
#+ eval=FALSE, echo=FALSE, include=F
## DEBUG start
if(F){
## DEBUG end
## base means (see http://seqanswers.com/forums/showthread.php?t=28350&page=2)
baseMeanPerLvl <- sapply(levels(colData(dds)$condition), function(lvl) rowMeans(counts(dds,normalized=TRUE)[,colData(dds)$condition == lvl]))
counts(dds,normalized=TRUE) %>% as.df() %>% set_names(colData(dds)$condition) %>%
rownames2column("ensembl_gene_id") %>%
filter(ensembl_gene_id=="ENSDARG00000098036")
countDataNorm %>% filter(ensembl_gene_id=="ENSDARG00000000324", sample %in% c("H3HA_Oblong", "H3HA_Dome"), feature_type==regionTypeDBA)
#baseMean log2FoldChange lfcSE stat pvalue padj ensembl_gene_id sample_1 sample_2 sample_1_overex
#19 12.529275 0.95396312 0.6345064 1.5034728 0.13271717 0.8131601 ENSDARG00000000324 H3HA_Oblong H3HA_Dome FALSE
}
########################################################################################################################
#' ## Term enrichment
#+ echo=FALSE
#' This analysis was performed using [David](http://david.abcc.ncifcrf.gov/). The following ontologies were tested: `r paste(ontologies, collapse=', ')`
geneLists <- degs %>%
# transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2, "ovex", sample_1_overex, sep="_"))
transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2))
split_hit_list <- F
grpdDegs <- if(split_hit_list){
degs %>% group_by(sample_1, sample_2, sample_1_overex)
}else{
degs %>% group_by(sample_1, sample_2)
}
enrResults <- grpdDegs %>% do(davidAnnotationChart(.$ensembl_gene_id))
write.delim(enrResults, file="enrResults.txt")
# enrResults <- read.delim("enrResults.txt")
#' [Enrichment Results](enrResults.txt)
sigEnrResults <- subset(enrResults, Bonferroni <0.01)
write.delim(sigEnrResults, file="sigEnrResults.txt")
# sigEnrResults <- read.delim("sigEnrResults.txt")
#' [Very Significant Terms](sigEnrResults.txt)
## plot the enrichment results
#sigEnrResults %>% group_by(Category, add=T) %>% do({
# logPlot <- . %>% ggplot(aes(Term, PValue)) +
# geom_bar(stat="identity")+coord_flip() +
# xlab("Enriched Terms") +
# ggtitle(.$Category[1]) +
# scale_y_log10()
# print(logPlot)
#})
sigEnrResults %>% do({
enrResultsGrp <- .
## DEBUG enrResultsGrp <- sigEnrResults
label <-
dplyr::select(enrResultsGrp, matches(ifelse(split_hit_list, "sample_1|sample_2|sample_1_overex", "sample_1|sample_2"))) %>%
head(1) %>% apply(1, paste, collapse="_vs_")
echo("processing", label)
logPlot <- enrResultsGrp %>%
## fix factor order
mutate(Term=reorder(Term, -PValue) %>% reorder(as.integer(Category))) %>%
ggplot(aes(Term, PValue, fill=Category)) +
geom_bar(stat="identity")+
coord_flip() +
xlab("Enriched Terms") +
ggtitle(label) +
scale_y_log10()
ggsave(paste0(label, " enrichmed terms.pdf"))
print(logPlot)
})
#ggsave2()
########################################################################################################################
### Chromosome Sorting
## By default the sorting of samtools faidx is not good, but we can resort it, and samtools sort will respect this sorting order
## To fix existing alginments either use ReorderSam or reheader | resort with samtools
## check consist sorting order
#samtools view /projects/bioinfo/holger/projects/krause_chipseq/alignments_mmfilt/H3HA_Sphere_mmf.bam | cut -f 3 | uniq
#sort -k1,1g -k2,2n $regionModel.bed | cut -f1 | uniq
### --> use reorder sam http://sourceforge.net/p/samtools/mailman/samtools-help/thread/4DB6F0CD.6050403@umdnj.edu/
sort -k1V $bowtieIndex.fa.dict_tmp > $bowtieIndex.dict
samtools view -h $bamFile | removeMultiMappers > ${bamBaseName}_mmf.tmp.bam
samtools index ${bamBaseName}_mmf.tmp.bam # because ReorderSam runs substantially faster if the input is an indexed BAM file.
# Use Picard ResortSam for resorting --> this will sort according to chromosome order in reference fasta only
picard ReorderSam I=${bamBaseName}_mmf.tmp.bam O=${bamBaseName}_mmf.bam REFERENCE=$bowtieIndex.fa
## samtools view -h $bamFile | removeMultiMappers | picard ReorderSam I=/dev/stdin O=${bamBaseName}_mmf.bam REFERENCE=$bowtieIndex.fa
samtools index ${bamBaseName}_mmf.bam
## extract chromosome from bam file
samtools view -bo test.bam /home/brandl/mnt/chip-seq_study/ChIPSeq_February_2014/alignments_trimmed_nomulti_pooled/H2Az.bam 1
########################################################################################################################
## macs2 playground
macs2 -n --broad --gsize
#macs2 -n --gsize
########################################################################################################################
### deeptools
## https://github.com/fidelram/deepTools/wiki/All-command-line-options#bamCorrelate
#bamCorrelate bins --bamfiles $bamFiles --region 10:1:100000 --plotFile="bam_correlation.png" --numberOfProcessors=4 --corMethod spearman
## see how well bam files correlate using untrimmed data
bamCorrelate bins --bamfiles $bamFiles --plotFile="bam_correlation_untrimmed.png" --numberOfProcessors=4 --corMethod spearman
########################################################################################################################
### bam-index all files in directory
#for bamFile in $(ls $baseDir/alignments_trimmed_nomulti_pooled/*.bam); do (samtools index $bamFile ) & done
########################################################################################################################
### Peak calling with SPP
Rscript /home/brandl/bin/phantompeakqualtools/run_spp.R -c="../alignments_untrimmed/H2Az_Rep1_Lane2_Lib4454.0001.bam" -savp -out="test.txt"
########################################################################################################################
### other qc
## http://www.nature.com/nmeth/journal/v11/n1/full/nmeth.2786.html
# Teytelman et al. discovered that highly expressed loci were always enriched in ChIP peaks, regardless of which protein was pulled dow
#########################################################################################################################
#### CHANCE
mcdir $baseDir/qc_chance
## convert example to sam format
bamToBed -i $baseDir/alignments_trimmed/H2Az_Rep1_Lane1_Lib4454_ca.bam > H2Az_Rep1_Lane1_Lib4454_ca.bed
samtools view -h -o H2Az_Rep1_Lane1_Lib4454_ca.sam $baseDir/alignments_trimmed/H2Az_Rep1_Lane1_Lib4454_ca.bam
/local/home/henry/bin/CHANCE/run_chance_linux.sh /usr/local/MATLAB/MATLAB_Compiler_Runtime/v717/
########################################################################################################################
## Use phantompeakqualtools to calculate quality tag (strand cross-correlation of peak)
# did the antibody-treatment enrich sufficiently so that the ChIP signal can be separated from the background signal?
## --> use https://code.google.com/p/phantompeakqualtools/
mcdir $baseDir/qc_phantompeaks
#Rscript /local/home/brandl/bin/phantompeakqualtools/run_spp.R
#Rscript run_spp.R <bamfile> -savp -out=<outfile>
ctrlBam=$baseDir/alignments_untrimmed/H3-3_Rep1_Lane1_Lib4453.bam
#DEBUG ctrlBam=$baseDir/alignments_untrimmed/H3K4me3_Rep2_Lane2_Lib4455.bam
Rscript /projects/bioinfo/holger/bin/phantompeakqualtools/run_spp.R
for bamFile in $baseDir/alignments_untrimmed/*bam ; do
echo "processing $bamFile..."
# Rscript /home/brandl/bin/phantompeakqualtools/run_spp.R -c=$bamFile -savp -ou t=$(basename $bamFile).pt.log
## one output file is enough because output is tabular including inpuyt file name
# Rscript /home/brandl/bin/phantompeakqualtools/run_spp.R -c=$bamFile -i=$ctrlBam -savp -savd -odir=$(pwd) -out=phantom_qc.txt
## with control
# ( Rscript /home/brandl/bin/phantompeakqualtools/run_spp.R -c=$bamFile -savp -savd -odir=$(pwd) -out=phantom_qc_noctrl.txt &> $(basename $bamFile).pt.log ) &
## without control
# ( Rscript /home/brandl/bin/phantompeakqualtools/run_spp.R -c=$bamFile -savp -savd -odir=$(pwd) -out=phantom_qc_noctrl.txt &> $(basename $bamFile).pt.log ) &
done
wait
## todo visualize results
#phantom output format
#format:Filename<tab>numReads<tab>estFragLen<tab>corr_estFragLen<tab>PhantomPeak<tab>corr_phantomPeak<tab>argmin_corr<tab>min_corr<tab>Normalized SCC (NSC)<tab>Relative SCC (RSC)<tab>QualityTag)
mailme "phantom qc done"
########################################################################################################################
### sorted counting
#
## perform the counts analysis (see https://www.biostars.org/p/13980/)
## presort the bed files for faster counting
for bedFile in $(ls *bed | grep -v sortedbam | grep -v sorted | grep -v pooled); do
sort -k1,1 -k2,2n $bedFile > ${bedFile%%.bed}.sorted.bed
done
ll $bamFiles
## convert the bams to sorted bed for fast counting
for bamFile in $bamFiles; do
# DEBUG bamFile=/projects/bioinfo/holger/projects/khan_chipsseq_h1/alignments_mmfilt/H1M_Dome_mmf.bam
bamBaseName=$(basename $(basename $bamFile .bam) _mmf)
mysub "${project}__ib__${bamBaseName}" " bamToBed -i $bamFile | sort -k1,1 -k2,2n > ${bamBaseName}.sortedbam.bed" | joblist .bam2bed
done
wait4jobs .bam2bed
for bamBedFile in $(ls *sortedbam.bed); do
for sortedBed in $(ls *sorted.bed); do
# for sortedBed in zv9_ens79_promoters1kb.sorted.bed; do
# for sortedBed in zv9_ens79_firstex.sorted.bed zv9_ens79_lastex.sorted.bed; do
# DEBUG bamBedFile=H3K4_Shield.sortedbam.bed
# DEBUG sortedBed=zv9_ens79_genebodies.sorted.bed
(
bamBaseName=$(basename $bamBedFile .sortedbam.bed)
bedBaseName=$(basename $sortedBed .sorted.bed | sed 's/zv9_ens79_//g')
## note bam-mode not yet supported for sorted
# mysub "${project}__ib__${bamBaseName}" "
intersectBed -c -sorted -a $sortedBed -b $bamBedFile > ${bamBaseName}.${bedBaseName}.region_counts.txt
# " | joblist .counts
# bamToBed -i $bamFile | sort -k1,1 -k2,2n | intersectBed -c -sorted -a $sortedBed -b stdin > ${bamBaseName}.${bedBaseName}.region_counts.txt
# bamToBed -i $bamFile | sort -k1,1 -k2,2n > ${bamBaseName}.sorted.bed
# intersectBed -c -sorted -a $regionModel.sorted.bed -b ${bamBaseName}.sorted.bed > ${bamBaseName}.region_counts.txt
)&
done
done
wait
#wait4jobs .counts
mailme "${project} counts analysis done"
########################################################################################################################
### motifs
## run meme-chip
########################################################################################################################
### profiles
## try deepTools
## differential binding diffbind
diffReps
"Negative Binomial tests implemented in diffReps (Shen et al. 2013) were used to detect differential histone
modification regions using a sliding window of 600bp (H3K4me3) or 5000bp (H3K27me3), p-value cutoff 1e-
6, sharp peaks mode for H3K4me3 (--nsb 20) and broad peak mode for H3K27me3 (--nsb 2), hg19 as reference
genome, and an average fragment size of 250bp (rest of parameters default). Differential histone modifications
regions not overlapping (at least 1bp) significant chromatin marks previously detected during peak calling at
least in one of conditions under comparison were removed. Regions were ranked by their fold-change (FC), and
reported as differentially enriched only if the absolute FC≥1.5, and Benjamini-Hochberg corrected p-value≤1e-3.
As described on the figures, figure legends and results, for Fig. S6C and S6D only the thresholds for
statistical significance were relaxed as it follows: binomial test p-value cutoff 1e-4 (diffReps standard); absolute
FC≥1.25, and Benjamini-Hochberg corrected p-value≤1e-2 (FDR 1%)."
...@@ -4,6 +4,8 @@ ...@@ -4,6 +4,8 @@
suppressMessages(require(docopt)) suppressMessages(require(docopt))
# todo refac to use tsv input along with user provided grouping attribute
doc <- ' doc <- '
Perform an enrichment analysis for a set of genes Perform an enrichment analysis for a set of genes
Usage: david_enrichment.R [options] <grouped_gene_lists_rdata> Usage: david_enrichment.R [options] <grouped_gene_lists_rdata>
...@@ -20,6 +22,10 @@ devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.12/R/core_ ...@@ -20,6 +22,10 @@ devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.12/R/core_
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.12/R/ggplot_commons.R") devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.12/R/ggplot_commons.R")
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.12/R/bio/diffex_commons.R") devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.12/R/bio/diffex_commons.R")
source(file.path(Sys.getenv("NGS_TOOLS"), "common/david_enrichment_util.R"))
## todo used tagged version instead
# devtools::source_url("https://git.mpi-cbg.de/bioinfo/ngs_tools/raw/v12/common/david_enrichment_util.RR")
require.auto(knitr) require.auto(knitr)
## to fix child support issue with knitr, see also ## to fix child support issue with knitr, see also
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
## http://www.bioconductor.org/packages/release/bioc/vignettes/RDAVIDWebService/inst/doc/RDavidWS-vignette.pdf ## http://www.bioconductor.org/packages/release/bioc/vignettes/RDAVIDWebService/inst/doc/RDavidWS-vignette.pdf
## e.g. getClusterReport --> plot2D ## e.g. getClusterReport --> plot2D
## complete list can be fetched using getAllAnnotationCategoryNames(david)
DEF_DAVID_ONTOLOGIES=ontologies=c("GOTERM_CC_FAT", "GOTERM_MF_FAT", "GOTERM_BP_FAT", "PANTHER_PATHWAY", "PANTHER_FAMILY", "PANTHER_PATHWAY", "KEGG_PATHWAY", "REACTOME_PATHWAY") DEF_DAVID_ONTOLOGIES=ontologies=c("GOTERM_CC_FAT", "GOTERM_MF_FAT", "GOTERM_BP_FAT", "PANTHER_PATHWAY", "PANTHER_FAMILY", "PANTHER_PATHWAY", "KEGG_PATHWAY", "REACTOME_PATHWAY")
davidAnnotationChart <- function( someGenes, ontologies=DEF_DAVID_ONTOLOGIES ){ davidAnnotationChart <- function( someGenes, ontologies=DEF_DAVID_ONTOLOGIES ){
...@@ -18,10 +19,12 @@ davidAnnotationChart <- function( someGenes, ontologies=DEF_DAVID_ONTOLOGIES ){ ...@@ -18,10 +19,12 @@ davidAnnotationChart <- function( someGenes, ontologies=DEF_DAVID_ONTOLOGIES ){
if(length(someGenes)>1500){ if(length(someGenes)>1500){
# stop("it's up to you to reduce your input to 1500 genes")
warning("trimming david query list to 1500 entries")
someGenes <- sample(someGenes) %>% head(1500) someGenes <- sample(someGenes) %>% head(1500)
} }
david<-DAVIDWebService$new(email="brandl@mpi-cbg.de") david<-DAVIDWebService$new(email="bioinformatics@mpi-cbg.de")
# ## list all ontologies # ## list all ontologies
# getAllAnnotationCategoryNames(david) # getAllAnnotationCategoryNames(david)
......
...@@ -8,11 +8,6 @@ Links & Todo ...@@ -8,11 +8,6 @@ Links & Todo
Other enrichment tools Other enrichment tools
* http://amp.pharm.mssm.edu/Enrichr/ * http://amp.pharm.mssm.edu/Enrichr/
* http://amp.pharm.mssm.edu/Harmonizome/ * http://amp.pharm.mssm.edu/Harmonizome/
Other interesting tools to perform enrichment testing
-----------------------------------------------------
* [piano](http://www.bioconductor.org/packages/release/bioc/html/piano.html) * [piano](http://www.bioconductor.org/packages/release/bioc/html/piano.html)
......
#!/usr/bin/env Rscript
#+ setup, cache=FALSE
suppressMessages(require(docopt))
doc <- '
Convert a cuffdiff into a dge-report
Usage: cuffdiff_report.R [options] <cuffdb_directory>
Options:
--constrasts=<tab_delim_table> Table with sample pairs for which dge analysis should be performed
--qcutoff <qcutoff> Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01]
--pcutoff <pcutoff> Override q-value filter and filter by p-value instead
--minfpkm <minfpkm> Minimal expression in any of the sample to be included in the final result list [default: 1]
'
#--directed Perform directed dge-steps in a directed manner
#-S Dont do general statistics and qc analysis
#print("args ars")
#print(commandArgs(TRUE))
opts <- docopt(doc, commandArgs(TRUE)) ## does not work when spining
## florio_11b: setwd("/lustre/projects/bioinfo/holger/projects/florio_11b/cuffdiff/dge_report"); opts <- docopt(doc, "--constrasts contrasts.txt --pcutoff 0.01 /projects/bioinfo/holger/projects/florio_11b/cuffdiff")
# opts <- docopt(doc, "--undirected ..")
# opts <- docopt(doc, "..")
# opts <- docopt(doc, "--pcutoff 0.05 ..")
# opts <- docopt(doc, paste(Sys.getenv("DGE_PARAMS"), ".."))
## todo use commandArgs here!!
#opts <- docopt(doc, paste(Sys.getenv("DGE_PARAMS"), paste(commandArgs(TRUE), collapse=" ")))
#print(opts)
skipQC <<- opts$S
#split_hit_list <- opts$directed
constrasts_file <- opts$constrasts
pcutoff <- if(is.null(opts$pcutoff)) NULL else as.numeric(opts$pcutoff)
qcutoff <- if(is.numeric(pcutoff)) NULL else as.numeric(opts$qcutoff)
if(is.numeric(pcutoff)) opts$qcutoff <- NULL
minFPKM <- as.numeric(opts$minfpkm)
## do basic validation of configuration
stopifnot(is.numeric(qcutoff) | is.numeric(pcutoff))
cuffdb_directory <- normalizePath(opts$cuffdb_directory)
if(is.na(file.info(cuffdb_directory)$isdir)){
stop(paste("db directory does not exist", doc))
}
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.12/R/core_commons.R")
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.12/R/ggplot_commons.R")
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.12/R/bio/diffex_commons.R")
#+ results='asis', echo=FALSE
cat('
<link rel="stylesheet" type="text/css" href="http://cdn.datatables.net/1.10.5/css/jquery.dataTables.min.css">
<script type="text/javascript" charset="utf8" src="http://code.jquery.com/jquery-2.1.2.min.js"></script>
<script type="text/javascript" charset="utf8" src="http://cdn.datatables.net/1.10.5/js/jquery.dataTables.min.js"></script>
<script type="text/javascript">
$(document).ready(function() {
// alert("test")
$(".dtable").DataTable();
} );
</script>
')
require(cummeRbund)
require(knitr)
## reload dplyr to fix namespace issues
unloadNamespace('dplyr'); require(dplyr)
#iris %>% count(Species)
#' Used DGE Analysis Parameters
#vec2df(unlist(opts)) %>% filter(!str_detect(name, "^[<-]")) %>% kable()
########################################################################################################################
#' # Differential Gene Expression Analysis
#' [CuffDiff](http://cufflinks.cbcb.umd.edu/manual.html) and [cummeRbund](http://compbio.mit.edu/cummeRbund/) were used to perform this analysis. For details about how the test for differntial expression works, refer to [Differential analysis of gene regulation at transcript resolution with RNA-seq](http://www.nature.com/nbt/journal/v31/n1/full/nbt.2450.html).
#+ load_db, cache=FALSE, warning=FALSE
isCluster=Sys.getenv("LSF_SERVERDIR")!=""
isBioinfoMountPoint=str_detect(normalizePath(cuffdb_directory), fixed("/mnt/"))
isLocalMountPoint=str_detect(normalizePath(cuffdb_directory), fixed("/Volumes/projects"))
dbTempCopyRequired=isCluster | isBioinfoMountPoint | isLocalMountPoint
if(dbTempCopyRequired){
## workaround until sqlite is fixed on lustre
tmpdir <- tempfile("cuffdb_dir")
system(paste("cp -r", cuffdb_directory, tmpdir))
cuff <- readCufflinks(dir=tmpdir)
}else{
cuff <- readCufflinks(dir="..")
}
#' The expression tracking reference contained:
cuff
#' Used CuffDiff:
runInfo(cuff) %>% filter(param!="cmd_line") %>% kable()
#' cuffdiff cmd was
#+ results='asis'
cat((runInfo(cuff) %>% filter(param=="cmd_line"))$value %>% gsub('(.{1,80})(,|$|\\s)', '\\1\n', .))
#' ### Samples
replicateInfo <- replicates(cuff) %>%
mutate(file=basename(file)) %>%
select(-c(total_mass, norm_mass, internal_scale, external_scale))
replicateInfo %>% kable() # kable("html", table.attr = "class='dtable'", escape=F)
write.delim(replicateInfo, file="replicate_info.txt")
# replicateInfo <- read.delim("replicate_info.txt")
#' [replicateInfo](replicate_info.txt)
#+ results='asis'
## define or load the contrasts
dbContrasts <- diffData(genes(cuff)) %>% select(sample_1, sample_2) %>% distinct()
if(!is.null(constrasts_file)){
echo("Using contrasts matrix from: ", basename(constrasts_file))
contrasts <- read.csv(constrasts_file, header=F) %>% set_names(c("sample_1", "sample_2"))
## filter for the direction present in cuffdb
contrasts <- rbind_list(contrasts, contrasts %>% select(sample_1=sample_2, sample_2=sample_1)) %>% inner_join(dbContrasts)
}else{
echo("Comparing all samples against each other")
contrasts <- diffData(genes(cuff)) %>% select(sample_1, sample_2) %>% distinct()
}
#' The used contrasts model is
contrasts %>% kable()
## download gene info and locations from biomart
geneInfo <- quote({
martName <- guess_mart(fpkm(genes(cuff))$gene_id)
mart <- biomaRt::useDataset(martName, mart = biomaRt::useMart("ensembl"))
biomaRt::getBM(c('ensembl_gene_id', 'external_gene_name', 'description', 'gene_biotype'), mart=mart)
}) %>% cache_it("gene_info")
geneLoci <- quote({
martName <- guess_mart(fpkm(genes(cuff))$gene_id)
mart <- biomaRt::useDataset(martName, mart = biomaRt::useMart("ensembl"))
biomaRt::getBM(c("ensembl_gene_id", "chromosome_name", "start_position", "end_position"), mart=mart)
}) %>% cache_it("gene_loci")
#knitr::opts_knit$set(root.dir = getwd())
#######################################################################################################################
#' ## Count and Expression Score Tables
#' All data has been annotated with ensembl gene information and was exprorted into . Both normalized (FPKM) and raw counts were exported.
fpkmMatrix(genes(cuff)) %>%
rownames2column("ensembl_gene_id") %>%
merge(geneInfo, by="ensembl_gene_id", all.x=T) %T>%
glimpse() %>%
write.delim(file="gene_fpkms.txt")
# gene_fpkms <- read.delim("gene_fpkms.txt")
#' [Normalized Counts](gene_fpkms.txt)
## export the same but now including replicate information
repFpkmMatrix(genes(cuff)) %>%
rownames2column("ensembl_gene_id") %>%
merge(geneInfo, by="ensembl_gene_id", all.x=T) %T>%
glimpse() %>%
write.delim(file="gene_fpkms_by_replicate.txt")
# gene_fpkms_by_replicate <- read.delim("gene_fpkms_by_replicate.txt")
#' [Normalized Counts by Replicate](gene_fpkms_by_replicate.txt)
## also export count tables
countMatrix(genes(cuff)) %>%
rownames2column("ensembl_gene_id") %>%
merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
write.delim(file="gene_counts.txt")
# gene_counts <- read.delim("gene_counts.txt")
#' [Gene Counts](gene_counts.txt)
repCountMatrix(genes(cuff)) %>%
rownames2column("ensembl_gene_id") %>%
merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
write.delim(file="gene_counts_by_replicate.txt")
# rep_counts_by_replicate <- read.delim("gene_counts_by_replicate.txt")
#' [Gene Counts by Replicate](gene_counts_by_replicate.txt)
#######################################################################################################################
#' ## Quality Control
#' Create some global summary statistic plots in order to get an idea about the general quaniltiy and distribution of the expression data. Descriptions are copied from [Cummerbund Manual](http://compbio.mit.edu/cummeRbund/manual_2_0.html)
#' Several plotting methods are available that allow for quality-control or global analysis of cufflinks data. A good place to begin is to evaluate the quality of the model fitting. Overdispersion is a common problem in RNA-Seq data. As of cufflinks v2.0 mean counts, variance, and dispersion are all emitted, allowing you to visualize the estimated overdispersion for each sample as a quality control measure.
dispersionPlot(genes(cuff)) #+ aes(alpha=0.01)
#' The coefficient of variation (CV) is defined as the ratio of the standard deviation the mean. The squared coefficient of variation is a normalized measure of cross-replicate variability that can be useful for evaluating the quality your RNA-seq data. Differences in CV2 can result in lower numbers of differentially expressed genes due to a higher degree of variability between replicate fpkm estimates.
fpkmSCVPlot(genes(cuff))
#fpkmSCVPlot(genes(cuff), showPool=T)
#' To assess the distributions of FPKM scores across samples, we plot their densities
#csDensity(genes(cuff))
#ggsave2(csBoxplot(genes(cuff))+ ggtitle("fpkm boxplots"))
#csDensity(genes(cuff),features=T)
csBoxplot(genes(cuff),replicates=T)
csBoxplot(genes(cuff))
csScatterMatrix(genes(cuff))
#csVolcano(genes(cuff),"a", "b")
#' For the volcano plot matrix we calculate fold-changes as x/y (ie column/row). For details see [here](http://rpackages.ianhowson.com/bioc/cummeRbund/man/csVolcano.html). Signifcance coloring is based on cuffdiff's internal score an not our hit criterion.
csVolcanoMatrix(genes(cuff))
#' We also plot more detailed scatter plots for just the contrasts for interest. Signifcance coloring is based on cuffdiff's internal score an not our hit criterion.