Commit 2109980a authored by Holger Brandl's avatar Holger Brandl

cont cs analysis

parent 4943ea05
#!/usr/bin/env Rscript
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R")
#devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/bio/diffex_commons.R")
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/bio/diffex_commons.R")
require(tidyr)
require(knitr)
require.auto(digest)
#' # Region Count Analysis
## spin.R
geneInfo <- quote({
mart <- biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl"))
c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>%
biomaRt::getBM(mart=mart)
}) %>%
cache_it()
countData <- list.files(".", "region_counts.txt", f=T) %>% ldply(function(countFile){
read.delim(countFile, header=F) %>%
set_names(c("chromosome_name", "feature_start", "feature_end", "ensembl_gene_id", "score", "strand", "feature_type", "tag_count")) %>%
mutate(sample=basename(countFile) %>% str_split_fixed("[.]",2) %>% subset(select=1))
}) %>%
## calcualte feature length for normalization
mutate(feature_length=feature_end-feature_start+1) %>%
## discard unused columns
select(-c(score, strand, feature_start, feature_end)) %>%
## extract time and protein
separate(sample, c("protein", "timepoint"), remove=F)
write.delim(countData, file="countData.txt")
# countData <- read.delim("countData.txt")
#' [countData](countData.txt)
countData %>% head() %>% kable()
#corMat <- countData %>% select(ensembl_gene_id, feature_type, tag_count, sample) %>% dcast(ensembl_gene_id + feature_type ~ sample, value.var="tag_count")
#require(GGally)
#corMat %>% filter(feature_type=="tss_1kb") %>% select(-feature_type) %>% ggpairs()
## does not work because of logarithmic scale
## calculate fpkm normalized counts
countDataNorm <- countData %>%
group_by(feature_type, protein, timepoint) %>%
mutate(tag_fpkm=(1E9*tag_count)/(feature_length*sum(tag_count))) %>%
mutate(tag_fpkm100=100*tag_fpkm)
#+ fig.width=20, fig.height=18
countDataNorm %>% ggplot(aes(tag_fpkm)) +
geom_histogram() +
facet_grid(feature_type ~ sample) +
scale_x_log10()
write.delim(countDataNorm, file="countDataNorm.txt")
# countDataNorm <- read.delim("countDataNorm.txt")
#' [countDataNorm](countDataNorm.txt)
########################################################################################################################
#' # Correlation Analysis
#countMatrix <- matrix(1:400,ncol=2)
#colData <- data.frame(condition=factor(c("a","b")))
regionTypeDBA <- "tss_1kb"
#' Performing differential binding analysis for region type `r regionTypeDBA`
countMatrix <- countData %>% filter(feature_type==regionTypeDBA) %>%
# group_by(ensembl_gene_id) %>% filter(max(tag_count)>0) %>%
dcast(ensembl_gene_id ~ sample, value.var="tag_count") %>%
column2rownames("ensembl_gene_id") %>% as.matrix()
countMatrix[is.na(countMatrix)] <- 0
cor(countMatrix, method="spearman") %>% melt() %>% ggplot(aes(Var1, Var2, fill=value)) + geom_tile() + rotXlab()
########################################################################################################################
#' # Differential binding analysis
hasContastsFile=F
if(hasContastsFile){
contrasts <- read.csv(constrasts_file, header=F) %>% set_names(c("sample_1", "sample_2"))
}else{
contrasts <- data.frame(sample=colnames(countMatrix)) %>%
merge(.,., suffixes=c("_1", "_2"), by=NULL) %>%
filter(ac(sample_1)>ac(sample_2)) %>%
# filter(ac(sample_1)!=ac(sample_2)) %>%
fac2char
write.delim(contrasts.txt, "contrasts.txt")
}
#+ results='asis'
#' Used contrasts model is
contrasts %>% kable()
# See deseq [docs](http://master.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf) for details
require.auto(DESeq2)
require.auto(gplots)
## todo make sure that control comes first to get fold-changes right
#Note: In order to benefit from the default settings of the package, you should put the variable of interest
#at the end of the formula and make sure the control level is the first level.
colData <- data.frame(condition=colnames(countMatrix))
dds <- DESeqDataSetFromMatrix(countMatrix, colData, formula(~ condition))
#dds <- DESeq(dds)
dds <- DESeq(dds, fitType='local')
res <- results(dds)
resultsNames(dds)
summary(res)
deResults <- adply(contrasts, 1, splat(function(sample_1, sample_2){
# browser()
results(dds, contrast=c("condition",sample_1,sample_2)) %>%
rownames2column("ensembl_gene_id") %>%
as.data.frame() %>%
mutate(
sample_1=sample_1,
sample_2=sample_2,
sample_1_overex=log2FoldChange<0
)
}), .progress="text")
deResults %>% with(as.data.frame(table(sample_1, sample_2)))
#+ fig.width=20, fig.height=18
deResults %>% ggplot(aes(log2FoldChange)) +
geom_histogram(binwidth=0.1) +
facet_grid(sample_1 ~ sample_2) + geom_vline(yintercept=0, color="blue") +
xlim(-4,4)
### see https://www.biostars.org/p/80448/ for Pairwise Comparison
#res <- results(dds, contrast=c(c("condition","H1M_Dome","H1M_Shield")))
#res <- results(dds, contrast=c(c("condition","H3K4_Shield","H1M_Shield")))
#res <- results(dds, contrast=c(c("condition","H1M_Shield", "H3K4_Shield")))
deResults %<>% mutate(is_hit=pvalue<0.01)
degs <- deResults %>% filter(is_hit)
ggplot(degs, aes(paste(sample_1, "vs", sample_2))) + geom_bar() +xlab(NULL) + ylab("# DBGs") +ggtitle("DBG count summary") + coord_flip()
ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=log2FoldChange>1)) + geom_bar(position="dodge") +xlab(NULL) + ylab("# DBGs") +ggtitle("DBG count summary by direction of expression") + coord_flip()
## just needed to restore environment easily
save(degs, file=".degs.RData")
# degs <- local(get(load(".degs.RData")))
#res %>% as.df() %>% ggplot(aes(pvalue)) + geom_histogram()
#res %>% as.df() %>% ggplot(aes(padj)) + geom_histogram()
plotMA(res, main="DESeq2", ylim=c(-2,2))
## note see ?DESeq section on 'Experiments without replicates'
## hwo to specify which conditions to run the test on
## http://seqanswers.com/forums/showthread.php?t=28350
#For making comparisons of multiple conditions (not only against the base level of a condition), we have recently implemented contrasts in the development branch. This allows one to fit a single model, then generate log2 fold change estimates, standard errors and tests of null hypotheses for other comparisons.
#+ eval=FALSE, echo=FALSE, include=F
## DEBUG start
if(F){
## DEBUG end
## base means (see http://seqanswers.com/forums/showthread.php?t=28350&page=2)
baseMeanPerLvl <- sapply(levels(colData(dds)$condition), function(lvl) rowMeans(counts(dds,normalized=TRUE)[,colData(dds)$condition == lvl]))
counts(dds,normalized=TRUE) %>% as.df() %>% set_names(colData(dds)$condition) %>%
rownames2column("ensembl_gene_id") %>%
filter(ensembl_gene_id=="ENSDARG00000098036")
countDataNorm %>% filter(ensembl_gene_id=="ENSDARG00000000324", sample %in% c("H3HA_Oblong", "H3HA_Dome"), feature_type==regionTypeDBA)
#baseMean log2FoldChange lfcSE stat pvalue padj ensembl_gene_id sample_1 sample_2 sample_1_overex
#19 12.529275 0.95396312 0.6345064 1.5034728 0.13271717 0.8131601 ENSDARG00000000324 H3HA_Oblong H3HA_Dome FALSE
}
########################################################################################################################
#' ## Term enrichment
#+ echo=FALSE
#' This analysis was performed using [David](http://david.abcc.ncifcrf.gov/). The following ontologies were tested: `r paste(ontologies, collapse=', ')`
geneLists <- degs %>%
# transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2, "ovex", sample_1_overex, sep="_"))
transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2))
split_hit_list <- F
grpdDegs <- if(split_hit_list){
degs %>% group_by(sample_1, sample_2, sample_1_overex)
}else{
degs %>% group_by(sample_1, sample_2)
}
enrResults <- grpdDegs %>% do(davidAnnotationChart(.$ensembl_gene_id))
write.delim(enrResults, file="enrResults.txt")
# enrResults <- read.delim("enrResults.txt")
#' [Enrichment Results](enrResults.txt)
sigEnrResults <- subset(enrResults, Bonferroni <0.01)
write.delim(sigEnrResults, file="sigEnrResults.txt")
# sigEnrResults <- read.delim("sigEnrResults.txt")
#' [Very Significant Terms](sigEnrResults.txt)
## plot the enrichment results
#sigEnrResults %>% group_by(Category, add=T) %>% do({
# logPlot <- . %>% ggplot(aes(Term, PValue)) +
# geom_bar(stat="identity")+coord_flip() +
# xlab("Enriched Terms") +
# ggtitle(.$Category[1]) +
# scale_y_log10()
# print(logPlot)
#})
sigEnrResults %>% do({
enrResultsGrp <- .
## DEBUG enrResultsGrp <- sigEnrResults
label <-
dplyr::select(enrResultsGrp, matches(ifelse(split_hit_list, "sample_1|sample_2|sample_1_overex", "sample_1|sample_2"))) %>%
head(1) %>% apply(1, paste, collapse="_vs_")
echo("processing", label)
logPlot <- enrResultsGrp %>%
## fix factor order
mutate(Term=reorder(Term, -PValue) %>% reorder(as.integer(Category))) %>%
ggplot(aes(Term, PValue, fill=Category)) +
geom_bar(stat="identity")+
coord_flip() +
xlab("Enriched Terms") +
ggtitle(label) +
scale_y_log10()
ggsave(paste0(label, " enrichmed terms.pdf"))
print(logPlot)
})
#ggsave2()
......@@ -7,7 +7,7 @@ require.auto(tidyr)
require(tidyr)
if (!require("DT")) devtools::install_github("rstudio/DT") ## see http://rstudio.github.io/DT/
## setwd("/projects/bioinfo/holger/projects/krause_chipseq/macs2__OLD_SORTING")
## setwd("/projects/bioinfo/holger/projects/krause_chipseq/macs2")
########################################################################################################################
#' # Called Peaks Overview
......@@ -49,6 +49,29 @@ peaks %>%
ggtitle("peak width distribution") +
xlim(0,1000)
peaks %>%
# sample_frac(0.3) %>%
ggplot(aes(sample, peak_width)) +
geom_jitter(alpha=0.05, data=sample_frac(peaks, 0.1)) +
geom_violin(alpha=0.8) +
facet_wrap(~ peak_type) +
# + geom_vline(yintercept=0, color="blue")
ggtitle("peak width distribution") +
ylim(0,1000) +
rotXlab()
peaks %>%
# sample_frac(0.3) %>%
ggplot(aes(sample, score)) +
geom_jitter(alpha=0.05, data=sample_frac(peaks, 0.1)) +
geom_violin(alpha=0.8) +
facet_wrap(~ peak_type) +
# + geom_vline(yintercept=0, color="blue")
ggtitle("peak score distribution") +
ylim(0,100) +
rotXlab()
peaks %>%
# sample_frac(0.3) %>%
ggplot(aes(sample, peak_width)) +
......@@ -60,13 +83,31 @@ peaks %>%
ylim(0,1000)
#peaks %>% group_by(sample, type) %>% summarize(num_peaks=n(), mean_width=median(end_position-start_position))
peaks %>% ggplot(aes(sample, fill=peak_type)) + geom_bar() + ggtitle("peak counts")
peaks %>% ggplot(aes(sample, fill=peak_type)) + geom_bar(position="dodge") + ggtitle("peak counts")
peaks %>%
sample_fract(0.1) %>%
arrange(-qvalue) %>% head
peaks %>% sample_frac(0.1) %>%
group_by(sample) %>%
arrange(-score) %>%
mutate(x=row_number()/n()) %>%
# sample_frac(0.3) %>%
ggplot(aes(qvalue, group=sample)) +
ggplot(aes(x, qvalue, color=sample)) +
geom_line() +
facet_wrap(~ peak_type)
facet_wrap(~ peak_type) +
xlim(0, 0.2) + ylim(0,50)
#
## chippeak anno
#+ eval=FALSE, echo=FALSE
## DEBUG start
if(F){
## https://www.biostars.org/p/115101/
mydata.ranged<-BED2RangedData("hmedip_allpeaks.bed",header=FALSE)
#http://www.bioconductor.org/packages/release/bioc/vignettes/ChIPpeakAnno/inst/doc/ChIPpeakAnno.pdf
}
## DEBUG end
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment