Skip to content
Snippets Groups Projects
Commit 2109980a authored by Holger Brandl's avatar Holger Brandl
Browse files

cont cs analysis

parent 4943ea05
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env Rscript
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R")
#devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/bio/diffex_commons.R")
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/bio/diffex_commons.R")
require(tidyr)
require(knitr)
require.auto(digest)
#' # Region Count Analysis
## spin.R
geneInfo <- quote({
mart <- biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl"))
c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>%
biomaRt::getBM(mart=mart)
}) %>%
cache_it()
countData <- list.files(".", "region_counts.txt", f=T) %>% ldply(function(countFile){
read.delim(countFile, header=F) %>%
set_names(c("chromosome_name", "feature_start", "feature_end", "ensembl_gene_id", "score", "strand", "feature_type", "tag_count")) %>%
mutate(sample=basename(countFile) %>% str_split_fixed("[.]",2) %>% subset(select=1))
}) %>%
## calcualte feature length for normalization
mutate(feature_length=feature_end-feature_start+1) %>%
## discard unused columns
select(-c(score, strand, feature_start, feature_end)) %>%
## extract time and protein
separate(sample, c("protein", "timepoint"), remove=F)
write.delim(countData, file="countData.txt")
# countData <- read.delim("countData.txt")
#' [countData](countData.txt)
countData %>% head() %>% kable()
#corMat <- countData %>% select(ensembl_gene_id, feature_type, tag_count, sample) %>% dcast(ensembl_gene_id + feature_type ~ sample, value.var="tag_count")
#require(GGally)
#corMat %>% filter(feature_type=="tss_1kb") %>% select(-feature_type) %>% ggpairs()
## does not work because of logarithmic scale
## calculate fpkm normalized counts
countDataNorm <- countData %>%
group_by(feature_type, protein, timepoint) %>%
mutate(tag_fpkm=(1E9*tag_count)/(feature_length*sum(tag_count))) %>%
mutate(tag_fpkm100=100*tag_fpkm)
#+ fig.width=20, fig.height=18
countDataNorm %>% ggplot(aes(tag_fpkm)) +
geom_histogram() +
facet_grid(feature_type ~ sample) +
scale_x_log10()
write.delim(countDataNorm, file="countDataNorm.txt")
# countDataNorm <- read.delim("countDataNorm.txt")
#' [countDataNorm](countDataNorm.txt)
########################################################################################################################
#' # Correlation Analysis
#countMatrix <- matrix(1:400,ncol=2)
#colData <- data.frame(condition=factor(c("a","b")))
regionTypeDBA <- "tss_1kb"
#' Performing differential binding analysis for region type `r regionTypeDBA`
countMatrix <- countData %>% filter(feature_type==regionTypeDBA) %>%
# group_by(ensembl_gene_id) %>% filter(max(tag_count)>0) %>%
dcast(ensembl_gene_id ~ sample, value.var="tag_count") %>%
column2rownames("ensembl_gene_id") %>% as.matrix()
countMatrix[is.na(countMatrix)] <- 0
cor(countMatrix, method="spearman") %>% melt() %>% ggplot(aes(Var1, Var2, fill=value)) + geom_tile() + rotXlab()
########################################################################################################################
#' # Differential binding analysis
hasContastsFile=F
if(hasContastsFile){
contrasts <- read.csv(constrasts_file, header=F) %>% set_names(c("sample_1", "sample_2"))
}else{
contrasts <- data.frame(sample=colnames(countMatrix)) %>%
merge(.,., suffixes=c("_1", "_2"), by=NULL) %>%
filter(ac(sample_1)>ac(sample_2)) %>%
# filter(ac(sample_1)!=ac(sample_2)) %>%
fac2char
write.delim(contrasts.txt, "contrasts.txt")
}
#+ results='asis'
#' Used contrasts model is
contrasts %>% kable()
# See deseq [docs](http://master.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf) for details
require.auto(DESeq2)
require.auto(gplots)
## todo make sure that control comes first to get fold-changes right
#Note: In order to benefit from the default settings of the package, you should put the variable of interest
#at the end of the formula and make sure the control level is the first level.
colData <- data.frame(condition=colnames(countMatrix))
dds <- DESeqDataSetFromMatrix(countMatrix, colData, formula(~ condition))
#dds <- DESeq(dds)
dds <- DESeq(dds, fitType='local')
res <- results(dds)
resultsNames(dds)
summary(res)
deResults <- adply(contrasts, 1, splat(function(sample_1, sample_2){
# browser()
results(dds, contrast=c("condition",sample_1,sample_2)) %>%
rownames2column("ensembl_gene_id") %>%
as.data.frame() %>%
mutate(
sample_1=sample_1,
sample_2=sample_2,
sample_1_overex=log2FoldChange<0
)
}), .progress="text")
deResults %>% with(as.data.frame(table(sample_1, sample_2)))
#+ fig.width=20, fig.height=18
deResults %>% ggplot(aes(log2FoldChange)) +
geom_histogram(binwidth=0.1) +
facet_grid(sample_1 ~ sample_2) + geom_vline(yintercept=0, color="blue") +
xlim(-4,4)
### see https://www.biostars.org/p/80448/ for Pairwise Comparison
#res <- results(dds, contrast=c(c("condition","H1M_Dome","H1M_Shield")))
#res <- results(dds, contrast=c(c("condition","H3K4_Shield","H1M_Shield")))
#res <- results(dds, contrast=c(c("condition","H1M_Shield", "H3K4_Shield")))
deResults %<>% mutate(is_hit=pvalue<0.01)
degs <- deResults %>% filter(is_hit)
ggplot(degs, aes(paste(sample_1, "vs", sample_2))) + geom_bar() +xlab(NULL) + ylab("# DBGs") +ggtitle("DBG count summary") + coord_flip()
ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=log2FoldChange>1)) + geom_bar(position="dodge") +xlab(NULL) + ylab("# DBGs") +ggtitle("DBG count summary by direction of expression") + coord_flip()
## just needed to restore environment easily
save(degs, file=".degs.RData")
# degs <- local(get(load(".degs.RData")))
#res %>% as.df() %>% ggplot(aes(pvalue)) + geom_histogram()
#res %>% as.df() %>% ggplot(aes(padj)) + geom_histogram()
plotMA(res, main="DESeq2", ylim=c(-2,2))
## note see ?DESeq section on 'Experiments without replicates'
## hwo to specify which conditions to run the test on
## http://seqanswers.com/forums/showthread.php?t=28350
#For making comparisons of multiple conditions (not only against the base level of a condition), we have recently implemented contrasts in the development branch. This allows one to fit a single model, then generate log2 fold change estimates, standard errors and tests of null hypotheses for other comparisons.
#+ eval=FALSE, echo=FALSE, include=F
## DEBUG start
if(F){
## DEBUG end
## base means (see http://seqanswers.com/forums/showthread.php?t=28350&page=2)
baseMeanPerLvl <- sapply(levels(colData(dds)$condition), function(lvl) rowMeans(counts(dds,normalized=TRUE)[,colData(dds)$condition == lvl]))
counts(dds,normalized=TRUE) %>% as.df() %>% set_names(colData(dds)$condition) %>%
rownames2column("ensembl_gene_id") %>%
filter(ensembl_gene_id=="ENSDARG00000098036")
countDataNorm %>% filter(ensembl_gene_id=="ENSDARG00000000324", sample %in% c("H3HA_Oblong", "H3HA_Dome"), feature_type==regionTypeDBA)
#baseMean log2FoldChange lfcSE stat pvalue padj ensembl_gene_id sample_1 sample_2 sample_1_overex
#19 12.529275 0.95396312 0.6345064 1.5034728 0.13271717 0.8131601 ENSDARG00000000324 H3HA_Oblong H3HA_Dome FALSE
}
########################################################################################################################
#' ## Term enrichment
#+ echo=FALSE
#' This analysis was performed using [David](http://david.abcc.ncifcrf.gov/). The following ontologies were tested: `r paste(ontologies, collapse=', ')`
geneLists <- degs %>%
# transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2, "ovex", sample_1_overex, sep="_"))
transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2))
split_hit_list <- F
grpdDegs <- if(split_hit_list){
degs %>% group_by(sample_1, sample_2, sample_1_overex)
}else{
degs %>% group_by(sample_1, sample_2)
}
enrResults <- grpdDegs %>% do(davidAnnotationChart(.$ensembl_gene_id))
write.delim(enrResults, file="enrResults.txt")
# enrResults <- read.delim("enrResults.txt")
#' [Enrichment Results](enrResults.txt)
sigEnrResults <- subset(enrResults, Bonferroni <0.01)
write.delim(sigEnrResults, file="sigEnrResults.txt")
# sigEnrResults <- read.delim("sigEnrResults.txt")
#' [Very Significant Terms](sigEnrResults.txt)
## plot the enrichment results
#sigEnrResults %>% group_by(Category, add=T) %>% do({
# logPlot <- . %>% ggplot(aes(Term, PValue)) +
# geom_bar(stat="identity")+coord_flip() +
# xlab("Enriched Terms") +
# ggtitle(.$Category[1]) +
# scale_y_log10()
# print(logPlot)
#})
sigEnrResults %>% do({
enrResultsGrp <- .
## DEBUG enrResultsGrp <- sigEnrResults
label <-
dplyr::select(enrResultsGrp, matches(ifelse(split_hit_list, "sample_1|sample_2|sample_1_overex", "sample_1|sample_2"))) %>%
head(1) %>% apply(1, paste, collapse="_vs_")
echo("processing", label)
logPlot <- enrResultsGrp %>%
## fix factor order
mutate(Term=reorder(Term, -PValue) %>% reorder(as.integer(Category))) %>%
ggplot(aes(Term, PValue, fill=Category)) +
geom_bar(stat="identity")+
coord_flip() +
xlab("Enriched Terms") +
ggtitle(label) +
scale_y_log10()
ggsave(paste0(label, " enrichmed terms.pdf"))
print(logPlot)
})
#ggsave2()
...@@ -7,7 +7,7 @@ require.auto(tidyr) ...@@ -7,7 +7,7 @@ require.auto(tidyr)
require(tidyr) require(tidyr)
if (!require("DT")) devtools::install_github("rstudio/DT") ## see http://rstudio.github.io/DT/ if (!require("DT")) devtools::install_github("rstudio/DT") ## see http://rstudio.github.io/DT/
## setwd("/projects/bioinfo/holger/projects/krause_chipseq/macs2__OLD_SORTING") ## setwd("/projects/bioinfo/holger/projects/krause_chipseq/macs2")
######################################################################################################################## ########################################################################################################################
#' # Called Peaks Overview #' # Called Peaks Overview
...@@ -49,6 +49,29 @@ peaks %>% ...@@ -49,6 +49,29 @@ peaks %>%
ggtitle("peak width distribution") + ggtitle("peak width distribution") +
xlim(0,1000) xlim(0,1000)
peaks %>%
# sample_frac(0.3) %>%
ggplot(aes(sample, peak_width)) +
geom_jitter(alpha=0.05, data=sample_frac(peaks, 0.1)) +
geom_violin(alpha=0.8) +
facet_wrap(~ peak_type) +
# + geom_vline(yintercept=0, color="blue")
ggtitle("peak width distribution") +
ylim(0,1000) +
rotXlab()
peaks %>%
# sample_frac(0.3) %>%
ggplot(aes(sample, score)) +
geom_jitter(alpha=0.05, data=sample_frac(peaks, 0.1)) +
geom_violin(alpha=0.8) +
facet_wrap(~ peak_type) +
# + geom_vline(yintercept=0, color="blue")
ggtitle("peak score distribution") +
ylim(0,100) +
rotXlab()
peaks %>% peaks %>%
# sample_frac(0.3) %>% # sample_frac(0.3) %>%
ggplot(aes(sample, peak_width)) + ggplot(aes(sample, peak_width)) +
...@@ -60,13 +83,31 @@ peaks %>% ...@@ -60,13 +83,31 @@ peaks %>%
ylim(0,1000) ylim(0,1000)
#peaks %>% group_by(sample, type) %>% summarize(num_peaks=n(), mean_width=median(end_position-start_position)) #peaks %>% group_by(sample, type) %>% summarize(num_peaks=n(), mean_width=median(end_position-start_position))
peaks %>% ggplot(aes(sample, fill=peak_type)) + geom_bar() + ggtitle("peak counts") peaks %>% ggplot(aes(sample, fill=peak_type)) + geom_bar(position="dodge") + ggtitle("peak counts")
peaks %>% peaks %>% sample_frac(0.1) %>%
sample_fract(0.1) %>% group_by(sample) %>%
arrange(-qvalue) %>% head arrange(-score) %>%
mutate(x=row_number()/n()) %>%
# sample_frac(0.3) %>% # sample_frac(0.3) %>%
ggplot(aes(qvalue, group=sample)) + ggplot(aes(x, qvalue, color=sample)) +
geom_line() + geom_line() +
facet_wrap(~ peak_type) facet_wrap(~ peak_type) +
xlim(0, 0.2) + ylim(0,50)
#
## chippeak anno
#+ eval=FALSE, echo=FALSE
## DEBUG start
if(F){
## https://www.biostars.org/p/115101/
mydata.ranged<-BED2RangedData("hmedip_allpeaks.bed",header=FALSE)
#http://www.bioconductor.org/packages/release/bioc/vignettes/ChIPpeakAnno/inst/doc/ChIPpeakAnno.pdf
}
## DEBUG end
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment