Commit 216fe1dd authored by Holger Brandl's avatar Holger Brandl

cont cs analysis

parent d3140b19
......@@ -3,7 +3,7 @@
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R")
#devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/bio/diffex_commons.R")
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/bio/diffex_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/master/R/bio/diffex_commons.R")
require(tidyr)
require(knitr)
......@@ -60,6 +60,10 @@ countDataNorm <- countData %>%
# mutate(tag_fpkm=(1E9*tag_count)/(feature_length*sum(tag_count))) %>%
# mutate(tag_fpkm100=100*tag_fpkm)
#' apply fpkm normalization for expression profile visualzation
countDataNorm %>% sample_n(10) %>% kable()
countDataNorm %$% unique(sample)
countDataNorm %>%
sample_n(10000) %>%
......@@ -68,17 +72,78 @@ countDataNorm %>%
scale_x_log10(labels=comma) +
ggtitle("fpkm dist overview")
#+ fig.width=20, fig.height=18
countDataNorm %>% ggplot(aes(tag_fpkm)) +
geom_histogram() +
facet_grid(feature_type ~ sample) +
scale_x_log10()
scale_x_log10() + ggtitle("FPKM distirbution by sample and region type")
#' Where is more signal
fpkmSummary <- countDataNorm %>% group_by(feature_type, sample) %>% summarize(total_fpkm=sum(tag_fpkm))
fpkmSummary %>% ggplot(aes(sample, total_fpkm, fill=feature_type)) +
geom_bar(position="dodge", stat="identity") +
ggtitle("ChIP Signal Enrichment by Region")
## export a count matrices for dba
exportRegion="tss_2kb"
countDataNorm %>% ungroup() %>%
## later do for each feature type
# group_by(feature_type
filter(feature_type==exportRegion) %>%
## todo fix replicate hack here
transmute(ensembl_gene_id, tag_count, sample=paste0(sample, ".1")) %>%
spread(sample, tag_count) %>%
write.delim(paste0("replicate_counts.", exportRegion, ".txt"))
write.delim(countDataNorm, file="countDataNorm.txt")
# countDataNorm <- read.delim("countDataNorm.txt")
#' [countDataNorm](countDataNorm.txt)
## simple correlation
with(countDataNorm, as.data.frame(table(timepoint, protein)))
#' Performing differential binding analysis for region type `r exportRegion`
countMatrix <- countData %>% filter(feature_type==exportRegion) %>%
# group_by(ensembl_gene_id) %>% filter(max(tag_count)>0) %>%
dcast(ensembl_gene_id ~ sample, value.var="tag_count") %>%
column2rownames("ensembl_gene_id") %>% as.matrix()
countMatrix[is.na(countMatrix)] <- 0
cor(countMatrix, method="spearman") %>% melt() %>% ggplot(aes(Var1, Var2, fill=value)) + geom_tile() + rotXlab() + ggtitle("count correlation")
fpkmMatrix <- countDataNorm %>% filter(feature_type==exportRegion) %>%
# group_by(ensembl_gene_id) %>% filter(max(tag_count)>0) %>%
dcast(ensembl_gene_id ~ sample, value.var="tag_fpkm") %>%
column2rownames("ensembl_gene_id") %>% as.matrix()
fpkmMatrix[is.na(fpkmMatrix)] <- 0
cor(fpkmMatrix, method="spearman") %>% melt() %>% ggplot(aes(Var1, Var2, fill=value)) + geom_tile() + rotXlab()+ ggtitle("fpkm correlation")
#' Correlation Scatter Plots
## all vs all correlation
allCor <- countDataNorm %>% ungroup() %>%
filter(feature_type==exportRegion) %>%
select(ensembl_gene_id, tag_fpkm, sample) %>%
merge(., ., by=c("ensembl_gene_id")) %>%
rename(sample_1=sample.x, sample_2=sample.y)
allCor %>% ggplot(aes(tag_fpkm.x, tag_fpkm.y)) +
geom_point(alpha=0.05) +
ggtitle("Timepoint Correlation") +
facet_grid(sample_1~sample_2)+
# facet_wrap(timepoint+sample_1~sample_2)+
# scale_x_log10() + scale_y_log10()
scale_x_log10(labels=comma, limits=c(0.01, 10000)) + scale_y_log10(labels=comma, limits=c(0.01, 10000)) +
geom_abline(slope = 1, alpha=0.3, size=2, color="red") +
xlab("fpkm") + ylab("fpkm")
#' Timepoint correlation
timeCor <- countDataNorm %>% ungroup() %>%
filter(feature_type=="tss_2kb") %>%
select(ensembl_gene_id, tag_fpkm, protein, timepoint) %>%
......@@ -94,9 +159,9 @@ timeCorPlot <- timeCor %>% ggplot(aes(tag_fpkm.x, tag_fpkm.y)) +
# scale_x_log10() + scale_y_log10()
scale_x_log10(labels=comma, limits=c(0.01, 10000)) + scale_y_log10(labels=comma, limits=c(0.01, 10000)) +
geom_abline(slope = 1, alpha=0.3, size=2, color="red") +
xlab("fpkm") + ylab("ylab")
xlab("fpkm") + ylab("fpkm")
timeCorPlot
ggsave2(timeCorPlot, outputFormat="pdf", width=12, height=10)
#
......
......@@ -2,8 +2,6 @@
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R")
#devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/bio/diffex_commons.R")
#devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/bio/diffex_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/master/R/bio/diffex_commons.R")
......@@ -22,27 +20,11 @@ geneInfo <- quote({
}) %>%
cache_it()
## todo use docopt here
countData <- read.delim("replicate_counts.tss_2kb.txt")
names(countData) <- names(countData) %>% str_replace("[.]1", "")
########################################################################################################################
#' # Correlation Analysis
#countMatrix <- matrix(1:400,ncol=2)
#colData <- data.frame(condition=factor(c("a","b")))
regionTypeDBA <- "tss_1kb"
#' Performing differential binding analysis for region type `r regionTypeDBA`
countMatrix <- countData %>% filter(feature_type==regionTypeDBA) %>%
# group_by(ensembl_gene_id) %>% filter(max(tag_count)>0) %>%
dcast(ensembl_gene_id ~ sample, value.var="tag_count") %>%
column2rownames("ensembl_gene_id") %>% as.matrix()
countMatrix[is.na(countMatrix)] <- 0
cor(countMatrix, method="spearman") %>% melt() %>% ggplot(aes(Var1, Var2, fill=value)) + geom_tile() + rotXlab()
countMatrix <- countData %>% column2rownames("ensembl_gene_id") %>% as.matrix()
########################################################################################################################
#' # Differential binding analysis
......@@ -102,6 +84,10 @@ deResults %>% ggplot(aes(log2FoldChange)) +
facet_grid(sample_1 ~ sample_2) + geom_vline(yintercept=0, color="blue") +
xlim(-4,4)
deResults %>% ggplot(aes(pvalue)) +
geom_histogram() +
facet_grid(sample_1 ~ sample_2) + geom_vline(yintercept=0.01, slope=1, color="blue")
### see https://www.biostars.org/p/80448/ for Pairwise Comparison
#res <- results(dds, contrast=c(c("condition","H1M_Dome","H1M_Shield")))
#res <- results(dds, contrast=c(c("condition","H3K4_Shield","H1M_Shield")))
......@@ -110,7 +96,6 @@ deResults %>% ggplot(aes(log2FoldChange)) +
deResults %<>% mutate(is_hit=pvalue<0.01)
degs <- deResults %>% filter(is_hit)
ggplot(degs, aes(paste(sample_1, "vs", sample_2))) + geom_bar() +xlab(NULL) + ylab("# DBGs") +ggtitle("DBG count summary") + coord_flip()
ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=sample_1_overex)) + geom_bar(position="dodge") +xlab(NULL) + ylab("# DBGs") +ggtitle("DBG count summary by direction of expression") + coord_flip()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment