cont. refined dba analysis

c1c8e5ec · Holger Brandl · cce5b893 · cce5b893
Commit c1c8e5ec authored 10 years ago by Holger Brandl
--- a/chipseq_workflow/cs_compare_regions.R
+++ b/chipseq_workflow/cs_compare_regions.R
-#!/usr/bin/env Rscript
-
-devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R")
-devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R")
-#devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/bio/diffex_commons.R")
-devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/master/R/bio/diffex_commons.R")
-
-require(tidyr)
-require(knitr)
-require.auto(digest)
-
-
-
-#' # Region Count Analysis
-## spin.R
-
-geneInfo <- quote({
-        mart <- biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl"))
-        c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>%
-            biomaRt::getBM(mart=mart)
-    }) %>%
-    cache_it()
-
-
-countData <- list.files(".", "region_counts.txt", f=T) %>% ldply(function(countFile){
-        read.delim(countFile, header=F) %>%
-            set_names(c("chromosome_name", "feature_start", "feature_end", "ensembl_gene_id", "score", "strand", "feature_type", "tag_count")) %>%
-            mutate(sample=basename(countFile) %>% str_split_fixed("[.]",2) %>% subset(select=1))
-    }) %>%
-    ## calcualte feature length for normalization
-    mutate(feature_length=feature_end-feature_start+1) %>%
-    ## discard unused columns
-    select(-c(score, strand, feature_start, feature_end)) %>%
-    ## extract time and protein
-    separate(sample, c("protein", "timepoint"), remove=F)
-
-
-
-#' count data structure
-countData %>% sample_n(10) %>% kable()
-
-write.delim(countData, file="countData.txt")
-# countData <- read.delim("countData.txt")
-#'  [countData](countData.txt)
-
-#corMat <- countData %>% select(ensembl_gene_id, feature_type, tag_count, sample) %>%  dcast(ensembl_gene_id + feature_type ~ sample, value.var="tag_count")
-
-#require(GGally)
-#corMat %>% filter(feature_type=="tss_1kb") %>% select(-feature_type) %>% ggpairs()
-## does not work because of logarithmic scale
-
-## also load library sizes for normalization
-libSizes <- read.delim("algn_counts.txt", header=F) %>% set_names(c("library_size", "sample"))
-
-## calculate fpkm normalized counts
-countDataNorm <- countData %>%
-    merge(libSizes, all.x=T) %>%
-    group_by(feature_type, protein, timepoint) %>%
-    mutate(tag_fpkm=(1E9*tag_count)/(feature_length*library_size))
-#    mutate(tag_fpkm=(1E9*tag_count)/(feature_length*sum(tag_count))) %>%
-#    mutate(tag_fpkm100=100*tag_fpkm)
-
-#' apply fpkm normalization for expression profile visualzation
-countDataNorm %>% sample_n(10) %>% kable()
-
-countDataNorm %$% unique(sample)
-
-countDataNorm %>%
-    sample_n(10000) %>%
-    ggplot(aes(tag_fpkm)) +
-        geom_histogram() +
-        scale_x_log10(labels=comma) +
-        ggtitle("fpkm dist overview")
-
-
-#+ fig.width=20, fig.height=18
-countDataNorm %>% ggplot(aes(tag_fpkm)) +
-    geom_histogram() +
-    facet_grid(feature_type ~ sample) +
-    scale_x_log10() + ggtitle("FPKM distirbution by sample and region type")
-
-
-#' Where is more signal
-fpkmSummary <- countDataNorm %>% group_by(feature_type, sample) %>%  summarize(total_fpkm=sum(tag_fpkm))
-fpkmSummary %>% ggplot(aes(sample, total_fpkm, fill=feature_type)) +
-    geom_bar(position="dodge", stat="identity") +
-    ggtitle("ChIP Signal Enrichment by Region")
-
-
-## export a count matrices for dba
-exportRegion="tss_2kb"
-countDataNorm %>% ungroup() %>%
-    ## later do for each feature type
-#    group_by(feature_type
-    filter(feature_type==exportRegion) %>%
-    ## todo fix replicate hack here
-    transmute(ensembl_gene_id, tag_count, sample=paste0(sample, ".1")) %>%
-    spread(sample, tag_count)  %>%
-    write.delim(paste0("replicate_counts.", exportRegion, ".txt"))
-
-write.delim(countDataNorm, file="countDataNorm.txt")
-# countDataNorm <- read.delim("countDataNorm.txt")
-#'  [countDataNorm](countDataNorm.txt)
-
-
-with(countDataNorm, as.data.frame(table(timepoint, protein)))
-
-
-
-#' Performing differential binding analysis for region type `r exportRegion`
-countMatrix <- countData %>% filter(feature_type==exportRegion) %>%
-#    group_by(ensembl_gene_id) %>% filter(max(tag_count)>0) %>%
-    dcast(ensembl_gene_id ~ sample, value.var="tag_count") %>%
-    column2rownames("ensembl_gene_id") %>% as.matrix()
-countMatrix[is.na(countMatrix)] <- 0
-cor(countMatrix, method="spearman") %>% melt() %>% ggplot(aes(Var1, Var2, fill=value)) + geom_tile() + rotXlab() + ggtitle("count correlation")
-
-fpkmMatrix <- countDataNorm %>% filter(feature_type==exportRegion) %>%
-#    group_by(ensembl_gene_id) %>% filter(max(tag_count)>0) %>%
-    dcast(ensembl_gene_id ~ sample, value.var="tag_fpkm") %>%
-    column2rownames("ensembl_gene_id") %>% as.matrix()
-fpkmMatrix[is.na(fpkmMatrix)] <- 0
-cor(fpkmMatrix, method="spearman") %>% melt() %>% ggplot(aes(Var1, Var2, fill=value)) + geom_tile() + rotXlab()+ ggtitle("fpkm correlation")
-
-
-#' Correlation Scatter Plots
-
-## all vs all correlation
-allCor <- countDataNorm %>% ungroup() %>%
-    filter(feature_type==exportRegion) %>%
-    select(ensembl_gene_id, tag_fpkm, sample) %>%
-    merge(., ., by=c("ensembl_gene_id")) %>%
-    rename(sample_1=sample.x, sample_2=sample.y)
-
-
-allCor %>% ggplot(aes(tag_fpkm.x, tag_fpkm.y)) +
-        geom_point(alpha=0.05) +
-        ggtitle("Timepoint Correlation") +
-        facet_grid(sample_1~sample_2)+
-    #        facet_wrap(timepoint+sample_1~sample_2)+
-    #     scale_x_log10() + scale_y_log10()
-         scale_x_log10(labels=comma, limits=c(0.01, 10000)) + scale_y_log10(labels=comma, limits=c(0.01, 10000)) +
-         geom_abline(slope = 1, alpha=0.3, size=2, color="red") +
-         xlab("fpkm") + ylab("fpkm")
-
-#' Timepoint correlation
-timeCor <- countDataNorm %>% ungroup() %>%
-    filter(feature_type=="tss_2kb") %>%
-    select(ensembl_gene_id, tag_fpkm, protein, timepoint) %>%
-    merge(., ., by=c("ensembl_gene_id", "protein")) %>%
-    filter(ac(timepoint.x)>ac(timepoint.y)) %>%
-    rename(sample_1=timepoint.x, sample_2=timepoint.y)
-
-timeCorPlot <- timeCor %>% ggplot(aes(tag_fpkm.x, tag_fpkm.y)) +
-    geom_point(alpha=0.05) +
-    ggtitle("Timepoint Correlation") +
-    facet_grid(sample_1~protein+sample_2)+
-#        facet_wrap(timepoint+sample_1~sample_2)+
-#     scale_x_log10() + scale_y_log10()
-     scale_x_log10(labels=comma, limits=c(0.01, 10000)) + scale_y_log10(labels=comma, limits=c(0.01, 10000)) +
-     geom_abline(slope = 1, alpha=0.3, size=2, color="red") +
-     xlab("fpkm") + ylab("fpkm")
-
-timeCorPlot
-
-ggsave2(timeCorPlot, outputFormat="pdf", width=12, height=10)
-#
-#corrPlot <- function(fpkmData){
-##    fpkmData <- mmusDiff
-#    hitCounts <- subset(fpkmData, isHit) %>%
-#        with(as.data.frame(table(sample_1, sample_2, sample_1_overex=log2_fold_change<0))) %>%
-#        subset(Freq>0) %>% mutate(sample_1_overex=as.logical(sample_1_overex))
-#
-#    ggplot(fpkmData, aes(value_1, value_2, color=isHit)) +
-#        scale_color_manual(values=c("darkgrey", "red"))+
-##        scale_size_manual(values=c(0.4, 4)) +
-#        geom_point(alpha=0.6, size=2) +
-##    scale_x_log10(labels=comma) + scale_y_log10(labels=comma) +
-#        scale_x_log10(labels=comma, limits=c(0.01, 10000)) + scale_y_log10(labels=comma, limits=c(0.01, 10000)) +
-#        facet_grid(sample_1 ~ sample_2) +
-#        geom_text(aes(label=Freq), color="black", data=hitCounts %>% mutate(value_1=500., value_2=0.1), subset=.(sample_1_overex)) +
-#        geom_text(aes(label=Freq),color="black", data=hitCounts %>% mutate(value_1=0.1, value_2=500.), subset=.(!sample_1_overex)) +
-#        theme_bw() + xlab("") + ylab("") + guides(colour = guide_legend(override.aes = list(size=4)))
-#}
-#
-#mmusFpkmCorr <- allDiff %>% arrange(isHit) %>% corrPlot() + ggtitle("Mouse FPKM Correlation") + facet_grid(; mmusFpkmCorr
-#
-#ggsave2(mmusFpkmCorr, outputFormat="png", width=9, height=7)
\ No newline at end of file