#!/usr/bin/env Rscript devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R") devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R") #devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/bio/diffex_commons.R") devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/bio/diffex_commons.R") require(tidyr) require(knitr) require.auto(digest) #' # Region Count Analysis ## spin.R geneInfo <- quote({ mart <- biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl")) c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>% biomaRt::getBM(mart=mart) }) %>% cache_it() countData <- list.files(".", "region_counts.txt", f=T) %>% ldply(function(countFile){ read.delim(countFile, header=F) %>% set_names(c("chromosome_name", "feature_start", "feature_end", "ensembl_gene_id", "score", "strand", "feature_type", "tag_count")) %>% mutate(sample=basename(countFile) %>% str_split_fixed("[.]",2) %>% subset(select=1)) }) %>% ## calcualte feature length for normalization mutate(feature_length=feature_end-feature_start+1) %>% ## discard unused columns select(-c(score, strand, feature_start, feature_end)) %>% ## extract time and protein separate(sample, c("protein", "timepoint"), remove=F) #' count data structure countData %>% sample_n(10) %>% kable() write.delim(countData, file="countData.txt") # countData <- read.delim("countData.txt") #' [countData](countData.txt) #corMat <- countData %>% select(ensembl_gene_id, feature_type, tag_count, sample) %>% dcast(ensembl_gene_id + feature_type ~ sample, value.var="tag_count") #require(GGally) #corMat %>% filter(feature_type=="tss_1kb") %>% select(-feature_type) %>% ggpairs() ## does not work because of logarithmic scale ## also load library sizes for normalization libSizes <- read.delim("algn_counts.txt", header=F) %>% set_names(c("library_size", "sample")) ## calculate fpkm normalized counts countDataNorm <- countData %>% merge(libSizes, all.x=T) %>% group_by(feature_type, protein, timepoint) %>% mutate(tag_fpkm=(1E9*tag_count)/(feature_length*library_size)) # mutate(tag_fpkm=(1E9*tag_count)/(feature_length*sum(tag_count))) %>% # mutate(tag_fpkm100=100*tag_fpkm) countDataNorm %>% sample_n(10000) %>% ggplot(aes(tag_fpkm)) + geom_histogram() + scale_x_log10(labels=comma) + ggtitle("fpkm dist overview") #+ fig.width=20, fig.height=18 countDataNorm %>% ggplot(aes(tag_fpkm)) + geom_histogram() + facet_grid(feature_type ~ sample) + scale_x_log10() write.delim(countDataNorm, file="countDataNorm.txt") # countDataNorm <- read.delim("countDataNorm.txt") #' [countDataNorm](countDataNorm.txt) ## simple correlation timeCor <- countDataNorm %>% ungroup() %>% filter(feature_type=="tss_2kb") %>% select(ensembl_gene_id, tag_fpkm, protein, timepoint) %>% merge(., ., by=c("ensembl_gene_id", "protein")) %>% filter(ac(timepoint.x)>ac(timepoint.y)) %>% rename(sample_1=timepoint.x, sample_2=timepoint.y) timeCorPlot <- timeCor %>% ggplot(aes(tag_fpkm.x, tag_fpkm.y)) + geom_point(alpha=0.05) + ggtitle("Timepoint Correlation") + facet_grid(sample_1~protein+sample_2)+ # facet_wrap(timepoint+sample_1~sample_2)+ # scale_x_log10() + scale_y_log10() scale_x_log10(labels=comma, limits=c(0.01, 10000)) + scale_y_log10(labels=comma, limits=c(0.01, 10000)) + geom_abline(slope = 1, alpha=0.3, size=2, color="red") + xlab("fpkm") + ylab("ylab") ggsave2(timeCorPlot, outputFormat="pdf", width=12, height=10) # #corrPlot <- function(fpkmData){ ## fpkmData <- mmusDiff # hitCounts <- subset(fpkmData, isHit) %>% # with(as.data.frame(table(sample_1, sample_2, sample_1_overex=log2_fold_change<0))) %>% # subset(Freq>0) %>% mutate(sample_1_overex=as.logical(sample_1_overex)) # # ggplot(fpkmData, aes(value_1, value_2, color=isHit)) + # scale_color_manual(values=c("darkgrey", "red"))+ ## scale_size_manual(values=c(0.4, 4)) + # geom_point(alpha=0.6, size=2) + ## scale_x_log10(labels=comma) + scale_y_log10(labels=comma) + # scale_x_log10(labels=comma, limits=c(0.01, 10000)) + scale_y_log10(labels=comma, limits=c(0.01, 10000)) + # facet_grid(sample_1 ~ sample_2) + # geom_text(aes(label=Freq), color="black", data=hitCounts %>% mutate(value_1=500., value_2=0.1), subset=.(sample_1_overex)) + # geom_text(aes(label=Freq),color="black", data=hitCounts %>% mutate(value_1=0.1, value_2=500.), subset=.(!sample_1_overex)) + # theme_bw() + xlab("") + ylab("") + guides(colour = guide_legend(override.aes = list(size=4))) #} # #mmusFpkmCorr <- allDiff %>% arrange(isHit) %>% corrPlot() + ggtitle("Mouse FPKM Correlation") + facet_grid(; mmusFpkmCorr # #ggsave2(mmusFpkmCorr, outputFormat="png", width=9, height=7)