Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#!/usr/bin/env Rscript
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R")
#devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/bio/diffex_commons.R")
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/bio/diffex_commons.R")
require(tidyr)
require(knitr)
require.auto(digest)
#' # Region Count Analysis
## spin.R
geneInfo <- quote({
mart <- biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl"))
c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>%
biomaRt::getBM(mart=mart)
}) %>%
cache_it()
countData <- list.files(".", "region_counts.txt", f=T) %>% ldply(function(countFile){
read.delim(countFile, header=F) %>%
set_names(c("chromosome_name", "feature_start", "feature_end", "ensembl_gene_id", "score", "strand", "feature_type", "tag_count")) %>%
mutate(sample=basename(countFile) %>% str_split_fixed("[.]",2) %>% subset(select=1))
}) %>%
## calcualte feature length for normalization
mutate(feature_length=feature_end-feature_start+1) %>%
## discard unused columns
select(-c(score, strand, feature_start, feature_end)) %>%
## extract time and protein
separate(sample, c("protein", "timepoint"), remove=F)
#' count data structure
countData %>% sample_n(10) %>% kable()
write.delim(countData, file="countData.txt")
# countData <- read.delim("countData.txt")
#' [countData](countData.txt)
#corMat <- countData %>% select(ensembl_gene_id, feature_type, tag_count, sample) %>% dcast(ensembl_gene_id + feature_type ~ sample, value.var="tag_count")
#require(GGally)
#corMat %>% filter(feature_type=="tss_1kb") %>% select(-feature_type) %>% ggpairs()
## does not work because of logarithmic scale
## also load library sizes for normalization
libSizes <- read.delim("algn_counts.txt", header=F) %>% set_names(c("library_size", "sample"))
## calculate fpkm normalized counts
countDataNorm <- countData %>%
mutate(tag_fpkm=(1E9*tag_count)/(feature_length*library_size))
# mutate(tag_fpkm=(1E9*tag_count)/(feature_length*sum(tag_count))) %>%
# mutate(tag_fpkm100=100*tag_fpkm)
countDataNorm %>%
sample_n(10000) %>%
ggplot(aes(tag_fpkm)) +
geom_histogram() +
scale_x_log10(labels=comma) +
ggtitle("fpkm dist overview")
#+ fig.width=20, fig.height=18
countDataNorm %>% ggplot(aes(tag_fpkm)) +
geom_histogram() +
facet_grid(feature_type ~ sample) +
scale_x_log10()
write.delim(countDataNorm, file="countDataNorm.txt")
# countDataNorm <- read.delim("countDataNorm.txt")
#' [countDataNorm](countDataNorm.txt)
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
## simple correlation
timeCor <- countDataNorm %>% ungroup() %>%
filter(feature_type=="tss_2kb") %>%
select(ensembl_gene_id, tag_fpkm, protein, timepoint) %>%
merge(., ., by=c("ensembl_gene_id", "protein")) %>%
filter(ac(timepoint.x)>ac(timepoint.y)) %>%
rename(sample_1=timepoint.x, sample_2=timepoint.y)
timeCorPlot <- timeCor %>% ggplot(aes(tag_fpkm.x, tag_fpkm.y)) +
geom_point(alpha=0.05) +
ggtitle("Timepoint Correlation") +
facet_grid(sample_1~protein+sample_2)+
# facet_wrap(timepoint+sample_1~sample_2)+
# scale_x_log10() + scale_y_log10()
scale_x_log10(labels=comma, limits=c(0.01, 10000)) + scale_y_log10(labels=comma, limits=c(0.01, 10000)) +
geom_abline(slope = 1, alpha=0.3, size=2, color="red") +
xlab("fpkm") + ylab("ylab")
ggsave2(timeCorPlot, outputFormat="pdf", width=12, height=10)
#
#corrPlot <- function(fpkmData){
## fpkmData <- mmusDiff
# hitCounts <- subset(fpkmData, isHit) %>%
# with(as.data.frame(table(sample_1, sample_2, sample_1_overex=log2_fold_change<0))) %>%
# subset(Freq>0) %>% mutate(sample_1_overex=as.logical(sample_1_overex))
#
# ggplot(fpkmData, aes(value_1, value_2, color=isHit)) +
# scale_color_manual(values=c("darkgrey", "red"))+
## scale_size_manual(values=c(0.4, 4)) +
# geom_point(alpha=0.6, size=2) +
## scale_x_log10(labels=comma) + scale_y_log10(labels=comma) +
# scale_x_log10(labels=comma, limits=c(0.01, 10000)) + scale_y_log10(labels=comma, limits=c(0.01, 10000)) +
# facet_grid(sample_1 ~ sample_2) +
# geom_text(aes(label=Freq), color="black", data=hitCounts %>% mutate(value_1=500., value_2=0.1), subset=.(sample_1_overex)) +
# geom_text(aes(label=Freq),color="black", data=hitCounts %>% mutate(value_1=0.1, value_2=500.), subset=.(!sample_1_overex)) +
# theme_bw() + xlab("") + ylab("") + guides(colour = guide_legend(override.aes = list(size=4)))
#}
#
#mmusFpkmCorr <- allDiff %>% arrange(isHit) %>% corrPlot() + ggtitle("Mouse FPKM Correlation") + facet_grid(; mmusFpkmCorr
#
#ggsave2(mmusFpkmCorr, outputFormat="png", width=9, height=7)