Skip to content
Snippets Groups Projects
Commit 102187c6 authored by Holger Brandl's avatar Holger Brandl
Browse files

split up deseq report

parent 838fe7f8
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env Rscript
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R")
#devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/bio/diffex_commons.R")
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/bio/diffex_commons.R")
require(tidyr)
require(knitr)
require.auto(digest)
#' # Region Count Analysis
## spin.R
geneInfo <- quote({
mart <- biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl"))
c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>%
biomaRt::getBM(mart=mart)
}) %>%
cache_it()
countData <- list.files(".", "region_counts.txt", f=T) %>% ldply(function(countFile){
read.delim(countFile, header=F) %>%
set_names(c("chromosome_name", "feature_start", "feature_end", "ensembl_gene_id", "score", "strand", "feature_type", "tag_count")) %>%
mutate(sample=basename(countFile) %>% str_split_fixed("[.]",2) %>% subset(select=1))
}) %>%
## calcualte feature length for normalization
mutate(feature_length=feature_end-feature_start+1) %>%
## discard unused columns
select(-c(score, strand, feature_start, feature_end)) %>%
## extract time and protein
separate(sample, c("protein", "timepoint"), remove=F)
write.delim(countData, file="countData.txt")
# countData <- read.delim("countData.txt")
#' [countData](countData.txt)
countData %>% head() %>% kable()
#corMat <- countData %>% select(ensembl_gene_id, feature_type, tag_count, sample) %>% dcast(ensembl_gene_id + feature_type ~ sample, value.var="tag_count")
#require(GGally)
#corMat %>% filter(feature_type=="tss_1kb") %>% select(-feature_type) %>% ggpairs()
## does not work because of logarithmic scale
## calculate fpkm normalized counts
countDataNorm <- countData %>%
group_by(feature_type, protein, timepoint) %>%
mutate(tag_fpkm=(1E9*tag_count)/(feature_length*sum(tag_count))) %>%
mutate(tag_fpkm100=100*tag_fpkm)
#+ fig.width=20, fig.height=18
countDataNorm %>% ggplot(aes(tag_fpkm)) +
geom_histogram() +
facet_grid(feature_type ~ sample) +
scale_x_log10()
write.delim(countDataNorm, file="countDataNorm.txt")
# countDataNorm <- read.delim("countDataNorm.txt")
#' [countDataNorm](countDataNorm.txt)
...@@ -10,7 +10,6 @@ require(knitr) ...@@ -10,7 +10,6 @@ require(knitr)
require.auto(digest) require.auto(digest)
#' # Region Count Analysis #' # Region Count Analysis
## spin.R ## spin.R
...@@ -22,50 +21,6 @@ geneInfo <- quote({ ...@@ -22,50 +21,6 @@ geneInfo <- quote({
cache_it() cache_it()
countData <- list.files(".", "region_counts.txt", f=T) %>% ldply(function(countFile){
read.delim(countFile, header=F) %>%
set_names(c("chromosome_name", "feature_start", "feature_end", "ensembl_gene_id", "score", "strand", "feature_type", "tag_count")) %>%
mutate(sample=basename(countFile) %>% str_split_fixed("[.]",2) %>% subset(select=1))
}) %>%
## calcualte feature length for normalization
mutate(feature_length=feature_end-feature_start+1) %>%
## discard unused columns
select(-c(score, strand, feature_start, feature_end)) %>%
## extract time and protein
separate(sample, c("protein", "timepoint"), remove=F)
write.delim(countData, file="countData.txt")
# countData <- read.delim("countData.txt")
#' [countData](countData.txt)
countData %>% head() %>% kable()
#corMat <- countData %>% select(ensembl_gene_id, feature_type, tag_count, sample) %>% dcast(ensembl_gene_id + feature_type ~ sample, value.var="tag_count")
#require(GGally)
#corMat %>% filter(feature_type=="tss_1kb") %>% select(-feature_type) %>% ggpairs()
## does not work because of logarithmic scale
## calculate fpkm normalized counts
countDataNorm <- countData %>%
group_by(feature_type, protein, timepoint) %>%
mutate(tag_fpkm=(1E9*tag_count)/(feature_length*sum(tag_count))) %>%
mutate(tag_fpkm100=100*tag_fpkm)
#+ fig.width=20, fig.height=18
countDataNorm %>% ggplot(aes(tag_fpkm)) +
geom_histogram() +
facet_grid(feature_type ~ sample) +
scale_x_log10()
write.delim(countDataNorm, file="countDataNorm.txt")
# countDataNorm <- read.delim("countDataNorm.txt")
#' [countDataNorm](countDataNorm.txt)
######################################################################################################################## ########################################################################################################################
#' # Correlation Analysis #' # Correlation Analysis
......
#!/usr/bin/env Rscript
#+ echo=FALSE
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R") devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R") devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R")
...@@ -5,7 +7,7 @@ devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v ...@@ -5,7 +7,7 @@ devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v
require(knitr) require(knitr)
require.auto(tidyr) require.auto(tidyr)
require(tidyr) require(tidyr)
if (!require("DT")) devtools::install_github("rstudio/DT") ## see http://rstudio.github.io/DT/ #if (!require("DT")) devtools::install_github("rstudio/DT") ## see http://rstudio.github.io/DT/
## setwd("/projects/bioinfo/holger/projects/krause_chipseq/macs2") ## setwd("/projects/bioinfo/holger/projects/krause_chipseq/macs2")
...@@ -76,15 +78,9 @@ peaks %>% ...@@ -76,15 +78,9 @@ peaks %>%
ylim(0,100) + ylim(0,100) +
rotXlab() rotXlab()
peaks %>%
# sample_frac(0.3) %>% # note: max: score ~ 10*qvalue
ggplot(aes(sample, peak_width)) +
geom_jitter(alpha=0.05, data=sample_frac(peaks, 0.1)) +
geom_violin(alpha=0.8) +
facet_wrap(~ peak_type) +
# + geom_vline(yintercept=0, color="blue")
ggtitle("peak width distribution") +
ylim(0,1000)
#peaks %>% group_by(sample, type) %>% summarize(num_peaks=n(), mean_width=median(end_position-start_position)) #peaks %>% group_by(sample, type) %>% summarize(num_peaks=n(), mean_width=median(end_position-start_position))
peaks %>% ggplot(aes(sample, fill=peak_type)) + geom_bar(position="dodge") + ggtitle("peak counts") peaks %>% ggplot(aes(sample, fill=peak_type)) + geom_bar(position="dodge") + ggtitle("peak counts")
...@@ -94,9 +90,6 @@ peaks %>% sample_frac(0.1) %>% ...@@ -94,9 +90,6 @@ peaks %>% sample_frac(0.1) %>%
group_by(sample) %>% group_by(sample) %>%
arrange(-score) %>% arrange(-score) %>%
mutate(x=row_number()/n()) %>% mutate(x=row_number()/n()) %>%
# note: max: score ~ 10*qvalue
# sample_frac(0.3) %>% # sample_frac(0.3) %>%
ggplot(aes(x, qvalue, color=sample)) + ggplot(aes(x, qvalue, color=sample)) +
geom_line() + geom_line() +
...@@ -146,8 +139,7 @@ annotatePeaks <- function(somePeaks, subsample=min(nrow(peaks), 10000)){ ...@@ -146,8 +139,7 @@ annotatePeaks <- function(somePeaks, subsample=min(nrow(peaks), 10000)){
sample_n(subsample) %$% sample_n(subsample) %$%
RangedData(ranges = IRanges(start_position, end_position), strand = ".", space = chromosome_name) RangedData(ranges = IRanges(start_position, end_position), strand = ".", space = chromosome_name)
annotatePeakInBatch (peakRanges, AnnotationData = TSS.zebrafish.Zv9) annotatePeakInBatch (peakRanges, AnnotationData = TSS.zebrafish.Zv9) %>% as.df()
annotatedPeak %>% as.df()
} }
annoPeaks <- peaks %>% group_by(sample, peak_type) %>% do(annotatePeaks(.)) annoPeaks <- peaks %>% group_by(sample, peak_type) %>% do(annotatePeaks(.))
...@@ -155,10 +147,15 @@ annoPeaks <- peaks %>% group_by(sample, peak_type) %>% do(annotatePeaks(.)) ...@@ -155,10 +147,15 @@ annoPeaks <- peaks %>% group_by(sample, peak_type) %>% do(annotatePeaks(.))
save(annoPeaks, file="annoPeaks.RData") save(annoPeaks, file="annoPeaks.RData")
# annoPeaks <- local(get(load("annoPeaks.RData"))) # annoPeaks <- local(get(load("annoPeaks.RData")))
with(peaks, as.data.frame(table(sample, peak_type)))
peaks %$% as.data.frame(table(sample, peak_type)) %>% head
annoPeaks %>% head(20) %>% kable() #' CSA adds ovelap category, nearest gene id and distance to TSS
annoPeaks %>% ungroup %$% #+ results='asis'
as.data.frame(table(sample, peak_type, insideFeature)) %>% head annoPeaks %>% ungroup() %>% sample_n(10) %>% kable()
ggplot(aes(sample, fill=insideFeature)) + geom_bar() + facet_wrap(~peak_type)
\ No newline at end of file #+
#with(peaks, as.data.frame(table(sample, peak_type)))
#peaks %$% as.data.frame(table(sample, peak_type))
annoPeaks %>%
# as.data.frame(table(sample, peak_type, insideFeature)) %>%
ggplot(aes(sample, fill=insideFeature)) + geom_bar() + facet_wrap(~peak_type)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment