Commit a81401bd authored by Holger Brandl's avatar Holger Brandl

fixed peak_report.R for mouse

parent 76c5dd98
#!/usr/bin/env Rscript
#+ echo=FALSE
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.22/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.22/R/ggplot_commons.R")
#+ include=FALSE
suppressMessages(require(docopt))
doc = '
Annotate peaks and create summary report
Usage: peak_report.R [options] <anno_db>
Options:
--subsample <name_prefix> Name to prefix all generated result files [default: ]
'
opts = docopt(doc, commandArgs(TRUE))
# devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.50/R/core_commons.R")
devtools::source_url("https://dl.dropboxusercontent.com/s/rwnpjpfz0aa9zg0/core_commons.R?dl=0")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.50/R/ggplot_commons.R")
##todo port back into core_commons
# set_names <- purrr::set_names
load_pack(knitr)
# devtools::source_url("https://git.mpi-cbg.de/bioinfo/ngs_tools/raw/v6/dge_workflow/diffex_commons.R")
## process input arguments
anno_db = opts$anno_db
assert(anno_db %in% c("TSS.zebrafish.Zv9", "TSS.mouse.GRCm38"), "invalid annotation descriptor, see https://www.bioconductor.org/packages/devel/bioc/manuals/ChIPpeakAnno/man/ChIPpeakAnno.pdf")
require_auto(knitr)
require_auto(tidyr)
#if (!require("DT")) devtools::install_github("rstudio/DT") ## see http://rstudio.github.io/DT/
## setwd("/projects/bioinfo/holger/projects/krause_chipseq/macs2")
########################################################################################################################
#' # Called Peaks Overview
......@@ -18,145 +42,185 @@ require_auto(tidyr)
## https://github.com/taoliu/MACS/
readMacsPeaks <- function(peakFile){
data.table::fread(peakFile, header=F) %>% as.df() %>% mutate(file=basename(peakFile))
data.table::fread(peakFile, header = F) %>%
as_df() %>%
mutate(file = basename(peakFile))
}
#5th column: -10*log10pvalue of peak region" (see http://seqanswers.com/forums/showthread.php?t=18674)
## for spec see https://github.com/taoliu/MACS/#output-files
n_peaks <- list.files(".", "*.narrowPeak.*", full.names=TRUE, recursive=T) %>%
ldply(readMacsPeaks) %>%
set_names(c("chromosome_name", "start_position", "end_position", "peak_id", "score", "strand", "fold_change", "pvalue", "qvalue", "sum_rel_to_start", "file")) %>%
select(-sum_rel_to_start)
# list.files(".", "*.narrow.*", full.names=TRUE, recursive=T) %>% map(~ file.info(.)$size )
n_peaks <- list.files(".", "*.narrow.*", full.names = TRUE, recursive = T) %>%
purrr::discard(~ file.info(.)$size == 0) %>%
map_df(readMacsPeaks) %>%
set_names("chromosome_name", "start_position", "end_position", "peak_id", "score", "strand", "fold_change", "pvalue", "qvalue", "sum_rel_to_start", "file") %>%
select(- sum_rel_to_start)
peaks <- n_peaks
broadPeakFiles <- list.files(".", "*.broadPeak.*", full.names=TRUE, recursive=T)
if(length(broadPeakFiles)>0){
b_peaks <- list.files(".", "*.broadPeak.*", full.names=TRUE, recursive=T) %>%
ldply(readMacsPeaks) %>%
set_names(c("chromosome_name", "start_position", "end_position", "peak_id", "score", "strand", "fold_change", "pvalue", "qvalue", "file"))
peaks <- bind_rows(peaks, b_peaks)
}
# broadPeakFiles <- list.files(".", "*.broadPeak.*", full.names=TRUE, recursive=T)
# if(length(broadPeakFiles)>0){
# b_peaks <- list.files(".", "*.broadPeak.*", full.names=TRUE, recursive=T) %>%
# ldply(readMacsPeaks) %>%
# set_names(c("chromosome_name", "start_position", "end_position", "peak_id", "score", "strand", "fold_change", "pvalue", "qvalue", "file"))
#
# peaks <- bind_rows(peaks, b_peaks)
# }
## todo remove this later
#peaks <- mutate(peaks, file=file %>%str_replace( "_narrow", ".narrow") %>% str_replace("_broad", ".broad"))
peaks %>% count(file) %>% kable()
peaks %<>% separate(file, c("sample", "peak_type", "ext"), sep="[.]") %>% select(-ext)
peaks %<>% separate(file, c("sample", "peak_type"), sep = "[.]") %>% mutate_inplace(sample, str_replace_all("_peaks", ""))
#splitProtTime <- . %>% separate(sample, c("protein", "timpoint"), sep="_", remove=F)
#peaks %>% splitProtTime %>% head
peaks %<>% mutate(peak_width=end_position-start_position)
peaks %<>% mutate(peak_width = end_position - start_position)
save(peaks, file="peaks.RData")
save(peaks, file = "peaks.RData")
# peaks <- local(get(load("peaks.RData")))
peaks %>%
# sample_frac(0.3) %>%
ggplot(aes(peak_width)) +
geom_histogram() +
facet_grid(sample ~ peak_type) +
# + geom_vline(xintercept=0, color="blue")
ggtitle("peak width distribution") +
xlim(0,1000)
ggplot(aes(peak_width)) +
geom_histogram() +
facet_grid(sample ~ peak_type) +
# + geom_vline(xintercept=0, color="blue")
ggtitle("peak width distribution") +
xlim(0, 1000)
peaks %>%
# sample_frac(0.3) %>%
ggplot(aes(sample, peak_width)) +
geom_jitter(alpha=0.05, data=sample_frac(peaks, 0.1)) +
geom_violin(alpha=0.8) +
facet_wrap(~ peak_type) +
# + geom_vline(xintercept=0, color="blue")
ggtitle("peak width distribution") +
ylim(0,1000)
ggplot(aes(sample, peak_width)) +
geom_jitter(alpha = 0.05, data = sample_frac(peaks, 0.1)) +
geom_violin(alpha = 0.8) +
facet_wrap(~ peak_type) +
# + geom_vline(xintercept=0, color="blue")
ggtitle("peak width distribution") +
ylim(0, 1000)
peaks %>%
# sample_frac(0.3) %>%
ggplot(aes(sample, score)) +
geom_jitter(alpha=0.05, data=sample_frac(peaks, 0.1)) +
geom_violin(alpha=0.8) +
facet_wrap(~ peak_type) +
# + geom_vline(xintercept=0, color="blue")
ggtitle("peak score distribution") +
ylim(0,100) +
rot_x_lab()
ggplot(aes(sample, score)) +
geom_jitter(alpha = 0.4, data = sample_frac(peaks, 0.5)) +
geom_violin(alpha = 0.8) +
facet_wrap(~ peak_type) +
# + geom_vline(xintercept=0, color="blue")
ggtitle("peak score distribution") +
ylim(0, 100) +
rot_x_lab()
# note: max: score ~ 10*qvalue
#peaks %>% group_by(sample, type) %>% summarize(num_peaks=n(), mean_width=median(end_position-start_position))
peaks %>% ggplot(aes(sample, fill=peak_type)) + geom_bar(position="dodge") + ggtitle("peak counts") + rot_x_lab()
peaks %>% ggplot(aes(sample, fill = peak_type)) +
geom_bar(position = "dodge") +
ggtitle("peak counts") +
rot_x_lab()
peaks %>% sample_frac(0.1) %>%
peaks %>%
sample_frac(0.1) %>%
group_by(sample) %>%
arrange(-score) %>%
mutate(x=row_number()/n()) %>%
arrange(- score) %>%
mutate(x = row_number() / n()) %>%
# sample_frac(0.3) %>%
ggplot(aes(x, qvalue, color=sample)) +
geom_line() +
facet_wrap(~ peak_type) +
xlim(0, 0.2) + ylim(0,50) +
rot_x_lab()
ggplot(aes(x, qvalue, color = sample)) +
geom_line() +
facet_wrap(~ peak_type) +
xlim(0, 0.2) +
ylim(0, 50) +
rot_x_lab()
########################################################################################################################
#' # ChippeakAnno
# see [docs](http://www.bioconductor.org/packages/release/bioc/vignettes/ChIPpeakAnno/inst/doc/ChIPpeakAnno.pdf)
require_auto(ChIPpeakAnno)
data(TSS.zebrafish.Zv9)
## too many conflicts
unloadNamespace("conflicted")
#+ eval=FALSE, echo=FALSE
## DEBUG start
if(F){
#http://www.bioconductor.org/packages/release/bioc/vignettes/ChIPpeakAnno/inst/doc/ChIPpeakAnno.pdf
#require_auto(ChIPpeakAnno)
#data(TSS.zebrafish.Zv9)
# load_pack(ChIPpeakAnno)
require(ChIPpeakAnno)
## https://www.biostars.org/p/115101/
## http://master.bioconductor.org/help/course-materials/2009/SeattleNov09/IRanges/IRangesOverview.R
## todo
# data(TSS.zebrafish.Zv9)
# data(TSS.mouse.GRCm38)
peakSet <- peaks %>% sample_n(10000) %>% filter(sample=="H3HA_Dome", peak_type=="narrow_peaks")
peakRanges <-with(peakSet, RangedData(ranges = IRanges(start_position, end_position), strand = ".", space = chromosome_name))
#+ eval=FALSE, echo=FALSE
## DEBUG start
if (F) {
# https://www.bioconductor.org/packages/devel/bioc/manuals/ChIPpeakAnno/man/ChIPpeakAnno.pdf
#load_pack(ChIPpeakAnno)
#data(TSS.zebrafish.Zv9)
## https://www.biostars.org/p/115101/
## http://master.bioconductor.org/help/course-materials/2009/SeattleNov09/IRanges/IRangesOverview.R
peakSet <- peaks %>%
sample_n(10000) %>%
filter(sample == "H3HA_Dome", peak_type == "narrow_peaks")
peakRanges <- with(peakSet, RangedData(ranges = IRanges(start_position, end_position), strand = ".", space = chromosome_name))
### other feature types
#Anno_exon = getAnnotation(mart, featureType=c("Exon"))
#Anno_TSS = getAnnotation(mart, featureType=c("TSS"))
#Anno_miRNA = getAnnotation(mart, featureType=c("miRNA"))
#annotatedPeak_exon = annotatePeakInBatch(pr, AnnotationData=Anno_exon)
annotatedPeak = annotatePeakInBatch (peakRanges, AnnotationData = TSS.zebrafish.Zv9)
annotatedPeak %>%
as.df() %>%
ggplot(aes(insideFeature)) + geom_bar()
}
## DEBUG end
session::save.session(".tmp_peak_report.dat");
# session::restore.session(".tmp_peak_report.dat");
### other feature types
#Anno_exon = getAnnotation(mart, featureType=c("Exon"))
#Anno_TSS = getAnnotation(mart, featureType=c("TSS"))
#Anno_miRNA = getAnnotation(mart, featureType=c("miRNA"))
#annotatedPeak_exon = annotatePeakInBatch(pr, AnnotationData=Anno_exon)
annotatedPeak = annotatePeakInBatch (peakRanges, AnnotationData = TSS.zebrafish.Zv9)
annotatedPeak %>% as.df()%>% ggplot(aes(insideFeature)) + geom_bar()
#+
# require(IRanges)
# nrow = base::nrow
# nrow = base::with
# slice = dplyr::slice
}
## DEBUG end
## toto make generic
# assert(eval(substitute(require(pkg), list(pkg = as.name(genome)))), glue::glue("could not load {genome}"))
eval(substitute(data(pkg), list(pkg = as.name(anno_db))))
# slotNames(annotData)
#+
require(ChIPpeakAnno)
data(TSS.zebrafish.Zv9)
# data(TSS.mouse.GRCm38)
# slotNames(TSS.mouse.GRCm38)
# annotData = TSS.mouse.GRCm38
# annotData = TSS.zebrafish.Zv9
# annotData = eval(substitute(pkg, list(pkg = as.name(genome))))
## apply csa to all sets
annotatePeaks <- function(somePeaks, subsample=min(nrow(peaks), 10000)){
annotatePeaks <- function(somePeaks, subsample=min(nrow(somePeaks), 10000)){
peakRanges <- somePeaks %>%
sample_n(subsample) %$%
RangedData(ranges = IRanges(start_position, end_position), strand = ".", space = chromosome_name)
sample_n(subsample) %$%
RangedData(ranges = IRanges(start_position, end_position), strand = ".", space = chromosome_name)
annotatePeakInBatch (peakRanges, AnnotationData = TSS.zebrafish.Zv9) %>% as.df()
## todo how to abstract from species here
# annotatePeakInBatch (peakRanges, AnnotationData = substitute(pkg, list(pkg = as.name(anno_db)))) %>% as_df()
# annotatePeakInBatch (peakRanges, AnnotationData = as.name(anno_db)) %>% as_df()
annotatePeakInBatch (peakRanges, AnnotationData = TSS.mouse.GRCm38) %>% as_df()
}
# somePeaks =peaks %>% group_by(sample, peak_type) %>% first_group()
annoPeaks <- peaks %>%
group_by(sample, peak_type) %>%
do(annotatePeaks(.))
annoPeaks <- peaks %>% group_by(sample, peak_type) %>% do(annotatePeaks(.))
save(annoPeaks, file="annoPeaks.RData")
save(annoPeaks, file = "annoPeaks.RData")
# annoPeaks <- local(get(load("annoPeaks.RData")))
## todo merge
......@@ -165,12 +229,24 @@ save(annoPeaks, file="annoPeaks.RData")
#' CSA adds ovelap category, nearest gene id and distance to TSS
#+ results='asis'
annoPeaks %>% ungroup() %>% sample_n(10) %>% kable()
annoPeaks %>%
ungroup() %>%
sample_n(10) %>%
kable()
#+
#with(peaks, as.data.frame(table(sample, peak_type)))
#peaks %$% as.data.frame(table(sample, peak_type))
annoPeaks %>%
# as.data.frame(table(sample, peak_type, insideFeature)) %>%
ggplot(aes(sample, fill=insideFeature)) + geom_bar() + facet_wrap(~peak_type) + rot_x_lab()
annoPeaks %>% ggplot(aes(sample, fill = insideFeature)) +
geom_bar() +
facet_wrap(~ peak_type) +
rot_x_lab()
write_tsv(annoPeaks, path="annoPeaks.txt")
# annoPeaks <- read_tsv("annoPeaks.txt")
#' [annoPeaks](annoPeaks.txt)
session::save.session(".peak_report.dat");
# session::restore.session(".peak_report.dat");
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment