Something went wrong on our end
-
Holger Brandl authoredHolger Brandl authored
peak_report.R 5.45 KiB
#!/usr/bin/env Rscript
#+ echo=FALSE
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R")
require(knitr)
require.auto(tidyr)
require(tidyr)
#if (!require("DT")) devtools::install_github("rstudio/DT") ## see http://rstudio.github.io/DT/
## setwd("/projects/bioinfo/holger/projects/krause_chipseq/macs2")
########################################################################################################################
#' # Called Peaks Overview
## https://github.com/taoliu/MACS/
readMacsPeaks <- function(peakFile){
data.table::fread(peakFile, header=F) %>% as.df() %>% mutate(file=basename(peakFile))
}
#5th column: -10*log10pvalue of peak region" (see http://seqanswers.com/forums/showthread.php?t=18674)
## for spec see https://github.com/taoliu/MACS/#output-files
n_peaks <- list.files(".", "*.narrowPeak.*", full.names=TRUE, recursive=T) %>%
ldply(readMacsPeaks) %>%
set_names(c("chromosome_name", "start_position", "end_position", "peak_id", "score", "strand", "fold_change", "pvalue", "qvalue", "sum_rel_to_start", "file")) %>%
select(-sum_rel_to_start)
b_peaks <- list.files(".", "*.broadPeak.*", full.names=TRUE, recursive=T) %>%
ldply(readMacsPeaks) %>%
set_names(c("chromosome_name", "start_position", "end_position", "peak_id", "score", "strand", "fold_change", "pvalue", "qvalue", "file"))
peaks <- rbind_list(n_peaks, b_peaks)
peaks %<>% separate(file, c("sample", "peak_type", "ext"), sep="[.]") %>% select(-ext)
splitProtTime <- . %>% separate(sample, c("protein", "timpoint"), sep="_", remove=F)
#peaks %>% splitProtTime %>% head
peaks %<>% mutate(peak_width=end_position-start_position)
save(peaks, file="peaks.RData")
# peaks <- local(get(load("peaks.RData")))
peaks %>%
# sample_frac(0.3) %>%
ggplot(aes(peak_width)) +
geom_histogram() +
facet_grid(sample ~ peak_type) +
# + geom_vline(yintercept=0, color="blue")
ggtitle("peak width distribution") +
xlim(0,1000)
peaks %>%
# sample_frac(0.3) %>%
ggplot(aes(sample, peak_width)) +
geom_jitter(alpha=0.05, data=sample_frac(peaks, 0.1)) +
geom_violin(alpha=0.8) +
facet_wrap(~ peak_type) +
# + geom_vline(yintercept=0, color="blue")
ggtitle("peak width distribution") +
ylim(0,1000) +
rotXlab()
peaks %>%
# sample_frac(0.3) %>%
ggplot(aes(sample, score)) +
geom_jitter(alpha=0.05, data=sample_frac(peaks, 0.1)) +
geom_violin(alpha=0.8) +
facet_wrap(~ peak_type) +
# + geom_vline(yintercept=0, color="blue")
ggtitle("peak score distribution") +
ylim(0,100) +
rotXlab()
# note: max: score ~ 10*qvalue
#peaks %>% group_by(sample, type) %>% summarize(num_peaks=n(), mean_width=median(end_position-start_position))
peaks %>% ggplot(aes(sample, fill=peak_type)) + geom_bar(position="dodge") + ggtitle("peak counts")
peaks %>% sample_frac(0.1) %>%
group_by(sample) %>%
arrange(-score) %>%
mutate(x=row_number()/n()) %>%
# sample_frac(0.3) %>%
ggplot(aes(x, qvalue, color=sample)) +
geom_line() +
facet_wrap(~ peak_type) +
xlim(0, 0.2) + ylim(0,50)
########################################################################################################################
#' # ChippeakAnno
# see [docs](http://www.bioconductor.org/packages/release/bioc/vignettes/ChIPpeakAnno/inst/doc/ChIPpeakAnno.pdf)
#+ eval=FALSE, echo=FALSE
## DEBUG start
if(F){
#http://www.bioconductor.org/packages/release/bioc/vignettes/ChIPpeakAnno/inst/doc/ChIPpeakAnno.pdf
require(ChIPpeakAnno)
data(TSS.zebrafish.Zv9)
## https://www.biostars.org/p/115101/
## http://master.bioconductor.org/help/course-materials/2009/SeattleNov09/IRanges/IRangesOverview.R
peakSet <- peaks %>% sample_n(10000) %>% filter(sample=="H3HA_Dome", peak_type=="narrow_peaks")
peakRanges <-with(peakSet, RangedData(ranges = IRanges(start_position, end_position), strand = ".", space = chromosome_name))
### other feature types
#Anno_exon = getAnnotation(mart, featureType=c("Exon"))
#Anno_TSS = getAnnotation(mart, featureType=c("TSS"))
#Anno_miRNA = getAnnotation(mart, featureType=c("miRNA"))
#annotatedPeak_exon = annotatePeakInBatch(pr, AnnotationData=Anno_exon)
annotatedPeak = annotatePeakInBatch (peakRanges, AnnotationData = TSS.zebrafish.Zv9)
annotatedPeak %>% as.df()%>% ggplot(aes(insideFeature)) + geom_bar()
}
## DEBUG end
#+
require(ChIPpeakAnno)
data(TSS.zebrafish.Zv9)
## apply csa to all sets
annotatePeaks <- function(somePeaks, subsample=min(nrow(peaks), 10000)){
peakRanges <- somePeaks %>%
sample_n(subsample) %$%
RangedData(ranges = IRanges(start_position, end_position), strand = ".", space = chromosome_name)
annotatePeakInBatch (peakRanges, AnnotationData = TSS.zebrafish.Zv9) %>% as.df()
}
annoPeaks <- peaks %>% group_by(sample, peak_type) %>% do(annotatePeaks(.))
save(annoPeaks, file="annoPeaks.RData")
# annoPeaks <- local(get(load("annoPeaks.RData")))
#' CSA adds ovelap category, nearest gene id and distance to TSS
#+ results='asis'
annoPeaks %>% ungroup() %>% sample_n(10) %>% kable()
#+
#with(peaks, as.data.frame(table(sample, peak_type)))
#peaks %$% as.data.frame(table(sample, peak_type))
annoPeaks %>%
# as.data.frame(table(sample, peak_type, insideFeature)) %>%
ggplot(aes(sample, fill=insideFeature)) + geom_bar() + facet_wrap(~peak_type)