#!/usr/bin/env Rscript #+ echo=FALSE devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R") devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R") require(knitr) require.auto(tidyr) require(tidyr) #if (!require("DT")) devtools::install_github("rstudio/DT") ## see http://rstudio.github.io/DT/ ## setwd("/projects/bioinfo/holger/projects/krause_chipseq/macs2") ######################################################################################################################## #' # Called Peaks Overview ## https://github.com/taoliu/MACS/ readMacsPeaks <- function(peakFile){ data.table::fread(peakFile, header=F) %>% as.df() %>% mutate(file=basename(peakFile)) } #5th column: -10*log10pvalue of peak region" (see http://seqanswers.com/forums/showthread.php?t=18674) ## for spec see https://github.com/taoliu/MACS/#output-files n_peaks <- list.files(".", "*.narrowPeak.*", full.names=TRUE, recursive=T) %>% ldply(readMacsPeaks) %>% set_names(c("chromosome_name", "start_position", "end_position", "peak_id", "score", "strand", "fold_change", "pvalue", "qvalue", "sum_rel_to_start", "file")) %>% select(-sum_rel_to_start) b_peaks <- list.files(".", "*.broadPeak.*", full.names=TRUE, recursive=T) %>% ldply(readMacsPeaks) %>% set_names(c("chromosome_name", "start_position", "end_position", "peak_id", "score", "strand", "fold_change", "pvalue", "qvalue", "file")) peaks <- rbind_list(n_peaks, b_peaks) peaks %<>% separate(file, c("sample", "peak_type", "ext"), sep="[.]") %>% select(-ext) splitProtTime <- . %>% separate(sample, c("protein", "timpoint"), sep="_", remove=F) #peaks %>% splitProtTime %>% head peaks %<>% mutate(peak_width=end_position-start_position) save(peaks, file="peaks.RData") # peaks <- local(get(load("peaks.RData"))) peaks %>% # sample_frac(0.3) %>% ggplot(aes(peak_width)) + geom_histogram() + facet_grid(sample ~ peak_type) + # + geom_vline(yintercept=0, color="blue") ggtitle("peak width distribution") + xlim(0,1000) peaks %>% # sample_frac(0.3) %>% ggplot(aes(sample, peak_width)) + geom_jitter(alpha=0.05, data=sample_frac(peaks, 0.1)) + geom_violin(alpha=0.8) + facet_wrap(~ peak_type) + # + geom_vline(yintercept=0, color="blue") ggtitle("peak width distribution") + ylim(0,1000) + rotXlab() peaks %>% # sample_frac(0.3) %>% ggplot(aes(sample, score)) + geom_jitter(alpha=0.05, data=sample_frac(peaks, 0.1)) + geom_violin(alpha=0.8) + facet_wrap(~ peak_type) + # + geom_vline(yintercept=0, color="blue") ggtitle("peak score distribution") + ylim(0,100) + rotXlab() # note: max: score ~ 10*qvalue #peaks %>% group_by(sample, type) %>% summarize(num_peaks=n(), mean_width=median(end_position-start_position)) peaks %>% ggplot(aes(sample, fill=peak_type)) + geom_bar(position="dodge") + ggtitle("peak counts") peaks %>% sample_frac(0.1) %>% group_by(sample) %>% arrange(-score) %>% mutate(x=row_number()/n()) %>% # sample_frac(0.3) %>% ggplot(aes(x, qvalue, color=sample)) + geom_line() + facet_wrap(~ peak_type) + xlim(0, 0.2) + ylim(0,50) ######################################################################################################################## #' # ChippeakAnno # see [docs](http://www.bioconductor.org/packages/release/bioc/vignettes/ChIPpeakAnno/inst/doc/ChIPpeakAnno.pdf) #+ eval=FALSE, echo=FALSE ## DEBUG start if(F){ #http://www.bioconductor.org/packages/release/bioc/vignettes/ChIPpeakAnno/inst/doc/ChIPpeakAnno.pdf require(ChIPpeakAnno) data(TSS.zebrafish.Zv9) ## https://www.biostars.org/p/115101/ ## http://master.bioconductor.org/help/course-materials/2009/SeattleNov09/IRanges/IRangesOverview.R peakSet <- peaks %>% sample_n(10000) %>% filter(sample=="H3HA_Dome", peak_type=="narrow_peaks") peakRanges <-with(peakSet, RangedData(ranges = IRanges(start_position, end_position), strand = ".", space = chromosome_name)) ### other feature types #Anno_exon = getAnnotation(mart, featureType=c("Exon")) #Anno_TSS = getAnnotation(mart, featureType=c("TSS")) #Anno_miRNA = getAnnotation(mart, featureType=c("miRNA")) #annotatedPeak_exon = annotatePeakInBatch(pr, AnnotationData=Anno_exon) annotatedPeak = annotatePeakInBatch (peakRanges, AnnotationData = TSS.zebrafish.Zv9) annotatedPeak %>% as.df()%>% ggplot(aes(insideFeature)) + geom_bar() } ## DEBUG end #+ require(ChIPpeakAnno) data(TSS.zebrafish.Zv9) ## apply csa to all sets annotatePeaks <- function(somePeaks, subsample=min(nrow(peaks), 10000)){ peakRanges <- somePeaks %>% sample_n(subsample) %$% RangedData(ranges = IRanges(start_position, end_position), strand = ".", space = chromosome_name) annotatePeakInBatch (peakRanges, AnnotationData = TSS.zebrafish.Zv9) %>% as.df() } annoPeaks <- peaks %>% group_by(sample, peak_type) %>% do(annotatePeaks(.)) save(annoPeaks, file="annoPeaks.RData") # annoPeaks <- local(get(load("annoPeaks.RData"))) #' CSA adds ovelap category, nearest gene id and distance to TSS #+ results='asis' annoPeaks %>% ungroup() %>% sample_n(10) %>% kable() #+ #with(peaks, as.data.frame(table(sample, peak_type))) #peaks %$% as.data.frame(table(sample, peak_type)) annoPeaks %>% # as.data.frame(table(sample, peak_type, insideFeature)) %>% ggplot(aes(sample, fill=insideFeature)) + geom_bar() + facet_wrap(~peak_type)