split up deseq report

102187c6 · Holger Brandl · 838fe7f8 · 102187c6 · 102187c6 · 102187c6
Commit 102187c6 authored 10 years ago by Holger Brandl
--- a/chipseq_workflow/cs_compare_regions.R
+++ b/chipseq_workflow/cs_compare_regions.R
+#!/usr/bin/env Rscript
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R")
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R")
+#devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/bio/diffex_commons.R")
+devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/bio/diffex_commons.R")
+require(tidyr)
+require(knitr)
+require.auto(digest)
+#' # Region Count Analysis
+## spin.R
+geneInfo <- quote({
+        mart <- biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl"))
+        c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>%
+            biomaRt::getBM(mart=mart)
+    }) %>%
+    cache_it()
+countData <- list.files(".", "region_counts.txt", f=T) %>% ldply(function(countFile){
+        read.delim(countFile, header=F) %>%
+            set_names(c("chromosome_name", "feature_start", "feature_end", "ensembl_gene_id", "score", "strand", "feature_type", "tag_count")) %>%
+            mutate(sample=basename(countFile) %>% str_split_fixed("[.]",2) %>% subset(select=1))
+    }) %>%
+    ## calcualte feature length for normalization
+    mutate(feature_length=feature_end-feature_start+1) %>%
+    ## discard unused columns
+    select(-c(score, strand, feature_start, feature_end)) %>%
+    ## extract time and protein
+    separate(sample, c("protein", "timepoint"), remove=F)
+write.delim(countData, file="countData.txt")
+# countData <- read.delim("countData.txt")
+#'  [countData](countData.txt)
+countData %>% head() %>% kable()
+#corMat <- countData %>% select(ensembl_gene_id, feature_type, tag_count, sample) %>%  dcast(ensembl_gene_id + feature_type ~ sample, value.var="tag_count")
+#require(GGally)
+#corMat %>% filter(feature_type=="tss_1kb") %>% select(-feature_type) %>% ggpairs()
+## does not work because of logarithmic scale
+## calculate fpkm normalized counts
+countDataNorm <- countData %>%
+    group_by(feature_type, protein, timepoint) %>%
+    mutate(tag_fpkm=(1E9*tag_count)/(feature_length*sum(tag_count))) %>%
+    mutate(tag_fpkm100=100*tag_fpkm)
+#+ fig.width=20, fig.height=18
+countDataNorm %>% ggplot(aes(tag_fpkm)) +
+    geom_histogram() +
+    facet_grid(feature_type ~ sample) +
+    scale_x_log10()
+write.delim(countDataNorm, file="countDataNorm.txt")
+# countDataNorm <- read.delim("countDataNorm.txt")
+#'  [countDataNorm](countDataNorm.txt)
--- a/chipseq_workflow/cs_region_dba.R
+++ b/chipseq_workflow/cs_region_dba.R
@@ -10,7 +10,6 @@ require(knitr)
 require.auto(digest)
 #' # Region Count Analysis
 ## spin.R
@@ -22,50 +21,6 @@ geneInfo <- quote({
    cache_it()
-countData <- list.files(".", "region_counts.txt", f=T) %>% ldply(function(countFile){
-        read.delim(countFile, header=F) %>%
-            set_names(c("chromosome_name", "feature_start", "feature_end", "ensembl_gene_id", "score", "strand", "feature_type", "tag_count")) %>%
-            mutate(sample=basename(countFile) %>% str_split_fixed("[.]",2) %>% subset(select=1))
-    }) %>%
-    ## calcualte feature length for normalization
-    mutate(feature_length=feature_end-feature_start+1) %>%
-    ## discard unused columns
-    select(-c(score, strand, feature_start, feature_end)) %>%
-    ## extract time and protein
-    separate(sample, c("protein", "timepoint"), remove=F)
-write.delim(countData, file="countData.txt")
-# countData <- read.delim("countData.txt")
-#'  [countData](countData.txt)
-countData %>% head() %>% kable()
-#corMat <- countData %>% select(ensembl_gene_id, feature_type, tag_count, sample) %>%  dcast(ensembl_gene_id + feature_type ~ sample, value.var="tag_count")
-#require(GGally)
-#corMat %>% filter(feature_type=="tss_1kb") %>% select(-feature_type) %>% ggpairs()
-## does not work because of logarithmic scale
-## calculate fpkm normalized counts
-countDataNorm <- countData %>%
-    group_by(feature_type, protein, timepoint) %>%
-    mutate(tag_fpkm=(1E9*tag_count)/(feature_length*sum(tag_count))) %>%
-    mutate(tag_fpkm100=100*tag_fpkm)
-#+ fig.width=20, fig.height=18
-countDataNorm %>% ggplot(aes(tag_fpkm)) +
-    geom_histogram() +
-    facet_grid(feature_type ~ sample) +
-    scale_x_log10()
-write.delim(countDataNorm, file="countDataNorm.txt")
-# countDataNorm <- read.delim("countDataNorm.txt")
-#'  [countDataNorm](countDataNorm.txt)
 ########################################################################################################################
 #' # Correlation Analysis

--- a/chipseq_workflow/peak_report.R
+++ b/chipseq_workflow/peak_report.R
+#!/usr/bin/env Rscript
+#+ echo=FALSE
 devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R")
 devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R")
@@ -5,7 +7,7 @@ devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v
 require(knitr)
 require.auto(tidyr)
 require(tidyr)
-if (!require("DT")) devtools::install_github("rstudio/DT") ## see http://rstudio.github.io/DT/
+#if (!require("DT")) devtools::install_github("rstudio/DT") ## see http://rstudio.github.io/DT/
 ## setwd("/projects/bioinfo/holger/projects/krause_chipseq/macs2")
@@ -76,15 +78,9 @@ peaks %>%
        ylim(0,100) +
        rotXlab()
-peaks %>%
-#    sample_frac(0.3) %>%
+# note: max: score ~ 10*qvalue
-    ggplot(aes(sample, peak_width)) +
-        geom_jitter(alpha=0.05, data=sample_frac(peaks, 0.1)) +
-        geom_violin(alpha=0.8) +
-        facet_wrap(~ peak_type)  +
-        # + geom_vline(yintercept=0, color="blue")
-        ggtitle("peak width distribution") +
-        ylim(0,1000)
 #peaks %>% group_by(sample, type) %>% summarize(num_peaks=n(), mean_width=median(end_position-start_position))
 peaks %>% ggplot(aes(sample, fill=peak_type)) + geom_bar(position="dodge") + ggtitle("peak counts")
@@ -94,9 +90,6 @@ peaks %>%  sample_frac(0.1) %>%
    group_by(sample) %>%
    arrange(-score) %>%
    mutate(x=row_number()/n()) %>%
-# note: max: score ~ 10*qvalue
 #    sample_frac(0.3) %>%
    ggplot(aes(x, qvalue, color=sample)) +
        geom_line() +
@@ -146,8 +139,7 @@ annotatePeaks <- function(somePeaks, subsample=min(nrow(peaks), 10000)){
        sample_n(subsample) %$%
        RangedData(ranges = IRanges(start_position, end_position), strand = ".", space = chromosome_name)
-    annotatePeakInBatch (peakRanges, AnnotationData = TSS.zebrafish.Zv9)
+    annotatePeakInBatch (peakRanges, AnnotationData = TSS.zebrafish.Zv9) %>% as.df()
-    annotatedPeak %>% as.df()
 }
 annoPeaks <- peaks %>% group_by(sample, peak_type) %>% do(annotatePeaks(.))
@@ -155,10 +147,15 @@ annoPeaks <- peaks %>% group_by(sample, peak_type) %>% do(annotatePeaks(.))
 save(annoPeaks, file="annoPeaks.RData")
 # annoPeaks <- local(get(load("annoPeaks.RData")))
-with(peaks, as.data.frame(table(sample, peak_type)))
-peaks %$% as.data.frame(table(sample, peak_type)) %>% head
-annoPeaks %>% head(20) %>% kable()
+#' CSA adds ovelap category, nearest gene id and distance to TSS
-annoPeaks %>% ungroup %$%
+#+ results='asis'
-    as.data.frame(table(sample, peak_type, insideFeature)) %>% head
+annoPeaks %>% ungroup() %>% sample_n(10) %>% kable()
-    ggplot(aes(sample, fill=insideFeature)) + geom_bar() + facet_wrap(~peak_type)
\ No newline at end of file
+#+
+#with(peaks, as.data.frame(table(sample, peak_type)))
+#peaks %$% as.data.frame(table(sample, peak_type))
+annoPeaks %>%
+#    as.data.frame(table(sample, peak_type, insideFeature)) %>%
+    ggplot(aes(sample, fill=insideFeature)) + geom_bar() + facet_wrap(~peak_type)