cont cs analysis

2109980a · Holger Brandl · 4943ea05 · 2109980a · 2109980a
Commit 2109980a authored 10 years ago by Holger Brandl
--- a/chipseq_workflow/cs_region_dba.R
+++ b/chipseq_workflow/cs_region_dba.R
+#!/usr/bin/env Rscript
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R")
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R")
+#devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/bio/diffex_commons.R")
+devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/bio/diffex_commons.R")
+require(tidyr)
+require(knitr)
+require.auto(digest)
+#' # Region Count Analysis
+## spin.R
+geneInfo <- quote({
+        mart <- biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl"))
+        c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>%
+            biomaRt::getBM(mart=mart)
+    }) %>%
+    cache_it()
+countData <- list.files(".", "region_counts.txt", f=T) %>% ldply(function(countFile){
+        read.delim(countFile, header=F) %>%
+            set_names(c("chromosome_name", "feature_start", "feature_end", "ensembl_gene_id", "score", "strand", "feature_type", "tag_count")) %>%
+            mutate(sample=basename(countFile) %>% str_split_fixed("[.]",2) %>% subset(select=1))
+    }) %>%
+    ## calcualte feature length for normalization
+    mutate(feature_length=feature_end-feature_start+1) %>%
+    ## discard unused columns
+    select(-c(score, strand, feature_start, feature_end)) %>%
+    ## extract time and protein
+    separate(sample, c("protein", "timepoint"), remove=F)
+write.delim(countData, file="countData.txt")
+# countData <- read.delim("countData.txt")
+#'  [countData](countData.txt)
+countData %>% head() %>% kable()
+#corMat <- countData %>% select(ensembl_gene_id, feature_type, tag_count, sample) %>%  dcast(ensembl_gene_id + feature_type ~ sample, value.var="tag_count")
+#require(GGally)
+#corMat %>% filter(feature_type=="tss_1kb") %>% select(-feature_type) %>% ggpairs()
+## does not work because of logarithmic scale
+## calculate fpkm normalized counts
+countDataNorm <- countData %>%
+    group_by(feature_type, protein, timepoint) %>%
+    mutate(tag_fpkm=(1E9*tag_count)/(feature_length*sum(tag_count))) %>%
+    mutate(tag_fpkm100=100*tag_fpkm)
+#+ fig.width=20, fig.height=18
+countDataNorm %>% ggplot(aes(tag_fpkm)) +
+    geom_histogram() +
+    facet_grid(feature_type ~ sample) +
+    scale_x_log10()
+write.delim(countDataNorm, file="countDataNorm.txt")
+# countDataNorm <- read.delim("countDataNorm.txt")
+#'  [countDataNorm](countDataNorm.txt)
+########################################################################################################################
+#' # Correlation Analysis
+#countMatrix <- matrix(1:400,ncol=2)
+#colData <- data.frame(condition=factor(c("a","b")))
+regionTypeDBA <- "tss_1kb"
+#' Performing differential binding analysis for region type `r regionTypeDBA`
+countMatrix <- countData %>% filter(feature_type==regionTypeDBA) %>%
+#    group_by(ensembl_gene_id) %>% filter(max(tag_count)>0) %>%
+    dcast(ensembl_gene_id ~ sample, value.var="tag_count") %>%
+    column2rownames("ensembl_gene_id") %>% as.matrix()
+countMatrix[is.na(countMatrix)] <- 0
+cor(countMatrix, method="spearman") %>% melt() %>% ggplot(aes(Var1, Var2, fill=value)) + geom_tile() + rotXlab()
+########################################################################################################################
+#' # Differential binding analysis
+hasContastsFile=F
+if(hasContastsFile){
+    contrasts <- read.csv(constrasts_file, header=F) %>% set_names(c("sample_1", "sample_2"))
+}else{
+    contrasts <- data.frame(sample=colnames(countMatrix)) %>%
+        merge(.,., suffixes=c("_1", "_2"), by=NULL) %>%
+        filter(ac(sample_1)>ac(sample_2)) %>%
+#        filter(ac(sample_1)!=ac(sample_2)) %>%
+        fac2char
+    write.delim(contrasts.txt, "contrasts.txt")
+}
+#+ results='asis'
+#' Used contrasts model is
+contrasts %>% kable()
+# See deseq [docs](http://master.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf) for details
+require.auto(DESeq2)
+require.auto(gplots)
+## todo make sure that control comes first to get fold-changes right
+#Note: In order to benefit from the default settings of the package, you should put the variable of interest
+#at the end of the formula and make sure the control level is the first level.
+colData <- data.frame(condition=colnames(countMatrix))
+dds <- DESeqDataSetFromMatrix(countMatrix, colData, formula(~ condition))
+#dds <- DESeq(dds)
+dds <- DESeq(dds, fitType='local')
+res <- results(dds)
+resultsNames(dds)
+summary(res)
+deResults <- adply(contrasts, 1,  splat(function(sample_1, sample_2){
+  #  browser()
+   results(dds, contrast=c("condition",sample_1,sample_2)) %>%
+        rownames2column("ensembl_gene_id") %>%
+        as.data.frame() %>%
+        mutate(
+            sample_1=sample_1,
+            sample_2=sample_2,
+            sample_1_overex=log2FoldChange<0
+        )
+}), .progress="text")
+deResults %>% with(as.data.frame(table(sample_1, sample_2)))
+#+ fig.width=20, fig.height=18
+deResults %>% ggplot(aes(log2FoldChange)) +
+    geom_histogram(binwidth=0.1) +
+    facet_grid(sample_1 ~ sample_2) + geom_vline(yintercept=0, color="blue") +
+    xlim(-4,4)
+### see https://www.biostars.org/p/80448/ for Pairwise Comparison
+#res <- results(dds, contrast=c(c("condition","H1M_Dome","H1M_Shield")))
+#res <- results(dds, contrast=c(c("condition","H3K4_Shield","H1M_Shield")))
+#res <- results(dds, contrast=c(c("condition","H1M_Shield", "H3K4_Shield")))
+deResults %<>% mutate(is_hit=pvalue<0.01)
+degs <- deResults %>% filter(is_hit)
+ggplot(degs, aes(paste(sample_1, "vs",  sample_2))) + geom_bar() +xlab(NULL) + ylab("# DBGs") +ggtitle("DBG count summary") + coord_flip()
+ggplot(degs, aes(paste(sample_1, "vs",  sample_2), fill=log2FoldChange>1)) + geom_bar(position="dodge") +xlab(NULL) + ylab("# DBGs") +ggtitle("DBG count summary by direction of expression") + coord_flip()
+## just needed to restore environment easily
+save(degs, file=".degs.RData")
+# degs <- local(get(load(".degs.RData")))
+#res %>% as.df() %>% ggplot(aes(pvalue)) + geom_histogram()
+#res %>% as.df() %>% ggplot(aes(padj)) + geom_histogram()
+plotMA(res, main="DESeq2", ylim=c(-2,2))
+## note see  ?DESeq section on 'Experiments without replicates'
+## hwo to specify which conditions to run the test on
+## http://seqanswers.com/forums/showthread.php?t=28350
+#For making comparisons of multiple conditions (not only against the base level of a condition), we have recently implemented contrasts in the development branch. This allows one to fit a single model, then generate log2 fold change estimates, standard errors and tests of null hypotheses for other comparisons.
+#+ eval=FALSE, echo=FALSE, include=F
+## DEBUG start
+if(F){
+## DEBUG end
+## base means (see http://seqanswers.com/forums/showthread.php?t=28350&page=2)
+baseMeanPerLvl <- sapply(levels(colData(dds)$condition), function(lvl) rowMeans(counts(dds,normalized=TRUE)[,colData(dds)$condition == lvl]))
+counts(dds,normalized=TRUE) %>% as.df() %>% set_names(colData(dds)$condition) %>%
+    rownames2column("ensembl_gene_id") %>%
+    filter(ensembl_gene_id=="ENSDARG00000098036")
+countDataNorm %>% filter(ensembl_gene_id=="ENSDARG00000000324", sample %in% c("H3HA_Oblong", "H3HA_Dome"), feature_type==regionTypeDBA)
+#baseMean log2FoldChange     lfcSE       stat     pvalue      padj    ensembl_gene_id    sample_1  sample_2 sample_1_overex
+#19  12.529275     0.95396312 0.6345064  1.5034728 0.13271717 0.8131601 ENSDARG00000000324 H3HA_Oblong H3HA_Dome           FALSE
+}
+########################################################################################################################
+#' ## Term enrichment
+#+ echo=FALSE
+#' This analysis was performed using [David](http://david.abcc.ncifcrf.gov/). The following ontologies were tested: `r paste(ontologies, collapse=', ')`
+geneLists <- degs %>%
+#    transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2, "ovex", sample_1_overex, sep="_"))
+    transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2))
+split_hit_list <- F
+grpdDegs <- if(split_hit_list){
+    degs %>% group_by(sample_1, sample_2, sample_1_overex)
+}else{
+    degs %>% group_by(sample_1, sample_2)
+}
+enrResults <- grpdDegs %>% do(davidAnnotationChart(.$ensembl_gene_id))
+write.delim(enrResults, file="enrResults.txt")
+# enrResults <- read.delim("enrResults.txt")
+#'  [Enrichment Results](enrResults.txt)
+sigEnrResults <- subset(enrResults, Bonferroni <0.01)
+write.delim(sigEnrResults, file="sigEnrResults.txt")
+# sigEnrResults <- read.delim("sigEnrResults.txt")
+#'  [Very Significant Terms](sigEnrResults.txt)
+## plot the enrichment results
+#sigEnrResults %>% group_by(Category, add=T) %>% do({
+#    logPlot <- . %>% ggplot(aes(Term, PValue)) +
+#	    geom_bar(stat="identity")+coord_flip() +
+#	    xlab("Enriched Terms") +
+#	    ggtitle(.$Category[1]) +
+#	    scale_y_log10()
+#	    print(logPlot)
+#})
+sigEnrResults %>% do({
+    enrResultsGrp <- .
+    ## DEBUG enrResultsGrp <- sigEnrResults
+    label <-
+    dplyr::select(enrResultsGrp, matches(ifelse(split_hit_list, "sample_1|sample_2|sample_1_overex", "sample_1|sample_2"))) %>%
+      head(1) %>% apply(1, paste, collapse="_vs_")
+    echo("processing", label)
+    logPlot <- enrResultsGrp %>%
+        ## fix factor order
+        mutate(Term=reorder(Term, -PValue) %>% reorder(as.integer(Category))) %>%
+        ggplot(aes(Term, PValue, fill=Category)) +
+	    geom_bar(stat="identity")+
+	    coord_flip() +
+	    xlab("Enriched Terms") +
+	    ggtitle(label) +
+	    scale_y_log10()
+	    ggsave(paste0(label, " enrichmed terms.pdf"))
+	    print(logPlot)
+})
+#ggsave2()
--- a/chipseq_workflow/peak_report.R
+++ b/chipseq_workflow/peak_report.R
@@ -7,7 +7,7 @@ require.auto(tidyr)
 require(tidyr)
 if (!require("DT")) devtools::install_github("rstudio/DT") ## see http://rstudio.github.io/DT/
-## setwd("/projects/bioinfo/holger/projects/krause_chipseq/macs2__OLD_SORTING")
+## setwd("/projects/bioinfo/holger/projects/krause_chipseq/macs2")
 ########################################################################################################################
 #' # Called Peaks Overview
@@ -49,6 +49,29 @@ peaks %>%
        ggtitle("peak width distribution") +
        xlim(0,1000)
+peaks %>%
+#    sample_frac(0.3) %>%
+    ggplot(aes(sample, peak_width)) +
+        geom_jitter(alpha=0.05, data=sample_frac(peaks, 0.1)) +
+        geom_violin(alpha=0.8) +
+        facet_wrap(~ peak_type)  +
+        # + geom_vline(yintercept=0, color="blue")
+        ggtitle("peak width distribution") +
+        ylim(0,1000) +
+        rotXlab()
+peaks %>%
+#    sample_frac(0.3) %>%
+    ggplot(aes(sample, score)) +
+        geom_jitter(alpha=0.05, data=sample_frac(peaks, 0.1)) +
+        geom_violin(alpha=0.8) +
+        facet_wrap(~ peak_type)  +
+        # + geom_vline(yintercept=0, color="blue")
+        ggtitle("peak score distribution") +
+        ylim(0,100) +
+        rotXlab()
 peaks %>%
 #    sample_frac(0.3) %>%
    ggplot(aes(sample, peak_width)) +
@@ -60,13 +83,31 @@ peaks %>%
        ylim(0,1000)
 #peaks %>% group_by(sample, type) %>% summarize(num_peaks=n(), mean_width=median(end_position-start_position))
-peaks %>% ggplot(aes(sample, fill=peak_type)) + geom_bar() + ggtitle("peak counts")
+peaks %>% ggplot(aes(sample, fill=peak_type)) + geom_bar(position="dodge") + ggtitle("peak counts")
-peaks %>%
+peaks %>%  sample_frac(0.1) %>%
-    sample_fract(0.1) %>%
+    group_by(sample) %>%
-    arrange(-qvalue) %>% head
+    arrange(-score) %>%
+    mutate(x=row_number()/n()) %>%
 #    sample_frac(0.3) %>%
-    ggplot(aes(qvalue, group=sample)) +
+    ggplot(aes(x, qvalue, color=sample)) +
        geom_line() +
-        facet_wrap(~ peak_type)
+        facet_wrap(~ peak_type) +
+        xlim(0, 0.2) + ylim(0,50)
+#
+## chippeak anno
+#+ eval=FALSE, echo=FALSE
+## DEBUG start
+if(F){
+## https://www.biostars.org/p/115101/
+mydata.ranged<-BED2RangedData("hmedip_allpeaks.bed",header=FALSE)
+#http://www.bioconductor.org/packages/release/bioc/vignettes/ChIPpeakAnno/inst/doc/ChIPpeakAnno.pdf
+}
+## DEBUG end
\ No newline at end of file