diff --git a/dge_workflow/featcounts_deseq.R b/dge_workflow/featcounts_deseq.R index 84eb776b7dc1db1651d4483e037cb31da6222c6b..f38cd2d329e7d7e0347d9d637463591ead6be484 100755 --- a/dge_workflow/featcounts_deseq.R +++ b/dge_workflow/featcounts_deseq.R @@ -21,7 +21,7 @@ Options: #opts <- docopt(doc, commandArgs(TRUE)) -opts <- docopt(doc, "--pcutoff 0.05 --contrasts ../time_contrasts.txt ../peak_clusters_tss5kb.count_matrix.txt") +opts <- docopt(doc, "--pcutoff 0.05 --contrasts ~/MPI-CBG_work/P5_DESeq/testing/dba_contrasts.txt ~/MPI-CBG_work/P5_DESeq/testing/countMatrix.txt") ## opts <- docopt(doc, "countMatrix.txt") require(knitr) @@ -60,15 +60,7 @@ qcutoff <- if(is.numeric(pcutoff)) NULL else as.numeric(opts$qcutoff) if(is.numeric(pcutoff)) opts$qcutoff <- NULL -######################################################################################################################## -#' # Load annotation data -## load transcriptome annotations needed for results annotation -geneInfo <- quote({ - mart <- biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl")) - c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>% - biomaRt::getBM(mart=mart) - }) %>% cache_it("geneInfo") ######################################################################################################################## #' # Differential Expresssion Analysis @@ -82,6 +74,8 @@ geneInfo <- quote({ ######################################################################################################################## #' # Differential Expresssion Analysis +#' ## Data preparation + #' Working Directory: `r getwd()` resultsBase <- count_matrix_file %>% basename() %>% trim_ext(".txt") %>% trim_ext(".count_matrix") @@ -96,6 +90,16 @@ nrow(countMatrix) countMatrix %<>% filterByExpression() nrow(countMatrix) +######################################################################################################################## +#' # Load annotation data + +## load transcriptome annotations needed for results annotation +geneInfo <- quote({ + ## mart <- biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl"))§ + mart <- biomaRt::useDataset(guess_mart(ensembl_gene_id), mart = biomaRt::useMart("ensembl")), + c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>% + biomaRt::getBM(mart=mart) +}) %>% cache_it("geneInfo") #filterByExpression <- function(fpkmMat, min_count=1, logMode=F){ # if(logMode) fpkmMat<-log10(fpkmMat+1) ## add a pseudocount @@ -250,9 +254,9 @@ maData %>% ggplot(aes(0.5*log2(norm_count_1*norm_count_2), log2(norm_count_2/nor ##Volcano plots hitCounts <- filter(deResults, is_hit) %>% - count(sample_1, sample_2, sample_1_overex=s1_over_s2_logfc>0) %>% + count(sample_1, sample_2, s1_overex) %>% rename(hits=n) %>% - merge(data.frame(sample_1_overex=c(T,F), x_pos=c(-2.5,2.5))) + merge(data.frame(s1_overex=c(T,F), x_pos=c(-2.5,2.5))) #+ fig.width=16, fig.height=14 deResults %>% ggplot(aes(s1_over_s2_logfc, -log10(pvalue), color=is_hit)) + @@ -329,6 +333,66 @@ degs %>% select(s1_over_s2_logfc, pvalue, ensembl_gene_id, sample_1, sample_2, external_gene_name, description, igv) %>% kable("html", table.attr = "class='dtable'", escape=F) + + + +#' DEGs (differentially expressed genes) are contained in the following table +write.delim(degs, file="degs.txt") +# degs <- read.delim("degs.txt") +#' [Differentially Expressed Genes](degs.txt) + +#' ### DEG Counts +#with(degs, as.data.frame(table(sample_1, sample_2, s1_overex))) %>% filter(Freq >0) %>% kable() +degs %>% count( sample_1, sample_2, s1_overex) %>% kable() + +#+ fig.height=nrow(contrasts)+2 +#ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DEGs by contrast") + coord_flip() +ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=(s1_overex))) + geom_bar() + xlab(NULL) + ylab("# DGEs") + ggtitle("DEGs by contrast") + coord_flip() + + +#' DEGs can be browsed in Excel using the exported table or via the embedded table browser. To enable the IGV links, you need to set the port in the IGV preferences to 3334. +kableDegs <- degs +if(nrow(degs>2000)){ + kableDegs <- degs %>% arrange(padj) %>% head(1000) + # Error: object 'q_value' not found. So: padj used instead of '-q_value' + print("just showing first 1000 most significant hits (highest p-value) in interactive hit table!!!! Use Excel to browser to browse the full set") +} + +#+ results='asis' +kableDegs %>% + #inner_join(geneLoci) %>% + mutate( + ensembl=paste0("<a href='http://www.ensembl.org/Multi/Search/Results?y=0;site=ensembl_all;x=0;page=1;facet_feature_type=Gene;q=",ensembl_gene_id, "'>",ensembl_gene_id, "</a>"), + igv=paste0("<a href='http://localhost:60151/goto?locus=", chromosome_name,":", start_position, "-", end_position, "'>IGV</a>") + ) %>% + select(sample_1, sample_2, s1_over_s2_logfc, pvalue, padj, ensembl_gene_id, external_gene_name, description) %>% + kable("html", table.attr = "class='dtable'", escape=F) + +## just needed to restore environment easily +save(degs, file=".degs.RData") +# degs <- local(get(load(".degs.RData"))) + +##-------------------------------------------------------- + +#' ### MA Plot +#' Redo MA plots but now including hit overlays + +maPlots <- deResults %>% group_by(sample_1, sample_2) %>% do(gg={ maData <-. +## todo why not s2_over_s1_log2fc +maData %>% ggplot(aes(0.5*log2(value_1*value_2), log2(value_1/value_2), color=is_hit)) + + geom_point(alpha=0.3) + + geom_hline(yintercept=0, color="red") + + ggtitle(with(maData[1,], paste(sample_1, "vs", sample_2))) +}) %$% gg + +#+ fig.height=10*ceiling(length(maPlots)/2) +multiplot(plotlist=maPlots, cols=min(2, length(maPlots))) + +# Error in eval(expr, envir, enclos) : object 'value_1' not found +##-------------------------------------------------------- + + + ## TODO Melanie compare with section "Differentially expressed genes" and complement missing bits