continued new star workflow

cb717d26 · Holger Brandl · 39bdbab8 · cb717d26 · cb717d26 · cb717d26
Commit cb717d26 authored 9 years ago by Holger Brandl
--- a/common/cp_enrichment.R
+++ b/common/cp_enrichment.R
@@ -12,18 +12,20 @@ Usage: cp_enrichment.R [options] <gene_lists_tsv> <group_col>
 Options:
 --overlay_expr_data Tsv with overlay data for the kegg pathways
 --list_id_col Column containing the grouping variable to speparate different lists [default: ]
+--project <project_prefix>   Name to prefix all generated result files [default: ]
 '

 opts <- docopt(doc, commandArgs(TRUE)) ## does not work when spining
-# opts <- docopt(doc, "--overlay_expr_data ctrl_fc_expr_filtered.txt geneClusters.txt cluster" )
+# opts <- docopt(doc, "--overlay_expr_data ../plot_score_matrix.txt ../degs_by_contrast.txt contrast" )

-devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.13/R/core_commons.R")
-devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.13/R/ggplot_commons.R")
-devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.13/R/bio/diffex_commons.R")
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.14/R/core_commons.R")
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.14/R/ggplot_commons.R")
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.14/R/bio/diffex_commons.R")

 require.auto(knitr)
-#require.auto(clusterProfiler)
-#require.auto(ReactomePA)
+require.auto(DT)
+
+

 ## to fix child support issue with knitr, see also
 ## http://stackoverflow.com/questions/20030523/knitr-nested-child-documents
@@ -47,11 +49,15 @@ if(!is.null(opts$overlay_expr_data)){
 ## TODO expose options to run gesa instead (by assuming that gene lists are sorted)
 ## see http://www.bioconductor.org/packages/release/bioc/vignettes/clusterProfiler/inst/doc/clusterProfiler.pdf for details

-resultsBaseName=basename(opts$gene_lists_tsv) %>% trim_ext("txt") #%>% paste0(".")
+resultsBaseName <- if(str_length(opts$project)>0) paste0(opts$project, ".") else basename(opts$gene_lists_tsv) %>% trim_ext("txt") #%>% paste0(".")
+#resultsBaseName=basename(opts$gene_lists_tsv) %>% trim_ext("txt") #%>% paste0(".")

 ########################################################################################################################
 #' ## Enrichment Analysis

+unloadNamespace('dplyr'); require(dplyr)
+
+
 #' This analysis was performed using [clusterProfiler](http://bioconductor.org/packages/release/bioc/html/clusterProfiler.html). The following ontologies were tested: Kegg, Go, Reactome, Dose,

 listLabels <- geneLists %>% select(-ensembl_gene_id) %>% distinct
@@ -305,7 +311,7 @@ l_ply(term_barplot_files$file, function(pngFile){ cat(paste0("<img src='", pngFi

 ########################################################################################################################
 #' ## Enriched KEGG Pathways
-#+ eval=nrow(enrResults %>% filter(ontology=="kegg")) >0
+#+ eval=(nrow(enrResults %>% filter(ontology=="kegg")) >0)

 #' To understand spatio-temporal changes in gene expression better we now overlay enriched kegg pathways with the -log10(q_value) of each contrast. The direction of the expression changes is encoded as color, whereby red indicates that sample_1 is overexpressed. Because we have multiple contrasts of interest, this defines a slice-color barcode for each gene. To relate the barcode to contrasts we define the following slice order:

@@ -318,8 +324,8 @@ keggPathways <- enrResults %>% filter(ontology=="kegg") %$% ac(ID) %>% unique()
 #}

 #+ echo=FALSE
-require(pathview)
-require(png)
+require.auto(pathview)
+require.auto(png)

 # keggPathways <- keggPathways[1]


--- a/dge_workflow/dge_star_template.sh
+++ b/dge_workflow/dge_star_template.sh
+# todo define project name
+export project=<<PROJECTNAME>>
+# screen -R $project
+
+
+## madmax
+if [ -n "$LSF_SERVERDIR" ]; then
+    export baseDir="/projects/bioinfo/holger/projects/${project}"
+    export PROJECT_SCRIPTS="/projects/bioinfo/holger/scripts/${project}"
+    export NGS_TOOLS="/projects/bioinfo/scripts/ngs_tools/v1.1"
+fi
+
+## bioinfo
+if [ $(hostname) == "bioinformatics-srv1" ]; then
+    #<<<todo define paths on bioinfo>>>
+    #export baseDir=/home/brandl/mnt/chip-seq_study/ChIPSeq_March_2015/data
+    #export PROJECT_SCRIPTS=/home/brandl/mnt/chip-seq_study/ChIPSeq_March_2015/scripts
+    export NGS_TOOLS="/home/brandl/mnt/mack/bioinfo/scripts/ngs_tools/v1.1"
+fi
+
+
+source ${NGS_TOOLS}/dge_workflow/dge_utils.sh
+export PATH=${NGS_TOOLS}/dge_workflow:$PATH
+
+
+## todo define igenome to be used
+## igenome=/projects/bioinfo/igenomes/Canis_familiaris/Ensembl/CanFam3.1
+igenome=<<<<TBD>>>>
+
+########################################################################################################################
+### Fetch the data
+
+mcdir $baseDir/originals
+
+wget -nc --user="USER" --password="PW"  -r --no-directories  --no-check-certificate -A "*fastq.gz" https:/projects.biotec.tu-dresden.de/ngs-filesharing/martaf/
+
+mailme "$project: fastq download done"
+
+### Basic QC
+dge_fastqc $(ls *fastq.gz) &
+
+## todo make sure to also copy the sample sheet in here
+
+########################################################################################################################
+### Apply renaming and merge lane replicates (but keep technical ones)
+
+## todo adjust renaming scheme to project specifics
+
+mcdir $baseDir/lanereps_pooled
+
+echo '
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.10/R/core_commons.R")
+
+options(java.parameters = "-Xmx4g" ); library(xlsx)
+
+sheetFile <- "../originals/natalied-FC_SN678_338-2015-5-12.xls"
+
+# nc: no culture
+# na: no hormone
+# ECD: ecdysone
+# INS: insulin
+## first number : biological repliciate
+## last number: time
+
+renaming_scheme=c("NC" = "no_culture", "NA" = "no_hormone", "ECD"="ecdysone_", "INS"="insulin_", "9"="9h", "4"="4h")
+
+sampleSheet <- read.xlsx2(sheetFile, "Fastqfiles") %>%
+    select(File, SampleName) %>%
+    mutate(
+        bio_replicate=str_match(SampleName, "(.).*")[,2],
+        sample = str_replace(SampleName, "[0-9]*", "") %>% str_replace_all(renaming_scheme),
+        bio_sample=paste(sample, bio_replicate, sep="_")
+    )
+
+write.delim(sampleSheet, file="renaming_scheme.txt")
+
+#sampleSheet %>% count(bio_sample)
+
+#require(ggplot2)
+#ggplot(sampleSheet, aes(bio_sample)) + geom_bar() + coord_flip()
+## merge lane replication
+
+
+## rather write file
+sampleSheet %>% group_by(bio_sample) %>% summarise(
+    zcat=paste("zcat", paste(paste0("../originals/", File), collapse=" "), "| gzip -c >", paste0(bio_sample[1], ".fastq.gz"))
+) %>% with(zcat) %>% write.delim(header=F, file="lane_merge.cmd", quote=F)
+' | R --vanilla -q
+
+cat lane_merge.cmd | while read line; do
+    mysub "${project}__repmerge" "$line" | joblist .repmerge
+done
+
+
+wait4jobs .repmerge
+mailme "$project: replicate merging done"
+
+dge_fastqc $(ls *fastq.gz) &
+
+
+########################################################################################################################
+### Alignment the reads
+
+mcdir $baseDir/alignments
+
+star_align.sh $igenome $(ls $baseDir/lanereps_pooled/*.fastq.gz)
+
+
+########################################################################################################################
+### Differential Expression Analysis
+
+
+mcdir $baseDir/dge_analysis
+
+rend.R -e $NGS_TOOLS/dge_workflow/featcounts_deseq.R ../mapped/star_counts_matrix.txt  2>&1  | tee featcounts_deseq.log
+
+
+## or use just some contrasts of interest
+#echo "
+#sample_1, sample_2
+#unpolarised,liver_polar_stage3
+#" | csvformat -T  > contrasts.txt
+#
+#rend.R -e $NGS_TOOLS/dge_workflow/featcounts_deseq.R "\"--contrasts contrasts.txt ../alignments/star_counts_matrix.txt\""  2>&1  | tee featcounts_deseq.log
+
+
+## Term enrichment
+mcdir $baseDir/dge_analysis/dge_enrichment_analysis
+$NGS_TOOLS/common/cp_enrichment.R --overlay_expr_data ../plot_score_matrix.txt ../degs_by_contrast.txt contrast
+rend.R -e $NGS_TOOLS/common/cp_enrichment.R "\"--overlay_expr_data ../plot_score_matrix.txt ../degs_by_contrast.txt contrast\""
+
+
+########################################################################################################################
+### Sync back to project space
+
+## bidirectional sync with project space
+## todo define mount path on bioinfo for bidirectional synching
+~/bin/unison $baseDir ssh://bioinfo///home/brandl/mnt/<<MOUNT_PATH>> -fastcheck true -times -perms 0
+
+# uni-directional sync
+#rsync -avsn --delete  $baseDir brandl@fileserver:/projects//file/server/path
+mailme "$project: sync done"
\ No newline at end of file
--- a/dge_workflow/featcounts_deseq.R
+++ b/dge_workflow/featcounts_deseq.R
@@ -118,6 +118,7 @@ stopifnot(all((contrasts %>% gather %$% value %>% unique) %in% colData$condition

 dds <- DESeqDataSetFromMatrix(countMatrix, colData, formula(~ condition))
 dds <- estimateSizeFactors(dds)
+dds <- estimateDispersions(dds)


 ########################################################################################################################
@@ -130,7 +131,6 @@ dds <- estimateSizeFactors(dds)
 #' The dispersion plot shows how the estimates are shrunk from the gene wise values (black dots) toward the fitted estimates, with the final values used in testing being the blue dots.
 #' The dispersion can be understood as the square of the coefficient of biological variation.  So, if a gene's expression typically differs from replicate to replicate sample by 20%, this gene's dispersion is: .20^2 = .04.
 ## The function estimateDispersions performs three steps. First, it estimates, for each gene and each (replicated) condition, a dispersion value, then, it fits, for each condition, a curve through the estimates. Finally, it assigns to each gene a dispersion value, using either the estimated or the fitted value.
-#dds <- estimateDispersions(dds)
 plotDispEsts(dds, main="Dispersion plot")

 ########################################################################################################################
@@ -406,7 +406,7 @@ degs %>% transmute(ensembl_gene_id, contrast=paste(sample_1, "vs", sample_2)) %>

 #' ### DEG Counts

-#+ fig.height=nrow(contrasts)+2
+#+ fig.height=nrow(contrasts)/2+2
 #ggplot(degs, aes(paste(sample_1, "vs",  sample_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DEGs by contrast") + coord_flip()
 ggplot(degs, aes(paste(sample_1, "vs",  sample_2), fill=(s1_overex))) + geom_bar() + xlab(NULL) + ylab("# DGEs") + ggtitle("DEGs by contrast") + coord_flip()