Skip to content
Snippets Groups Projects
Commit cb717d26 authored by Holger Brandl's avatar Holger Brandl
Browse files

continued new star workflow

parent 39bdbab8
No related branches found
No related tags found
No related merge requests found
......@@ -12,18 +12,20 @@ Usage: cp_enrichment.R [options] <gene_lists_tsv> <group_col>
Options:
--overlay_expr_data Tsv with overlay data for the kegg pathways
--list_id_col Column containing the grouping variable to speparate different lists [default: ]
--project <project_prefix> Name to prefix all generated result files [default: ]
'
opts <- docopt(doc, commandArgs(TRUE)) ## does not work when spining
# opts <- docopt(doc, "--overlay_expr_data ctrl_fc_expr_filtered.txt geneClusters.txt cluster" )
# opts <- docopt(doc, "--overlay_expr_data ../plot_score_matrix.txt ../degs_by_contrast.txt contrast" )
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.13/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.13/R/ggplot_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.13/R/bio/diffex_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.14/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.14/R/ggplot_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.14/R/bio/diffex_commons.R")
require.auto(knitr)
#require.auto(clusterProfiler)
#require.auto(ReactomePA)
require.auto(DT)
## to fix child support issue with knitr, see also
## http://stackoverflow.com/questions/20030523/knitr-nested-child-documents
......@@ -47,11 +49,15 @@ if(!is.null(opts$overlay_expr_data)){
## TODO expose options to run gesa instead (by assuming that gene lists are sorted)
## see http://www.bioconductor.org/packages/release/bioc/vignettes/clusterProfiler/inst/doc/clusterProfiler.pdf for details
resultsBaseName=basename(opts$gene_lists_tsv) %>% trim_ext("txt") #%>% paste0(".")
resultsBaseName <- if(str_length(opts$project)>0) paste0(opts$project, ".") else basename(opts$gene_lists_tsv) %>% trim_ext("txt") #%>% paste0(".")
#resultsBaseName=basename(opts$gene_lists_tsv) %>% trim_ext("txt") #%>% paste0(".")
########################################################################################################################
#' ## Enrichment Analysis
unloadNamespace('dplyr'); require(dplyr)
#' This analysis was performed using [clusterProfiler](http://bioconductor.org/packages/release/bioc/html/clusterProfiler.html). The following ontologies were tested: Kegg, Go, Reactome, Dose,
listLabels <- geneLists %>% select(-ensembl_gene_id) %>% distinct
......@@ -305,7 +311,7 @@ l_ply(term_barplot_files$file, function(pngFile){ cat(paste0("<img src='", pngFi
########################################################################################################################
#' ## Enriched KEGG Pathways
#+ eval=nrow(enrResults %>% filter(ontology=="kegg")) >0
#+ eval=(nrow(enrResults %>% filter(ontology=="kegg")) >0)
#' To understand spatio-temporal changes in gene expression better we now overlay enriched kegg pathways with the -log10(q_value) of each contrast. The direction of the expression changes is encoded as color, whereby red indicates that sample_1 is overexpressed. Because we have multiple contrasts of interest, this defines a slice-color barcode for each gene. To relate the barcode to contrasts we define the following slice order:
......@@ -318,8 +324,8 @@ keggPathways <- enrResults %>% filter(ontology=="kegg") %$% ac(ID) %>% unique()
#}
#+ echo=FALSE
require(pathview)
require(png)
require.auto(pathview)
require.auto(png)
# keggPathways <- keggPathways[1]
......
# todo define project name
export project=<<PROJECTNAME>>
# screen -R $project
## madmax
if [ -n "$LSF_SERVERDIR" ]; then
export baseDir="/projects/bioinfo/holger/projects/${project}"
export PROJECT_SCRIPTS="/projects/bioinfo/holger/scripts/${project}"
export NGS_TOOLS="/projects/bioinfo/scripts/ngs_tools/v1.1"
fi
## bioinfo
if [ $(hostname) == "bioinformatics-srv1" ]; then
#<<<todo define paths on bioinfo>>>
#export baseDir=/home/brandl/mnt/chip-seq_study/ChIPSeq_March_2015/data
#export PROJECT_SCRIPTS=/home/brandl/mnt/chip-seq_study/ChIPSeq_March_2015/scripts
export NGS_TOOLS="/home/brandl/mnt/mack/bioinfo/scripts/ngs_tools/v1.1"
fi
source ${NGS_TOOLS}/dge_workflow/dge_utils.sh
export PATH=${NGS_TOOLS}/dge_workflow:$PATH
## todo define igenome to be used
## igenome=/projects/bioinfo/igenomes/Canis_familiaris/Ensembl/CanFam3.1
igenome=<<<<TBD>>>>
########################################################################################################################
### Fetch the data
mcdir $baseDir/originals
wget -nc --user="USER" --password="PW" -r --no-directories --no-check-certificate -A "*fastq.gz" https:/projects.biotec.tu-dresden.de/ngs-filesharing/martaf/
mailme "$project: fastq download done"
### Basic QC
dge_fastqc $(ls *fastq.gz) &
## todo make sure to also copy the sample sheet in here
########################################################################################################################
### Apply renaming and merge lane replicates (but keep technical ones)
## todo adjust renaming scheme to project specifics
mcdir $baseDir/lanereps_pooled
echo '
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.10/R/core_commons.R")
options(java.parameters = "-Xmx4g" ); library(xlsx)
sheetFile <- "../originals/natalied-FC_SN678_338-2015-5-12.xls"
# nc: no culture
# na: no hormone
# ECD: ecdysone
# INS: insulin
## first number : biological repliciate
## last number: time
renaming_scheme=c("NC" = "no_culture", "NA" = "no_hormone", "ECD"="ecdysone_", "INS"="insulin_", "9"="9h", "4"="4h")
sampleSheet <- read.xlsx2(sheetFile, "Fastqfiles") %>%
select(File, SampleName) %>%
mutate(
bio_replicate=str_match(SampleName, "(.).*")[,2],
sample = str_replace(SampleName, "[0-9]*", "") %>% str_replace_all(renaming_scheme),
bio_sample=paste(sample, bio_replicate, sep="_")
)
write.delim(sampleSheet, file="renaming_scheme.txt")
#sampleSheet %>% count(bio_sample)
#require(ggplot2)
#ggplot(sampleSheet, aes(bio_sample)) + geom_bar() + coord_flip()
## merge lane replication
## rather write file
sampleSheet %>% group_by(bio_sample) %>% summarise(
zcat=paste("zcat", paste(paste0("../originals/", File), collapse=" "), "| gzip -c >", paste0(bio_sample[1], ".fastq.gz"))
) %>% with(zcat) %>% write.delim(header=F, file="lane_merge.cmd", quote=F)
' | R --vanilla -q
cat lane_merge.cmd | while read line; do
mysub "${project}__repmerge" "$line" | joblist .repmerge
done
wait4jobs .repmerge
mailme "$project: replicate merging done"
dge_fastqc $(ls *fastq.gz) &
########################################################################################################################
### Alignment the reads
mcdir $baseDir/alignments
star_align.sh $igenome $(ls $baseDir/lanereps_pooled/*.fastq.gz)
########################################################################################################################
### Differential Expression Analysis
mcdir $baseDir/dge_analysis
rend.R -e $NGS_TOOLS/dge_workflow/featcounts_deseq.R ../mapped/star_counts_matrix.txt 2>&1 | tee featcounts_deseq.log
## or use just some contrasts of interest
#echo "
#sample_1, sample_2
#unpolarised,liver_polar_stage3
#" | csvformat -T > contrasts.txt
#
#rend.R -e $NGS_TOOLS/dge_workflow/featcounts_deseq.R "\"--contrasts contrasts.txt ../alignments/star_counts_matrix.txt\"" 2>&1 | tee featcounts_deseq.log
## Term enrichment
mcdir $baseDir/dge_analysis/dge_enrichment_analysis
$NGS_TOOLS/common/cp_enrichment.R --overlay_expr_data ../plot_score_matrix.txt ../degs_by_contrast.txt contrast
rend.R -e $NGS_TOOLS/common/cp_enrichment.R "\"--overlay_expr_data ../plot_score_matrix.txt ../degs_by_contrast.txt contrast\""
########################################################################################################################
### Sync back to project space
## bidirectional sync with project space
## todo define mount path on bioinfo for bidirectional synching
~/bin/unison $baseDir ssh://bioinfo///home/brandl/mnt/<<MOUNT_PATH>> -fastcheck true -times -perms 0
# uni-directional sync
#rsync -avsn --delete $baseDir brandl@fileserver:/projects//file/server/path
mailme "$project: sync done"
\ No newline at end of file
......@@ -118,6 +118,7 @@ stopifnot(all((contrasts %>% gather %$% value %>% unique) %in% colData$condition
dds <- DESeqDataSetFromMatrix(countMatrix, colData, formula(~ condition))
dds <- estimateSizeFactors(dds)
dds <- estimateDispersions(dds)
########################################################################################################################
......@@ -130,7 +131,6 @@ dds <- estimateSizeFactors(dds)
#' The dispersion plot shows how the estimates are shrunk from the gene wise values (black dots) toward the fitted estimates, with the final values used in testing being the blue dots.
#' The dispersion can be understood as the square of the coefficient of biological variation. So, if a gene's expression typically differs from replicate to replicate sample by 20%, this gene's dispersion is: .20^2 = .04.
## The function estimateDispersions performs three steps. First, it estimates, for each gene and each (replicated) condition, a dispersion value, then, it fits, for each condition, a curve through the estimates. Finally, it assigns to each gene a dispersion value, using either the estimated or the fitted value.
#dds <- estimateDispersions(dds)
plotDispEsts(dds, main="Dispersion plot")
########################################################################################################################
......@@ -406,7 +406,7 @@ degs %>% transmute(ensembl_gene_id, contrast=paste(sample_1, "vs", sample_2)) %>
#' ### DEG Counts
#+ fig.height=nrow(contrasts)+2
#+ fig.height=nrow(contrasts)/2+2
#ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DEGs by contrast") + coord_flip()
ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=(s1_overex))) + geom_bar() + xlab(NULL) + ylab("# DGEs") + ggtitle("DEGs by contrast") + coord_flip()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment