Commit 6dc2c135 authored by Holger Brandl's avatar Holger Brandl

improved dge templates

parent c3958cf3
......@@ -25,10 +25,53 @@ To use the structure from above when working on bioinformatics-srv1 just use
/home/brandl/mnt/mack/bioinfo/scripts/ngs_tools/v1.0
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Get started with an RNASeq Project
==================================
1. use
/Volumes/projects/bioinfo/scripts/ngs\_tools/dev/misc/new\_project\_template.sh
to setup project directories
2. copy
/Volumes/projects/bioinfo/scripts/ngs\_tools/dev/dge\_workflow/dge\_master\_template.sh
to scripts directory
 
How to create a new version tag
-------------------------------
 
1. Create branch:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
git checkout -b v3.23
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2. Check out the branch into ngs\_tools
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
cd /Volumes/projects/bioinfo/scripts/ngs_tools/
mkdir v3.23
git clone —> git checkout v3.23
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 
ToDo
====
- mcdir sourced in?
- use STAR for mapping
- split cuffdiff report into individual tools
- Use featureCounts and DESeq2 for differential expression analysis
- Finish igv\_track\_range.sh
- Better peak report bioinfo\_templates/chipseq\_workflow/peak\_report.R
......
......@@ -45,54 +45,51 @@ mailme "$project: fastq download done"
dge_fastqc $(ls *fastq.gz) &
########################################################################################################################
### Apply renaming and merge lane replicates (but keep technical ones)
### Apply renaming and merge lane replicates (but keep technical ones)
## todo adjust renaming scheme to project specifics
mcdir $baseDir/lanereps_pooled
echo '
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.10/R/core_commons.R")
options(java.parameters = "-Xmx4g" ); library(xlsx)
#sheetFile <- "/projects/bioinfo/holger/projects/stefania_isnm1/originals/stefaniat-FC_SN678_293-2014-11-26.xls"
sheetFile <- commandArgs(T)[1]
sheetFile <- "../originals/natalied-FC_SN678_338-2015-5-12.xls"
# nc: no culture
# na: no hormone
# ECD: ecdysone
# INS: insulin
## first number: biological replicate
## last number: time
#str_replace_all(x, c("[ad]" = "!", "[cf]" = "?"))
#install.packages("stringr")
renaming_scheme=c("NC" = "no_culture", "NA" = "no_hormone", "ECD"="ecdysone_", "INS"="insulin_", "9"="9h", "4"="4h")
sampleSheet <- read.xlsx2(sheetFile, "Fastqfiles") %>%
select(File, SampleID, SampleName) %>%
## clean up the formatting a bit
mutate(
SampleName=str_replace(SampleName, "Ctrl14", "Ctrl_14"),
SampleName=ifelse(str_detect(SampleName, "_R"), SampleName, paste0(SampleName, "_R1")),
SampleName=str_replace(SampleName, "_14", "_"),
SampleName=str_replace(SampleName, "_R", "_TR")
) %>%
## also add a column containing just biological replicate info (for bam merging later)
select(File, SampleName) %>%
mutate(
bio_sample=str_replace(SampleName, "_TR(1|2)", "")
bio_replicate=str_match(SampleName, "(.).*")[,2],
sample = str_replace(SampleName, "[0-9]*", "") %>% str_replace_all(renaming_scheme),
bio_sample=paste(sample, bio_replicate, sep="_")
)
write.delim(sampleSheet, file="renaming_scheme.txt")
require(ggplot2)
#ggplot(sampleSheet, aes(SampleName)) + geom_bar() + coord_flip()
## merge lane replication
#sampleSheet %>% count(bio_sample)
library(foreach)
library(doMC)
registerDoMC(cores=4)
#require(ggplot2)
#ggplot(sampleSheet, aes(bio_sample)) + geom_bar() + coord_flip()
## merge lane replication
#sampleSheet %>% group_by(SampleName) %>% do(with(., print(paste("zcat", paste(File, collapse=" "), "| gzip -c >", paste0(SampleName, ".fastq.gz")))))
## with parallelization
#sampleSheet %>% d_ply(.(SampleName), with,
# system(paste("zcat", paste(paste0("../originals/", File), collapse=" "), "| gzip -c >", paste0(SampleName, ".fastq.gz")))
#, .parallel=T)
## rather write file
sampleSheet %>% group_by(sample_name) %>% summarise(
zcat=paste("zcat", paste(paste0("../originals/", File), collapse=" "), "| gzip -c >", trep_fastq_merged[1])
sampleSheet %>% group_by(bio_sample) %>% summarise(
zcat=paste("zcat", paste(paste0("../originals/", File), collapse=" "), "| gzip -c >", paste0(bio_sample[1], ".fastq.gz"))
) %>% with(zcat) %>% write.delim(header=F, file="lane_merge.cmd", quote=F)
' | R --vanilla -q --args $baseDir/originals/stefaniat-FC_SN678_293-2014-11-26.xls
' | R --vanilla -q
cat lane_merge.cmd | while read line; do
mysub "${project}__repmerge" "$line" | joblist .repmerge
......@@ -127,19 +124,19 @@ dge_tophat_se -i $igenome $fastqFiles 2>&1 | tee mapped.log
mailme "$project: mapping done"
########################################################################################################################
### Basic Alignment QC and technical replicate grouping
mcdir $baseDir/trep_pooled
bio_reps=<<<biological replicates labels>>>
## Examples
# bio_reps=$(csvcut -tc bio_sample ../lanereps_pooled/renaming_scheme.txt | tail -n+2 | sort -u | xargs echo | tr " " ",")
## bio_reps="ctrl,isnm1"
dge_merge_treps $baseDir/mapped/ $bio_reps
#########################################################################################################################
#### Basic Alignment QC and technical replicate grouping
#
#mcdir $baseDir/trep_pooled
#
#
#bio_reps=<<<biological replicates labels>>>
### Examples
## bio_reps=$(csvcut -tc bio_sample ../lanereps_pooled/renaming_scheme.txt | tail -n+2 | sort -u | xargs echo | tr " " ",")
### bio_reps="ctrl,isnm1"
#
#dge_merge_treps $baseDir/mapped/ $bio_reps
#
########################################################################################################################
### Do the differential expression analysis
......@@ -150,7 +147,7 @@ gtfFile=$igenome/Annotation/Genes/genes.gtf
head $gtfFile
## define labels to split bam files into replicate groups
labels=$(csvcut -t -c1 $baseDir/lanereps_pooled/sample2replicates.txt | tail -n+2 | uniq | xargs echo | sed 's/ /,/g')
labels=$(csvcut -t -c4 $baseDir/lanereps_pooled/renaming_scheme.txt | tail -n+2 |sort | uniq | xargs echo | sed 's/ /,/g')
## todo better externalize them
#labels=<<<<TBD>>>>
......
## Project bootstrap template: creates the project directory on the shared
## project space, sets up a bare git origin for the analysis scripts, and
## clones that origin both locally and on the cluster.
## NOTE(review): the `mm` step below suggests this is meant to be pasted
## step by step into an interactive shell rather than executed as a single
## script — confirm before running it unattended.

## allow group to modify results by default (group: rwx, others: no access)
umask u=rwx,g=rwx,o=

## todo adjust the project name to the project at hand
export PROJECT_NAME=dye_rnaseq
export PROJECT_DIR=bioinformatics/projects/${PROJECT_NAME}

#############################################
# setup project locally on project space
#############################################

mkdir "/Volumes/${PROJECT_DIR}"
cd "/Volumes/${PROJECT_DIR}"

## setup data (unison target)
mkdir data

## setup scripts: create the bare repository that serves as git origin
git init --bare .scripts_git_origin

## check out the working copy
## (cloned exactly once: a second clone into the already populated `scripts`
## directory would fail because the destination is not empty)
git clone "ssh://fileserver/projects/${PROJECT_DIR}/.scripts_git_origin" scripts

#############################################
## setup project on cluster
#############################################

## NOTE(review): `mm` is not defined in this file; presumably a shell alias
## that opens a session on the cluster — confirm
mm

## setup data (to be synced with unison later)
mkdir "/projects/bioinfo/holger/projects/${PROJECT_NAME}"
cd "/projects/bioinfo/holger/projects/${PROJECT_NAME}"

## setup scripts
git clone "ssh://fileserver/projects/${PROJECT_DIR}/.scripts_git_origin" "/projects/bioinfo/holger/scripts/${PROJECT_NAME}"

## configure which version of common tools to use
cd "/projects/bioinfo/holger/scripts/${PROJECT_NAME}"
ln -s /projects/bioinfo/scripts/ngs_tools/dev ngs_tools

## ignore the ngs_tools symlink so that it never gets committed
echo ngs_tools > .gitignore
git add .gitignore
git commit -m "started project"
git push origin master

## setup project on bioinfo
## NOTE(review): `bi` is presumably the matching alias for the bioinfo host — confirm
#bi
#mkdir ~/projects/${PROJECT_NAME}
#cd ~/projects/${PROJECT_NAME}
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment