Something went wrong on our end
-
Holger Brandl authoredHolger Brandl authored
dge_master_template.sh 5.69 KiB
# todo define project name
export project=<<PROJECTNAME>>
# screen -R $project
## madmax
if [ -n "$LSF_SERVERDIR" ]; then
export baseDir="/projects/bioinfo/holger/projects/${project}"
export PROJECT_SCRIPTS="/projects/bioinfo/holger/scripts/${project}"
export NGS_TOOLS="/projects/bioinfo/scripts/ngs_tools/v1.1"
fi
## bioinfo
if [ $(hostname) == "bioinformatics-srv1" ]; then
#<<<todo define paths on bioinfo>>>
#export baseDir=/home/brandl/mnt/chip-seq_study/ChIPSeq_March_2015/data
#export PROJECT_SCRIPTS=/home/brandl/mnt/chip-seq_study/ChIPSeq_March_2015/scripts
export NGS_TOOLS="/home/brandl/mnt/mack/bioinfo/scripts/ngs_tools/v1.1"
fi
source ${NGS_TOOLS}/dge_workflow/dge_utils.sh
export PATH=${NGS_TOOLS}/dge_workflow:$PATH
## todo define igenome to be used
## igenome=/projects/bioinfo/igenomes/Canis_familiaris/Ensembl/CanFam3.1
igenome=<<<<TBD>>>>
########################################################################################################################
### Fetch the data
mcdir $baseDir/originals
wget -nc --user="USER" --password="PW" -r --no-directories --no-check-certificate -A "*fastq.gz" https:/projects.biotec.tu-dresden.de/ngs-filesharing/martaf/
mailme "$project: fastq download done"
## todo make sure to also copy the sample sheet in here
########################################################################################################################
### Basic QC
dge_fastqc $(ls *fastq.gz) &
########################################################################################################################
### Apply renaming and merge lane replicates (but keep technical ones)
## todo adjust renaming scheme to project specifics
mcdir $baseDir/lanereps_pooled
echo '
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.10/R/core_commons.R")
options(java.parameters = "-Xmx4g" ); library(xlsx)
sheetFile <- "../originals/natalied-FC_SN678_338-2015-5-12.xls"
# nc: no culture
# na: no hormone
# ECD: ecdysone
# INS: insulin
## first number : biological repliciate
## last number: time
renaming_scheme=c("NC" = "no_culture", "NA" = "no_hormone", "ECD"="ecdysone_", "INS"="insulin_", "9"="9h", "4"="4h")
sampleSheet <- read.xlsx2(sheetFile, "Fastqfiles") %>%
select(File, SampleName) %>%
mutate(
bio_replicate=str_match(SampleName, "(.).*")[,2],
sample = str_replace(SampleName, "[0-9]*", "") %>% str_replace_all(renaming_scheme),
bio_sample=paste(sample, bio_replicate, sep="_")
)
write.delim(sampleSheet, file="renaming_scheme.txt")
#sampleSheet %>% count(bio_sample)
#require(ggplot2)
#ggplot(sampleSheet, aes(bio_sample)) + geom_bar() + coord_flip()
## merge lane replication
## rather write file
sampleSheet %>% group_by(bio_sample) %>% summarise(
zcat=paste("zcat", paste(paste0("../originals/", File), collapse=" "), "| gzip -c >", paste0(bio_sample[1], ".fastq.gz"))
) %>% with(zcat) %>% write.delim(header=F, file="lane_merge.cmd", quote=F)
' | R --vanilla -q
cat lane_merge.cmd | while read line; do
mysub "${project}__repmerge" "$line" | joblist .repmerge
done
wait4jobs .repmerge
mailme "$project: replicate merging done"
dge_fastqc $(ls *fastq.gz) &
########################################################################################################################
### Trim low-quality bases and remove left-over adapters
mcdir $baseDir/trimmed
dge_cutadapt $(ls $baseDir/lanereps_pooled/*fastq.gz) 2>&1 | tee cutadapt.log
dge_fastqc $(ls *fastq.gz) &
########################################################################################################################
### Align the reads
mcdir $baseDir/mapped
fastqFiles=$(ls $baseDir/trimmed/*.fastq.gz)
dge_tophat_se -i $igenome $fastqFiles 2>&1 | tee mapped.log
## also create bigwig files
# dge_bigwig ${igenome}/Sequence/Bowtie2Index/genome.fa.fai $(find . -name "*.bam" | grep -v unmapped | xargs echo) &
mailme "$project: mapping done"
#########################################################################################################################
#### Technical replicate grouping
#
#mcdir $baseDir/trep_pooled
#
#
#bio_reps=<<<biological replicates labels>>>
### Examples
## bio_reps=$(csvcut -tc bio_sample ../lanereps_pooled/renaming_scheme.txt | tail -n+2 | sort -u | xargs echo | tr " " ",")
### bio_reps="ctrl,isnm1"
#
#dge_merge_treps $baseDir/mapped/ $bio_reps
########################################################################################################################
### Do the differential expression analysis
mcdir $baseDir/cuffdiff
gtfFile=$igenome/Annotation/Genes/genes.gtf
head $gtfFile
## define labels to split bam files into replicate groups
labels=$(csvcut -t -c4 $baseDir/lanereps_pooled/renaming_scheme.txt | tail -n+2 |sort | uniq | xargs echo | sed 's/ /,/g')
## todo better externalize them
#labels=<<<<TBD>>>>
dge_cuffdiff $gtfFile $baseDir/mapped $labels
mailme "$project: cuffdiff done"
mcdir $baseDir/cuffdiff/dge_report
spin.R ${NGS_TOOLS}/dge_workflow/cuffdiff_report.R ..
## or when using specfic contrasts
#echo "Ctrl_12h,X11B_12h
#Ctrl_36h,X11B_36h" > contrasts.txt
#spin.R $NGS_TOOLS/dge_workflow/cuffdiff_report.R \""--constrasts contrasts.txt --pcutoff 0.01 $baseDir/cuffdiff"\"
########################################################################################################################
### Sync back to project space
## bidirectional sync with project space
## todo define mount path on bioinfo for bidirectional synching
~/bin/unison $baseDir ssh://bioinfo///home/brandl/mnt/<<MOUNT_PATH>> -fastcheck true -times -perms 0
# uni-directional sync
#rsync -avsn --delete $baseDir brandl@fileserver:/projects//file/server/path
mailme "$project: sync done"