# todo define project name export project=<<PROJECTNAME>> # screen -R $project ## madmax if [ -n "$LSF_SERVERDIR" ]; then export baseDir="/projects/bioinfo/holger/projects/${project}" export PROJECT_SCRIPTS="/projects/bioinfo/holger/scripts/${project}" export NGS_TOOLS="/projects/bioinfo/scripts/ngs_tools/v1.1" fi ## bioinfo if [ $(hostname) == "bioinformatics-srv1" ]; then #<<<todo define paths on bioinfo>>> #export baseDir=/home/brandl/mnt/chip-seq_study/ChIPSeq_March_2015/data #export PROJECT_SCRIPTS=/home/brandl/mnt/chip-seq_study/ChIPSeq_March_2015/scripts export NGS_TOOLS="/home/brandl/mnt/mack/bioinfo/scripts/ngs_tools/v1.1" fi source ${NGS_TOOLS}/dge_workflow/dge_utils.sh export PATH=${NGS_TOOLS}/dge_workflow:$PATH ## todo define igenome to be used ## igenome=/projects/bioinfo/igenomes/Canis_familiaris/Ensembl/CanFam3.1 igenome=<<<<TBD>>>> ######################################################################################################################## ### Fetch the data mcdir $baseDir/originals wget -nc --user="USER" --password="PW" -r --no-directories --no-check-certificate -A "*fastq.gz" https:/projects.biotec.tu-dresden.de/ngs-filesharing/martaf/ mailme "$project: fastq download done" ## todo make sure to also copy the sample sheet in here ######################################################################################################################## ### Basic QC dge_fastqc $(ls *fastq.gz) & ######################################################################################################################## ### Apply renaming and merge lane replicates (but keep technical ones) ## todo adjust renaming scheme to project specifics mcdir $baseDir/lanereps_pooled echo ' devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.10/R/core_commons.R") options(java.parameters = "-Xmx4g" ); library(xlsx) sheetFile <- "../originals/natalied-FC_SN678_338-2015-5-12.xls" # nc: no culture # na: no hormone # ECD: ecdysone # INS: insulin ## first number : biological repliciate ## last number: time renaming_scheme=c("NC" = "no_culture", "NA" = "no_hormone", "ECD"="ecdysone_", "INS"="insulin_", "9"="9h", "4"="4h") sampleSheet <- read.xlsx2(sheetFile, "Fastqfiles") %>% select(File, SampleName) %>% mutate( bio_replicate=str_match(SampleName, "(.).*")[,2], sample = str_replace(SampleName, "[0-9]*", "") %>% str_replace_all(renaming_scheme), bio_sample=paste(sample, bio_replicate, sep="_") ) write.delim(sampleSheet, file="renaming_scheme.txt") #sampleSheet %>% count(bio_sample) #require(ggplot2) #ggplot(sampleSheet, aes(bio_sample)) + geom_bar() + coord_flip() ## merge lane replication ## rather write file sampleSheet %>% group_by(bio_sample) %>% summarise( zcat=paste("zcat", paste(paste0("../originals/", File), collapse=" "), "| gzip -c >", paste0(bio_sample[1], ".fastq.gz")) ) %>% with(zcat) %>% write.delim(header=F, file="lane_merge.cmd", quote=F) ' | R --vanilla -q cat lane_merge.cmd | while read line; do mysub "${project}__repmerge" "$line" | joblist .repmerge done wait4jobs .repmerge mailme "$project: replicate merging done" dge_fastqc $(ls *fastq.gz) & ######################################################################################################################## ### Trim low-quality bases and remove left-over adapters mcdir $baseDir/trimmed dge_cutadapt $(ls $baseDir/lanereps_pooled/*fastq.gz) 2>&1 | tee cutadapt.log dge_fastqc $(ls *fastq.gz) & ######################################################################################################################## ### Align the reads mcdir $baseDir/mapped fastqFiles=$(ls $baseDir/trimmed/*.fastq.gz) dge_tophat_se -i $igenome $fastqFiles 2>&1 | tee mapped.log ## also create bigwig files # dge_bigwig ${igenome}/Sequence/Bowtie2Index/genome.fa.fai $(find . -name "*.bam" | grep -v unmapped | xargs echo) & mailme "$project: mapping done" ######################################################################################################################### #### Technical replicate grouping # #mcdir $baseDir/trep_pooled # # #bio_reps=<<<biological replicates labels>>> ### Examples ## bio_reps=$(csvcut -tc bio_sample ../lanereps_pooled/renaming_scheme.txt | tail -n+2 | sort -u | xargs echo | tr " " ",") ### bio_reps="ctrl,isnm1" # #dge_merge_treps $baseDir/mapped/ $bio_reps ######################################################################################################################## ### Do the differential expression analysis mcdir $baseDir/cuffdiff gtfFile=$igenome/Annotation/Genes/genes.gtf head $gtfFile ## define labels to split bam files into replicate groups labels=$(csvcut -t -c4 $baseDir/lanereps_pooled/renaming_scheme.txt | tail -n+2 |sort | uniq | xargs echo | sed 's/ /,/g') ## todo better externalize them #labels=<<<<TBD>>>> dge_cuffdiff $gtfFile $baseDir/mapped $labels mailme "$project: cuffdiff done" mcdir $baseDir/cuffdiff/dge_report spin.R ${NGS_TOOLS}/dge_workflow/cuffdiff_report.R .. ## or when using specfic contrasts #echo "Ctrl_12h,X11B_12h #Ctrl_36h,X11B_36h" > contrasts.txt #spin.R $NGS_TOOLS/dge_workflow/cuffdiff_report.R \""--constrasts contrasts.txt --pcutoff 0.01 $baseDir/cuffdiff"\" ######################################################################################################################## ### Sync back to project space ## bidirectional sync with project space ## todo define mount path on bioinfo for bidirectional synching ~/bin/unison $baseDir ssh://bioinfo///home/brandl/mnt/<<MOUNT_PATH>> -fastcheck true -times -perms 0 # uni-directional sync #rsync -avsn --delete $baseDir brandl@fileserver:/projects//file/server/path mailme "$project: sync done"