#!/usr/bin/env kscript //DEPS com.offbytwo:docopt:0.6.0.20150202 de.mpicbg.scicomp:joblist:0.6-SNAPSHOT de.mpicbg.scicomp.bioinfo:kutils:0.1-SNAPSHOT // add docopts to local m2 index // git clone https://github.com/docopt/docopt.java && cd docopt.java && mvn clean package install -Dmaven.test.skip=true // kotlinc-jvm -classpath $(mvncp org.docopt:docopt:0.6.0-SNAPSHOT) import joblist.JobConfiguration import joblist.JobList import kutils.evalBash import org.docopt.Docopt import java.io.File //val args = listOf("/projects/bioinfo/igenomes/Drosophila_melanogaster/Ensembl/BDGP6", "test.fastq.gz") // basic usage tutorial // http://www.homolog.us/blogs/blog/2012/11/02/star-really-kick-ass-rna-seq-aligner/ // https://www.biostars.org/p/91020/ val usage = """ Use star to align fastq files against a genome Usage: star_align.kts [options] <igenome> <fastq_files>... Options: --gtf <gtfFile> Custom gtf file instead of igenome bundled copy --pc-only Use protein coding genes only for mapping and quantification """ val doArgs = Docopt(usage).parse(args.toList()).map { it.key.removePrefix("--").replace("[<>]".toRegex(), "") to it.value?.toString() }.toMap() //println(doArgs.keys.joinToString()) //println(doArgs.values.joinToString()) //doArgs["fastq_files"] // extract all configuration parameters val fastqFiles = (doArgs["fastq_files"] as String).substring(1).substringBefore("]").split(", ").map { File(it) } val igenome = File(doArgs["igenome"]) val star_index = File(igenome, "Sequence/StarIndex") val gtfFile = if (doArgs["gtfFile"] != null) File(doArgs["gtfFile"]) else File(igenome, "Annotation/Genes/genes.gtf") println("validating system requirements...") // make sure that STAR is in the PATH if (evalBash("which STAR").exitCode != 0) throw IllegalArgumentException("STAR aligner is not in PATH") System.getenv("NGS_TOOLS").let { require(it.isNotBlank() && File(it).isDirectory) { "NGS_TOOLS is not defined but needed to locate star_qc.R" } } require(evalBash("type dge_star_counts2matrix").exitCode == 0) { "dge_star_counts2matrix is missing" } require(evalBash("type dge_bam_correlate").exitCode == 0) { "dge_bam_correlate is missing" } println("validating inputs...") // check if gtf file exists if (!gtfFile.isFile()) throw IllegalArgumentException("gtf '${gtfFile}' does not exist") // Check if STAR index is not present if (!File("${star_index}/SA").isFile()) throw IllegalArgumentException("Missing STAR for ${star_index}/SA; use 'dge_create_star_index ${igenome}' to create it") // dge_create_star_index ${igenome}' //println("files are ${fastqFiles.joinToString(", ")}") // make sure all fastq files do exist (for symlinks see http://www.java2s.com/Code/Java/JDK-7/Isfileasymboliclink.htm //fastqFiles.filter { !(it.exists() || Files.isSymbolicLink(it.toPath())) }.let { fastqFiles.filter { !it.isFile }.let { require(it.isEmpty()) { "Some fastq files do not exist ${it.map { it.absoluteFile }.joinToString(", ")}" } } println("Running STAR using igenome '${igenome}' for the following files:\n ${fastqFiles.joinToString("\n")}") val jl = JobList(".starjobs") for (fastqFile in fastqFiles) { println("submitting STAR job for $fastqFile") // uniquely mapping reads only: http:/seqanswers.com/forums/showthread.php?s=93da11109ae12a773a128a637d69defe&t=3464 //# tophat -p6 -G $gtfFile -g1 -o test $bowtie_gindex $fastqFile // params: // --outSAMstrandField intronMotif is required for cuffdiff compatiblity (xs flag) // --outFilterIntronMotifs RemoveNoncanonicalUnannotated --outFilterType BySJout to get rid of artifical junctions // --quantMode GeneCounts see https://groups.google.com/forum/#!searchin/rna-star/GeneCounts/rna-star/gZRJx3ElRNo/p5FjBYKuY00J // --outSJfilterCountUniqueMin see https://groups.google.com/forum/#!topic/rna-star/_1BeAlGUmpA val fastqBaseName = fastqFile.name.removeSuffix(".gz").removeSuffix(".fastq") val optionalZcat = if (fastqFile.name.endsWith("gz")) "--readFilesCommand zcat" else "" val cmd = """ STAR --genomeDir $star_index --readFilesIn $fastqFile --runThreadN 6 ${optionalZcat} --outFileNamePrefix ${fastqBaseName}. --outSAMtype BAM SortedByCoordinate --outSAMstrandField intronMotif --sjdbGTFfile $gtfFile --outFilterIntronMotifs RemoveNoncanonicalUnannotated --outFilterType BySJout --quantMode GeneCounts --outFilterMultimapNmax 1 --outSJfilterCountUniqueMin 8 3 3 3 mv ${fastqBaseName}.Aligned.sortedByCoord.out.bam ${fastqBaseName}.bam samtools index ${fastqBaseName}.bam """.trimIndent() // todo provide proper walltime here // slurm memory limit https://rc.fas.harvard.edu/resources/documentation/slurm-memory/ // sacct -o MaxRSS -j JOBID jl.run(JobConfiguration(cmd, "star__${fastqBaseName}", "5:00:00", "", 5, "--mem 40000", better.files.File(File(".").toPath()))) } jl.waitUntilDone(1000) //val requiresRerun: List<Job>? = jl.requiresRerun(). //if (requiresRerun.istEmpty()) throw RuntimeException("jobs failed: \n" +jl.printStatus()) // http://lampwww.epfl.ch/~michelou/scala/using-scala-from-java.html joblist.`package$`.`MODULE$`.ImplJobListUtils(jl).createHtmlReport() //// estimate expresssion with http://bioinf.wehi.edu.au/featureCounts/ ////Summarize multiple datasets at the same time: //#featureCounts - t exon -g gene_id -a annotation.gtf -o counts.txt library1.bam library2.bam library3.bam // //// FeatureCounts --> Commented out because now part of STAR //#bamFile = control_3.bam //#featureCounts - t exon -g gene_id -a ${ gtfFile } - o feature_counts.txt ${ bamFile } - T 5 //#mysub "${project}__feature_counts" "featureCounts -t exon -g gene_id -a ${gtfFile} -o gene_counts.txt $(ls *bam) -T 5"-q medium | blockScript // // cleanup evalBash("rm -rf *STARgenome *.Log.progress.out _STARtmp *.SJ.out.tab *Log.out") // Do reporting and perform bam correlation (but don't wait for the results) evalBash(""" rend.R -e ${'$'}NGS_TOOLS/dge_workflow/star_qc.R . # Condense counts into matrix dge_star_counts2matrix # http://superuser.com/questions/178587/how-do-i-detach-a-process-from-terminal-entirely # disabled since it does not work for functions (see http://stackoverflow.com/questions/16435629/linux-shell-script-call-a-function-by-nohup #nohup dge_bam_correlate . & #dge_bam_correlate . & """)