star_align.kts 8.31 KB
Newer Older
1
#!/usr/bin/env kscript
2
//DEPS de.mpicbg.scicomp.joblist:joblist-kotlin:1.1 de.mpicbg.scicomp:kutils:0.7
3

4 5
// to create a development setup just run
// kscript --idea ${NGS_TOOLS}/dge_workflow/star_align.kts
6 7 8 9 10

// add docopts to local m2 index
// git clone https://github.com/docopt/docopt.java  && cd docopt.java && mvn clean package install  -Dmaven.test.skip=true
// kotlinc-jvm -classpath $(mvncp org.docopt:docopt:0.6.0-SNAPSHOT)

Holger Brandl's avatar
Holger Brandl committed
11
import de.mpicbg.scicomp.kutils.ShellUtils.mailme
Holger Brandl's avatar
Holger Brandl committed
12
import de.mpicbg.scicomp.kutils.evalBash
13 14
import de.mpicbg.scicomp.kutils.joblist.JobConfig
import de.mpicbg.scicomp.kutils.joblist.createHtmlReport
15 16 17
import joblist.JobList
import org.docopt.Docopt
import java.io.File
18
import java.nio.file.Files
19
import kotlin.system.exitProcess
20 21

//val args = listOf("/projects/bioinfo/igenomes/Drosophila_melanogaster/Ensembl/BDGP6", "test.fastq.gz")
22
//val args = listOf("--disable-pe-pairing", "/projects/bioinfo/igenomes/Drosophila_melanogaster/Ensembl/BDGP6", "test.fastq.gz")
23
//val args = listOf("--star-params", "", "/projects/bioinfo/igenomes/Drosophila_melanogaster/Ensembl/BDGP6", "test.fastq.gz")
24 25 26 27 28 29 30 31


// basic usage tutorial
// http://www.homolog.us/blogs/blog/2012/11/02/star-really-kick-ass-rna-seq-aligner/
// https://www.biostars.org/p/91020/

val usage = """
Use star to align fastq files against a genome
32
Usage: star_align.kts [options] <igenome> <fastq_files>...
33 34

Options:
35 36 37
 --gtf <gtfFile>        Custom gtf file instead of igenome bundled copy
 --pc-only              Use protein coding genes only for mapping and quantification
 --disable-pe-pairing   Disable auto-detection and pairing of paired end tracks
38 39
"""

40
//--star-params <extras> Extra parameters passed on to STAR
41 42

val doArgs = Docopt(usage).parse(args.toList()).map {
Holger Brandl's avatar
Holger Brandl committed
43
    it.key.removePrefix("--").replace("[<>]".toRegex(), "") to it.value?.toString()
44 45
}.toMap()

46

Holger Brandl's avatar
Holger Brandl committed
47 48
//println(doArgs.keys.joinToString())
//println(doArgs.values.joinToString())
49 50

// extract all configuration parameters
51
val fastqFiles = (doArgs["fastq_files"] as String).substring(1).substringBefore("]").split(", ").map { File(it) }
52 53 54
val igenome = File(doArgs["igenome"])
val star_index = File(igenome, "Sequence/StarIndex")
val gtfFile = if (doArgs["gtfFile"] != null) File(doArgs["gtfFile"]) else File(igenome, "Annotation/Genes/genes.gtf")
55
val pePairingEnabled = !doArgs["disable-pe-pairing"]!!.toBoolean()
56

57 58 59 60
// does not work because of https://github.com/holgerbrandl/kscript/issues/98
//val extraStarParams = doArgs["star-params"] ?:
//    "--outSAMstrandField intronMotif --outFilterIntronMotifs RemoveNoncanonicalUnannotated --outFilterType BySJout --outFilterMultimapNmax 1 --outSJfilterCountUniqueMin 8 3 3 3"
val extraStarParams = System.getenv("STAR_PARAMS").takeIf { it!=null } ?:
61
    "--outSAMstrandField intronMotif --outFilterIntronMotifs RemoveNoncanonicalUnannotated --outFilterType BySJout --outFilterMultimapNmax 1 --outReadsUnmapped Fastx --outSJfilterCountUniqueMin 8 3 3 3"
62 63 64



Holger Brandl's avatar
Holger Brandl committed
65
println("validating system requirements...")
66

67
// make sure that STAR and samtools are in the PATH
68
if (evalBash("which STAR").exitCode != 0) throw IllegalArgumentException("STAR aligner is not in PATH")
Holger Brandl's avatar
Holger Brandl committed
69
if (evalBash("which samtools").exitCode != 0) throw IllegalArgumentException("samtools are not in PATH")
Holger Brandl's avatar
Holger Brandl committed
70

Holger Brandl's avatar
Holger Brandl committed
71
System.getenv("NGS_TOOLS").let {
72 73 74
    require(it.isNotBlank() && File(it).isDirectory && File(it, "/dge_workflow/star_qc.R").isFile) {
        "NGS_TOOLS is not defined but needed to locate star_qc.R"
    }
Holger Brandl's avatar
Holger Brandl committed
75 76
}

77
require(evalBash("type dge_star_counts2matrix").exitCode == 0) { "dge_star_counts2matrix is missing" }
Holger Brandl's avatar
Holger Brandl committed
78 79

println("validating inputs...")
Holger Brandl's avatar
Holger Brandl committed
80 81 82

// check if gtf file exists
if (!gtfFile.isFile()) throw IllegalArgumentException("gtf '${gtfFile}' does not exist")
83 84 85 86 87 88

// Check if STAR index is not present
if (!File("${star_index}/SA").isFile())
    throw IllegalArgumentException("Missing STAR for ${star_index}/SA; use 'dge_create_star_index ${igenome}' to create it")
//   dge_create_star_index ${igenome}'

Holger Brandl's avatar
Holger Brandl committed
89

90 91
//println("files are ${fastqFiles.joinToString(", ")}")
// make sure all fastq files do exist (for symlinks see http://www.java2s.com/Code/Java/JDK-7/Isfileasymboliclink.htm
Holger Brandl's avatar
Holger Brandl committed
92 93
//fastqFiles.filter { !(it.exists()  || Files.isSymbolicLink(it.toPath())) }.let {
fastqFiles.filter { !it.isFile }.let {
94 95 96
    require(it.isEmpty()) { "Some fastq files do not exist  ${it.map { it.absoluteFile }.joinToString(", ")}" }
}

97
require(fastqFiles.filter { it.name.contains("_R2.fastq") }.isEmpty()) {
98 99 100 101
    """Mapping 2nd mate alone is unlikely to have meaningful semantics.
     Just provide _1 and star_align.kts will pick up corresponding _2 for PE alignment if present"""
}

102
println("Running STAR using igenome '${igenome}' for the following files:\n ${fastqFiles.joinToString("\n")}")
103

Holger Brandl's avatar
Holger Brandl committed
104
val jl = JobList(".starjobs").apply { reset() }
105

106 107 108 109 110

fun detectReverseReads(fastqFile: File): File? {
    if (!fastqFile.name.contains("_R1.fastq")) return null

    val revReadsCandidate = fastqFile
111 112 113
        .absoluteFile
        .parentFile
        .resolve(fastqFile.name.replace("_R1.fastq", "_R2.fastq"))
114 115

    //    return revReadsCandidate.takeIf { it.exists() }
116
    return if (revReadsCandidate.exists()) revReadsCandidate else null
117 118 119
}


120 121 122
for (fastqFile in fastqFiles) {
    println("submitting STAR job for $fastqFile")

123 124
    var fastqBaseName = fastqFile.name.removeSuffix(".gz").removeSuffix(".fastq")

125 126 127 128 129 130 131 132 133
    // uniquely mapping reads only:   http:/seqanswers.com/forums/showthread.php?s=93da11109ae12a773a128a637d69defe&t=3464
    //#     tophat -p6  -G $gtfFile -g1 -o test $bowtie_gindex $fastqFile

    // params:
    // --outSAMstrandField intronMotif is required for cuffdiff compatiblity (xs flag)
    // --outFilterIntronMotifs RemoveNoncanonicalUnannotated --outFilterType BySJout to get rid of artifical junctions
    // --quantMode GeneCounts see https://groups.google.com/forum/#!searchin/rna-star/GeneCounts/rna-star/gZRJx3ElRNo/p5FjBYKuY00J
    // --outSJfilterCountUniqueMin see https://groups.google.com/forum/#!topic/rna-star/_1BeAlGUmpA

134 135 136
    // detect if pe-pairing is enabled and reverse reads are supplied
    val revReads = if (pePairingEnabled) detectReverseReads(fastqFile)?.path ?: "" else ""
    if (revReads.isNotBlank()) fastqBaseName = fastqBaseName.removeSuffix("_R1")
137 138


139
    val optionalZcat = if (fastqFile.name.endsWith("gz")) "--readFilesCommand zcat" else ""
140

Holger Brandl's avatar
Holger Brandl committed
141 142 143
    // todo consider to use --outTmpDir which defaults to outFileNamePrefix STARtmp and which is deleted automatically
    // or use process substiutation in case of zipped reads (see https://github.com/alexdobin/STAR/issues/143#issuecomment-216597465)

144

145
    val cmd = """
146
    STAR --genomeDir ${star_index} --readFilesIn ${fastqFile} ${revReads} --runThreadN 6 ${optionalZcat} --outFileNamePrefix ${fastqBaseName}. --outSAMtype BAM SortedByCoordinate --sjdbGTFfile ${gtfFile} --quantMode GeneCounts ${extraStarParams}
147

148
    sleep 5 # to avoid filsystem issues on network volumnes
149 150 151 152
    mv ${fastqBaseName}.Aligned.sortedByCoord.out.bam ${fastqBaseName}.bam
    samtools index ${fastqBaseName}.bam
    """.trimIndent()

153 154
    // slurm memory limit https://rc.fas.harvard.edu/resources/documentation/slurm-memory/
    // sacct -o MaxRSS -j JOBID
Holger Brandl's avatar
Holger Brandl committed
155
    jl.run(JobConfig(cmd, "star__${fastqBaseName}", "10:00", "", 7, 40000))
156 157 158 159 160 161 162 163 164
}


jl.waitUntilDone(1000)
//val requiresRerun: List<Job>? = jl.requiresRerun().
//if (requiresRerun.istEmpty()) throw RuntimeException("jobs failed: \n" +jl.printStatus())


// http://lampwww.epfl.ch/~michelou/scala/using-scala-from-java.html
165
jl.createHtmlReport()
166

167
if (!jl.isComplete) {
Holger Brandl's avatar
Holger Brandl committed
168
    mailme("star_align.kts failed in ${File(".").absolutePath}", jl.status().toString())
169
    exitProcess(1)
Holger Brandl's avatar
Holger Brandl committed
170
}
171 172 173 174 175 176 177 178 179 180 181 182 183

//// estimate expresssion with http://bioinf.wehi.edu.au/featureCounts/
////Summarize multiple datasets at the same time:
//#featureCounts - t exon -g gene_id -a annotation.gtf -o counts.txt library1.bam library2.bam library3.bam
//
//// FeatureCounts --> Commented out because now part of STAR
//#bamFile = control_3.bam
//#featureCounts - t exon -g gene_id -a ${ gtfFile } - o feature_counts.txt ${ bamFile } - T 5
//#mysub "${project}__feature_counts" "featureCounts -t exon -g gene_id -a ${gtfFile} -o gene_counts.txt $(ls *bam) -T 5"-q medium | blockScript
//


// cleanup
184
evalBash("rm -rf *STARgenome *.Log.progress.out *_STARtmp *.SJ.out.tab *Log.out")
185

186

Holger Brandl's avatar
Holger Brandl committed
187
// Do reporting and perform bam correlation (but don't wait for the results)
188
evalBash("""
Holger Brandl's avatar
Holger Brandl committed
189
rend.R -e ${'$'}NGS_TOOLS/dge_workflow/star_qc.R .
190

Holger Brandl's avatar
Holger Brandl committed
191 192 193
# Condense counts into matrix
dge_star_counts2matrix
""")
194