star_align.kts 7.66 KB
Newer Older
Holger Brandl's avatar
Holger Brandl committed
1
#!/usr/bin/env kscript
2
//DEPS de.mpicbg.scicomp.joblist:joblist-kotlin:1.1 de.mpicbg.scicomp:kutils:0.7
Holger Brandl's avatar
Holger Brandl committed
3

Holger Brandl's avatar
Holger Brandl committed
4
5
6
7
8

// add docopts to local m2 index
// git clone https://github.com/docopt/docopt.java  && cd docopt.java && mvn clean package install  -Dmaven.test.skip=true
// kotlinc-jvm -classpath $(mvncp org.docopt:docopt:0.6.0-SNAPSHOT)

Holger Brandl's avatar
Holger Brandl committed
9
import de.mpicbg.scicomp.kutils.evalBash
10
11
import de.mpicbg.scicomp.kutils.joblist.JobConfig
import de.mpicbg.scicomp.kutils.joblist.createHtmlReport
Holger Brandl's avatar
Holger Brandl committed
12
13
14
import joblist.JobList
import org.docopt.Docopt
import java.io.File
15
import java.nio.file.Files
Holger Brandl's avatar
Holger Brandl committed
16
17

//val args = listOf("/projects/bioinfo/igenomes/Drosophila_melanogaster/Ensembl/BDGP6", "test.fastq.gz")
Holger Brandl's avatar
Holger Brandl committed
18
//val args = listOf("--disable-pe-pairing", "/projects/bioinfo/igenomes/Drosophila_melanogaster/Ensembl/BDGP6", "test.fastq.gz")
Holger Brandl's avatar
Holger Brandl committed
19
20
21
22
23
24
25
26


// basic usage tutorial
// http://www.homolog.us/blogs/blog/2012/11/02/star-really-kick-ass-rna-seq-aligner/
// https://www.biostars.org/p/91020/

val usage = """
Use star to align fastq files against a genome
27
Usage: star_align.kts [options] <igenome> <fastq_files>...
Holger Brandl's avatar
Holger Brandl committed
28
29

Options:
Holger Brandl's avatar
Holger Brandl committed
30
31
32
 --gtf <gtfFile>        Custom gtf file instead of igenome bundled copy
 --pc-only              Use protein coding genes only for mapping and quantification
 --disable-pe-pairing   Disable auto-detection and pairing of paired end tracks
Holger Brandl's avatar
Holger Brandl committed
33
34
35
36
"""


val doArgs = Docopt(usage).parse(args.toList()).map {
Holger Brandl's avatar
Holger Brandl committed
37
    it.key.removePrefix("--").replace("[<>]".toRegex(), "") to it.value?.toString()
Holger Brandl's avatar
Holger Brandl committed
38
39
}.toMap()

Holger Brandl's avatar
Holger Brandl committed
40

Holger Brandl's avatar
Holger Brandl committed
41
42
//println(doArgs.keys.joinToString())
//println(doArgs.values.joinToString())
Holger Brandl's avatar
Holger Brandl committed
43
44

// extract all configuration parameters
Holger Brandl's avatar
Holger Brandl committed
45
val fastqFiles = (doArgs["fastq_files"] as String).substring(1).substringBefore("]").split(", ").map { File(it) }
Holger Brandl's avatar
Holger Brandl committed
46
47
48
val igenome = File(doArgs["igenome"])
val star_index = File(igenome, "Sequence/StarIndex")
val gtfFile = if (doArgs["gtfFile"] != null) File(doArgs["gtfFile"]) else File(igenome, "Annotation/Genes/genes.gtf")
Holger Brandl's avatar
Holger Brandl committed
49
val pePairingEnabled = !doArgs["disable-pe-pairing"]!!.toBoolean()
Holger Brandl's avatar
Holger Brandl committed
50

Holger Brandl's avatar
Holger Brandl committed
51
println("validating system requirements...")
Holger Brandl's avatar
Holger Brandl committed
52
53

// make sure that STAR is in the PATH
54
if (evalBash("which STAR").exitCode != 0) throw IllegalArgumentException("STAR aligner is not in PATH")
Holger Brandl's avatar
Holger Brandl committed
55

Holger Brandl's avatar
Holger Brandl committed
56
57
58
System.getenv("NGS_TOOLS").let {
    require(it.isNotBlank() && File(it).isDirectory) { "NGS_TOOLS is not defined but needed to locate star_qc.R" }
}
59
60
require(evalBash("type dge_star_counts2matrix").exitCode == 0) { "dge_star_counts2matrix is missing" }
require(evalBash("type dge_bam_correlate").exitCode == 0) { "dge_bam_correlate is missing" }
Holger Brandl's avatar
Holger Brandl committed
61
62
63


println("validating inputs...")
Holger Brandl's avatar
Holger Brandl committed
64
65
66

// check if gtf file exists
if (!gtfFile.isFile()) throw IllegalArgumentException("gtf '${gtfFile}' does not exist")
Holger Brandl's avatar
Holger Brandl committed
67
68
69
70
71
72

// Check if STAR index is not present
if (!File("${star_index}/SA").isFile())
    throw IllegalArgumentException("Missing STAR for ${star_index}/SA; use 'dge_create_star_index ${igenome}' to create it")
//   dge_create_star_index ${igenome}'

Holger Brandl's avatar
Holger Brandl committed
73

Holger Brandl's avatar
Holger Brandl committed
74
75
//println("files are ${fastqFiles.joinToString(", ")}")
// make sure all fastq files do exist (for symlinks see http://www.java2s.com/Code/Java/JDK-7/Isfileasymboliclink.htm
76
77
//fastqFiles.filter { !it.isFile }.let {
fastqFiles.filter { !(it.exists()  || Files.isSymbolicLink(it.toPath())) }.let {
78
79
80
    require(it.isEmpty()) { "Some fastq files do not exist  ${it.map { it.absoluteFile }.joinToString(", ")}" }
}

81
require(fastqFiles.filter { it.name.contains("_R2.fastq") }.isEmpty()) {
82
83
84
85
    """Mapping 2nd mate alone is unlikely to have meaningful semantics.
     Just provide _1 and star_align.kts will pick up corresponding _2 for PE alignment if present"""
}

86
println("Running STAR using igenome '${igenome}' for the following files:\n ${fastqFiles.joinToString("\n")}")
Holger Brandl's avatar
Holger Brandl committed
87

Holger Brandl's avatar
Holger Brandl committed
88
val jl = JobList(".starjobs").apply { reset() }
Holger Brandl's avatar
Holger Brandl committed
89

Holger Brandl's avatar
Holger Brandl committed
90
91
92
93
94
95
96
97
98
99
100
101
102
103

fun detectReverseReads(fastqFile: File): File? {
    if (!fastqFile.name.contains("_R1.fastq")) return null

    val revReadsCandidate = fastqFile
            .absoluteFile
            .parentFile
            .resolve(fastqFile.name.replace("_R1.fastq", "_R2.fastq"))

    //    return revReadsCandidate.takeIf { it.exists() }
    return if(revReadsCandidate.exists())revReadsCandidate else null
}


Holger Brandl's avatar
Holger Brandl committed
104
105
106
for (fastqFile in fastqFiles) {
    println("submitting STAR job for $fastqFile")

Holger Brandl's avatar
Holger Brandl committed
107
108
    var fastqBaseName = fastqFile.name.removeSuffix(".gz").removeSuffix(".fastq")

Holger Brandl's avatar
Holger Brandl committed
109
110
111
112
113
114
115
116
117
    // uniquely mapping reads only:   http:/seqanswers.com/forums/showthread.php?s=93da11109ae12a773a128a637d69defe&t=3464
    //#     tophat -p6  -G $gtfFile -g1 -o test $bowtie_gindex $fastqFile

    // params:
    // --outSAMstrandField intronMotif is required for cuffdiff compatiblity (xs flag)
    // --outFilterIntronMotifs RemoveNoncanonicalUnannotated --outFilterType BySJout to get rid of artifical junctions
    // --quantMode GeneCounts see https://groups.google.com/forum/#!searchin/rna-star/GeneCounts/rna-star/gZRJx3ElRNo/p5FjBYKuY00J
    // --outSJfilterCountUniqueMin see https://groups.google.com/forum/#!topic/rna-star/_1BeAlGUmpA

Holger Brandl's avatar
Holger Brandl committed
118
119
120
    // detect if pe-pairing is enabled and reverse reads are supplied
    val revReads = if (pePairingEnabled) detectReverseReads(fastqFile)?.path ?: "" else ""
    if (revReads.isNotBlank()) fastqBaseName = fastqBaseName.removeSuffix("_R1")
121
122


123
    val optionalZcat = if (fastqFile.name.endsWith("gz")) "--readFilesCommand zcat" else ""
Holger Brandl's avatar
Holger Brandl committed
124

Holger Brandl's avatar
Holger Brandl committed
125
126
127
    // todo consider to use --outTmpDir which defaults to outFileNamePrefix STARtmp and which is deleted automatically
    // or use process substiutation in case of zipped reads (see https://github.com/alexdobin/STAR/issues/143#issuecomment-216597465)

128

Holger Brandl's avatar
Holger Brandl committed
129
    val cmd = """
130
131
    STAR --genomeDir ${star_index} --readFilesIn ${fastqFile} ${revReads} --runThreadN 6 ${optionalZcat} --outFileNamePrefix ${fastqBaseName}. --outSAMtype BAM SortedByCoordinate --outSAMstrandField intronMotif --sjdbGTFfile ${gtfFile} --outFilterIntronMotifs   RemoveNoncanonicalUnannotated --outFilterType BySJout --quantMode GeneCounts --outFilterMultimapNmax 1 --outSJfilterCountUniqueMin 8 3 3 3

Holger Brandl's avatar
Holger Brandl committed
132
133
134
135
    mv ${fastqBaseName}.Aligned.sortedByCoord.out.bam ${fastqBaseName}.bam
    samtools index ${fastqBaseName}.bam
    """.trimIndent()

Holger Brandl's avatar
Holger Brandl committed
136
137
    // slurm memory limit https://rc.fas.harvard.edu/resources/documentation/slurm-memory/
    // sacct -o MaxRSS -j JOBID
138
    jl.run(JobConfig(cmd, "star__${fastqBaseName}", "10:00", "", 5, 40000))
Holger Brandl's avatar
Holger Brandl committed
139
140
141
142
143
144
145
146
147
}


jl.waitUntilDone(1000)
//val requiresRerun: List<Job>? = jl.requiresRerun().
//if (requiresRerun.istEmpty()) throw RuntimeException("jobs failed: \n" +jl.printStatus())


// http://lampwww.epfl.ch/~michelou/scala/using-scala-from-java.html
148
jl.createHtmlReport()
Holger Brandl's avatar
Holger Brandl committed
149
150
151
152
153
154
155
156
157
158
159
160
161
162


//// estimate expresssion with http://bioinf.wehi.edu.au/featureCounts/
////Summarize multiple datasets at the same time:
//#featureCounts - t exon -g gene_id -a annotation.gtf -o counts.txt library1.bam library2.bam library3.bam
//
//// FeatureCounts --> Commented out because now part of STAR
//#bamFile = control_3.bam
//#featureCounts - t exon -g gene_id -a ${ gtfFile } - o feature_counts.txt ${ bamFile } - T 5
//#mysub "${project}__feature_counts" "featureCounts -t exon -g gene_id -a ${gtfFile} -o gene_counts.txt $(ls *bam) -T 5"-q medium | blockScript
//


// cleanup
Holger Brandl's avatar
Holger Brandl committed
163
evalBash("rm -rf *STARgenome *.Log.progress.out _STARtmp *.SJ.out.tab *Log.out")
164

Holger Brandl's avatar
Holger Brandl committed
165

Holger Brandl's avatar
Holger Brandl committed
166
// Do reporting and perform bam correlation (but don't wait for the results)
167
evalBash("""
Holger Brandl's avatar
Holger Brandl committed
168
rend.R -e ${'$'}NGS_TOOLS/dge_workflow/star_qc.R .
Holger Brandl's avatar
Holger Brandl committed
169

Holger Brandl's avatar
Holger Brandl committed
170
171
172
173
# Condense counts into matrix
dge_star_counts2matrix

# http://superuser.com/questions/178587/how-do-i-detach-a-process-from-terminal-entirely
Holger Brandl's avatar
Holger Brandl committed
174
175
# disabled since it does not work for functions (see http://stackoverflow.com/questions/16435629/linux-shell-script-call-a-function-by-nohup
#nohup dge_bam_correlate . &
176
177
178
## use jl instead to run via scheduler

## disabled because deeptools is not yet installed
Holger Brandl's avatar
Holger Brandl committed
179
dge_bam_correlate `ls *.bam`
180

Holger Brandl's avatar
Holger Brandl committed
181
#dge_bam_correlate . &
Holger Brandl's avatar
Holger Brandl committed
182
""")
Holger Brandl's avatar
Holger Brandl committed
183