Commit 53b1a7a1 authored by Holger Brandl's avatar Holger Brandl

started pe-aligner wrapper

parent aec15c2a
#!/usr/bin/env kscript
//DEPS org.docopt:docopt:0.6.0-SNAPSHOT de.mpicbg.scicomp:joblist:0.6-SNAPSHOT de.mpicbg.scicomp.bioinfo:kutils:0.1-SNAPSHOT
// add docopts to local m2 index
// git clone https://github.com/docopt/docopt.java && cd docopt.java && mvn clean package install -Dmaven.test.skip=true
// kotlinc-jvm -classpath $(mvncp org.docopt:docopt:0.6.0-SNAPSHOT)
// update joblist
// cd ~/mnt/mack/bioinfo/holger/misc/Dropbox/cluster_sync/joblist/
import joblist.JobConfiguration
import joblist.JobList
import kutils.bashEval
import org.docopt.Docopt
import java.io.File
import java.util.*
//val args = listOf("/projects/bioinfo/igenomes/Drosophila_melanogaster/Ensembl/BDGP6", "test.fastq.gz")
// basic usage tutorial
// http://www.homolog.us/blogs/blog/2012/11/02/star-really-kick-ass-rna-seq-aligner/
// https://www.biostars.org/p/91020/
val usage = """
Use star to align fastq files against a genome
Usage: star_align.sh [options] <igenome> <fastq_files>...
Options:
--gtf <gtfFile> Custom gtf file instead of igenome bundled copy
--pc-only Use protein coding genes only for mapping and quantification
"""
val doArgs = Docopt(usage).parse(args.toList()).map {
it.key.removePrefix("--").replace("[<>]", "") to it.value?.toString()
}.toMap()
println(Objects.toString(1))
println(Objects.nonNull(true))
// extract all configuration parameters
val fastqFiles = (doArgs["fastq_files"] as String).split(" ").map { File(it) }
val igenome = File(doArgs["igenome"])
val star_index = File(igenome, "Sequence/StarIndex")
val gtfFile = if (doArgs["gtfFile"] != null) File(doArgs["gtfFile"]) else File(igenome, "Annotation/Genes/genes.gtf")
// check if gtf file exists
if (!gtfFile.isFile()) throw IllegalArgumentException("gtf '${gtfFile}' does not exist")
// make sure that STAR is in the PATH
if (bashEval("which STAR").exitCode != 0) throw IllegalArgumentException("gtf '${gtfFile}' does not exist")
// Check if STAR index is not present
if (!File("${star_index}/SA").isFile())
throw IllegalArgumentException("Missing STAR for ${star_index}/SA; use 'dge_create_star_index ${igenome}' to create it")
// dge_create_star_index ${igenome}'
println("running STAR using igenome '${igenome}' for the following files")
val jl = JobList(".starjobs")
//rm .starjobs
//fastqFiles=$(ls $baseDir/treps_pooled/*fastq.gz)
for (fastqFile in fastqFiles) {
println("submitting STAR job for $fastqFile")
// uniquely mapping reads only: http:/seqanswers.com/forums/showthread.php?s=93da11109ae12a773a128a637d69defe&t=3464
//# tophat -p6 -G $gtfFile -g1 -o test $bowtie_gindex $fastqFile
// params:
// --outSAMstrandField intronMotif is required for cuffdiff compatiblity (xs flag)
// --outFilterIntronMotifs RemoveNoncanonicalUnannotated --outFilterType BySJout to get rid of artifical junctions
// --quantMode GeneCounts see https://groups.google.com/forum/#!searchin/rna-star/GeneCounts/rna-star/gZRJx3ElRNo/p5FjBYKuY00J
// --outSJfilterCountUniqueMin see https://groups.google.com/forum/#!topic/rna-star/_1BeAlGUmpA
val fastqBaseName = fastqFile.name.replace(".fastq.gz", "")
val cmd = """
STAR --genomeDir $star_index --readFilesIn $fastqFile --runThreadN 6 --readFilesCommand zcat --outFileNamePrefix ${fastqBaseName}. --outSAMtype BAM SortedByCoordinate --outSAMstrandField intronMotif --sjdbGTFfile $gtfFile --outFilterIntronMotifs RemoveNoncanonicalUnannotated --outFilterType BySJout --quantMode GeneCounts --outFilterMultimapNmax 1 --outSJfilterCountUniqueMin 8 3 3 3
mv ${fastqBaseName}.Aligned.sortedByCoord.out.bam ${fastqBaseName}.bam
samtools index ${fastqBaseName}.bam
""".trimIndent()
jl.run(JobConfiguration(cmd, "star__${fastqBaseName}", null, "medium", 5, "", better.files.File(File(".").toPath())))
}
jl.waitUntilDone(1000)
//val requiresRerun: List<Job>? = jl.requiresRerun().
//if (requiresRerun.istEmpty()) throw RuntimeException("jobs failed: \n" +jl.printStatus())
// http://lampwww.epfl.ch/~michelou/scala/using-scala-from-java.html
joblist.`package$`.`MODULE$`.ImplJobListUtils(jl).createHtmlReport()
//// estimate expresssion with http://bioinf.wehi.edu.au/featureCounts/
////Summarize multiple datasets at the same time:
//#featureCounts - t exon -g gene_id -a annotation.gtf -o counts.txt library1.bam library2.bam library3.bam
//
//// FeatureCounts --> Commented out because now part of STAR
//#bamFile = control_3.bam
//#featureCounts - t exon -g gene_id -a ${ gtfFile } - o feature_counts.txt ${ bamFile } - T 5
//#mysub "${project}__feature_counts" "featureCounts -t exon -g gene_id -a ${gtfFile} -o gene_counts.txt $(ls *bam) -T 5"-q medium | blockScript
//
// cleanup
bashEval("""rm - rf * STARgenome * .Log.progress.out _STARtmp * .SJ.out.tab *Log.out""")
// Test for bam correlation (but don't wait for the results)
bashEval("""
rend.R - e ${ System.getenv("NGS_TOOLS") } / dge_workflow / star_qc.R.
dge_bam_correlate &
""")
// Condense counts into matrix
bashEval("""dge_star_counts2matrix""")
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<artifactId>ngs_tools</artifactId>
<groupId>de.mpicbg.scicomp.bioinfo</groupId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>org.docopt</groupId>
<artifactId>docopt</artifactId>
<version>0.6.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>de.mpicbg.scicomp</groupId>
<artifactId>joblist_2.11</artifactId>
<version>0.6-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>de.mpicbg.scicomp.bioinfo</groupId>
<artifactId>kutils</artifactId>
<version>0.1-SNAPSHOT</version>
</dependency>
</dependencies>
</project>
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment