diff --git a/dge_workflow/star_align.kts b/dge_workflow/star_align.kts new file mode 100755 index 0000000000000000000000000000000000000000..dc741048498f1bcd204c189d826d52c3cf79944b --- /dev/null +++ b/dge_workflow/star_align.kts @@ -0,0 +1,125 @@ +#!/usr/bin/env kscript +//DEPS org.docopt:docopt:0.6.0-SNAPSHOT de.mpicbg.scicomp:joblist:0.6-SNAPSHOT de.mpicbg.scicomp.bioinfo:kutils:0.1-SNAPSHOT + +// add docopts to local m2 index +// git clone https://github.com/docopt/docopt.java && cd docopt.java && mvn clean package install -Dmaven.test.skip=true +// kotlinc-jvm -classpath $(mvncp org.docopt:docopt:0.6.0-SNAPSHOT) + +// update joblist +// cd ~/mnt/mack/bioinfo/holger/misc/Dropbox/cluster_sync/joblist/ + + +import joblist.JobConfiguration +import joblist.JobList +import kutils.bashEval +import org.docopt.Docopt +import java.io.File +import java.util.* + +//val args = listOf("/projects/bioinfo/igenomes/Drosophila_melanogaster/Ensembl/BDGP6", "test.fastq.gz") + + +// basic usage tutorial +// http://www.homolog.us/blogs/blog/2012/11/02/star-really-kick-ass-rna-seq-aligner/ + + +// https://www.biostars.org/p/91020/ + +val usage = """ +Use star to align fastq files against a genome +Usage: star_align.sh [options] <igenome> <fastq_files>... + +Options: +--gtf <gtfFile> Custom gtf file instead of igenome bundled copy +--pc-only Use protein coding genes only for mapping and quantification +""" + + +val doArgs = Docopt(usage).parse(args.toList()).map { + it.key.removePrefix("--").replace("[<>]", "") to it.value?.toString() +}.toMap() + + +println(Objects.toString(1)) +println(Objects.nonNull(true)) + + +// extract all configuration parameters +val fastqFiles = (doArgs["fastq_files"] as String).split(" ").map { File(it) } +val igenome = File(doArgs["igenome"]) +val star_index = File(igenome, "Sequence/StarIndex") +val gtfFile = if (doArgs["gtfFile"] != null) File(doArgs["gtfFile"]) else File(igenome, "Annotation/Genes/genes.gtf") + +// check if gtf file exists +if (!gtfFile.isFile()) throw IllegalArgumentException("gtf '${gtfFile}' does not exist") + +// make sure that STAR is in the PATH +if (bashEval("which STAR").exitCode != 0) throw IllegalArgumentException("gtf '${gtfFile}' does not exist") + +// Check if STAR index is not present +if (!File("${star_index}/SA").isFile()) + throw IllegalArgumentException("Missing STAR for ${star_index}/SA; use 'dge_create_star_index ${igenome}' to create it") +// dge_create_star_index ${igenome}' + +println("running STAR using igenome '${igenome}' for the following files") + +val jl = JobList(".starjobs") +//rm .starjobs + +//fastqFiles=$(ls $baseDir/treps_pooled/*fastq.gz) + +for (fastqFile in fastqFiles) { + println("submitting STAR job for $fastqFile") + + // uniquely mapping reads only: http:/seqanswers.com/forums/showthread.php?s=93da11109ae12a773a128a637d69defe&t=3464 + //# tophat -p6 -G $gtfFile -g1 -o test $bowtie_gindex $fastqFile + + // params: + // --outSAMstrandField intronMotif is required for cuffdiff compatiblity (xs flag) + // --outFilterIntronMotifs RemoveNoncanonicalUnannotated --outFilterType BySJout to get rid of artifical junctions + // --quantMode GeneCounts see https://groups.google.com/forum/#!searchin/rna-star/GeneCounts/rna-star/gZRJx3ElRNo/p5FjBYKuY00J + // --outSJfilterCountUniqueMin see https://groups.google.com/forum/#!topic/rna-star/_1BeAlGUmpA + + val fastqBaseName = fastqFile.name.replace(".fastq.gz", "") + + val cmd = """ + STAR --genomeDir $star_index --readFilesIn $fastqFile --runThreadN 6 --readFilesCommand zcat --outFileNamePrefix ${fastqBaseName}. --outSAMtype BAM SortedByCoordinate --outSAMstrandField intronMotif --sjdbGTFfile $gtfFile --outFilterIntronMotifs RemoveNoncanonicalUnannotated --outFilterType BySJout --quantMode GeneCounts --outFilterMultimapNmax 1 --outSJfilterCountUniqueMin 8 3 3 3 + mv ${fastqBaseName}.Aligned.sortedByCoord.out.bam ${fastqBaseName}.bam + samtools index ${fastqBaseName}.bam + """.trimIndent() + + jl.run(JobConfiguration(cmd, "star__${fastqBaseName}", null, "medium", 5, "", better.files.File(File(".").toPath()))) +} + + +jl.waitUntilDone(1000) +//val requiresRerun: List<Job>? = jl.requiresRerun(). +//if (requiresRerun.istEmpty()) throw RuntimeException("jobs failed: \n" +jl.printStatus()) + + +// http://lampwww.epfl.ch/~michelou/scala/using-scala-from-java.html +joblist.`package$`.`MODULE$`.ImplJobListUtils(jl).createHtmlReport() + + +//// estimate expresssion with http://bioinf.wehi.edu.au/featureCounts/ +////Summarize multiple datasets at the same time: +//#featureCounts - t exon -g gene_id -a annotation.gtf -o counts.txt library1.bam library2.bam library3.bam +// +//// FeatureCounts --> Commented out because now part of STAR +//#bamFile = control_3.bam +//#featureCounts - t exon -g gene_id -a ${ gtfFile } - o feature_counts.txt ${ bamFile } - T 5 +//#mysub "${project}__feature_counts" "featureCounts -t exon -g gene_id -a ${gtfFile} -o gene_counts.txt $(ls *bam) -T 5"-q medium | blockScript +// + + +// cleanup +bashEval("""rm - rf * STARgenome * .Log.progress.out _STARtmp * .SJ.out.tab *Log.out""") + +// Test for bam correlation (but don't wait for the results) +bashEval(""" +rend.R - e ${ System.getenv("NGS_TOOLS") } / dge_workflow / star_qc.R. +dge_bam_correlate & +""") + +// Condense counts into matrix +bashEval("""dge_star_counts2matrix""") diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..dd9e5972e57613c365360c433daefb18e72ca89b --- /dev/null +++ b/pom.xml @@ -0,0 +1,29 @@ +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <artifactId>ngs_tools</artifactId> + <groupId>de.mpicbg.scicomp.bioinfo</groupId> + <version>1.0-SNAPSHOT</version> + + + <dependencies> + <dependency> + <groupId>org.docopt</groupId> + <artifactId>docopt</artifactId> + <version>0.6.0-SNAPSHOT</version> + </dependency> + + <dependency> + <groupId>de.mpicbg.scicomp</groupId> + <artifactId>joblist_2.11</artifactId> + <version>0.6-SNAPSHOT</version> + </dependency> + + <dependency> + <groupId>de.mpicbg.scicomp.bioinfo</groupId> + <artifactId>kutils</artifactId> + <version>0.1-SNAPSHOT</version> + </dependency> + </dependencies> +</project> \ No newline at end of file