Merge branch 'master' of ssh://git-srv1/local/git/bioinformatics

8be8257b · Melanie Schneider · c8946c90 · d883b42a · 8be8257b · 8be8257b
Commit 8be8257b authored 9 years ago by Melanie Schneider
--- a/dge_workflow/dge_master_template.sh
+++ b/dge_workflow/dge_master_template.sh
@@ -120,6 +120,9 @@ fastqFiles=$(ls $baseDir/trimmed/*.fastq.gz)

 dge_tophat_se -i $igenome $fastqFiles 2>&1 | tee mapped.log

+## also create bigwig files
+# dge_bigwig ${igenome}/Sequence/Bowtie2Index/genome.fa.fai $(find . -name "*.bam" | grep -v unmapped |  xargs echo) &
+
 mailme "$project: mapping done"



--- a/dge_workflow/dge_utils.sh
+++ b/dge_workflow/dge_utils.sh
@@ -12,7 +12,6 @@ source <(curl https://raw.githubusercontent.com/holgerbrandl/datautils/v1.12/R/s
 ## todo tweak this to work on bioinfo as well
 export PATH=/projects/bioinfo/holger/bin/bowtie2-2.2.2:$PATH
 export PATH=/projects/bioinfo/holger/bin/tophat-2.0.13.Linux_x86_64:$PATH
-export PATH=/home/brandl/bin/STAR/STAR-STAR_2.4.1d/source:$PATH
 export PATH=/home/brandl/bin/cufflinks-2.2.1.Linux_x86_64:$PATH
 export PATH=/sw/apps/python/current/bin:$PATH
 export PATH=/home/brandl/bin/deepTools/bin:$PATH
@@ -21,6 +20,9 @@ export PATH=/projects/bioinfo/holger/bin/bedtools-2.23.0/bin/:$PATH
 export PATH=/home/brandl/bin/spinr:$PATH
 export PATH=/home/brandl/bin/subread-1.4.6-p3-Linux-x86_64/bin:$PATH

+#export PATH=/home/brandl/bin/STAR/STAR-STAR_2.4.1d/source:$PATH
+export PATH=/home/brandl/bin/STAR/STAR_2.4.2a/source:$PATH
+


 # which tophat;  which bowtie2; which cuffdiff
@@ -201,6 +203,46 @@ mailme "$project: bamcorrelate done in $(pwd)"
 export -f dge_bam_correlate


+dge_bigwig(){
+
+usage="Usage: dge_bigwig <genome_fai> [<bam_file>]+"
+
+
+if [ $# -lt 2 ]; then
+    echo ${usage} >&2 ; return;
+fi
+
+#bamFiles=$(find . -name "*.bam" | grep -v unmapped |  xargs echo)
+genomeFai=$1
+bamFiles="${@:2}"
+
+if [ ! -f "${genomeFai}" ]; then
+    echo "could not find fai index $1! ${usage}" >&2 ; return;
+fi
+
+## add bigwig to PATH
+export PATH=~/bin:${PATH}
+
+if [ -z "$(which wigToBigWig 2>/dev/null)" ]; then
+    echo "could not find wigToBigWig in PATH! ${usage}" >&2 ; #return;
+fi
+
+## create big wig files
+for bamFile in $bamFiles; do
+    sample=$(basename $bamFile .bam)
+    echo "converting $bamFile to bigwig format"
+    mysub "${project}__bw__${sample}" "genomeCoverageBed -split -bg -ibam $bamFile  -g ${genomeFai} |  wigToBigWig -clip stdin ${genomeFai} ${sample}.bw" -q short | joblist .bigwig
+done
+
+wait4jobs .bigwig
+
+}
+export -f dge_bigwig
+
+## http://stackoverflow.com/questions/6916856/can-bash-show-a-functions-definition
+#type dge_bigwig
+
+
 ## Merge technical replicatews
 dge_merge_treps(){

@@ -356,3 +398,67 @@ mailme "$project: cuffdiff done in $(pwd)"

 }
 export -f dge_cuffdiff
+
+
+# Create a star index for a given igenome
+dge_create_star_index(){
+
+    if [ $# -ne 1 ]; then
+        echo "Usage: dge_create_star_index <igenome>" >&2 ; return;
+    fi
+
+    igenome=$1
+
+    if [ ! -d "$igenome" ] | [ ! -d "${igenome}/Sequence" ]; then
+        echo "igenome directory '$igenome' does not exist" >&2 ; return;
+    fi
+
+    export star_index="${igenome}/Sequence/StarIndex"
+
+    ## stop if index exists already
+    if [ -d "$star_index" ]; then
+        echo "Error: Index directory ${star_index} already exists." >&2 ; return;
+    fi
+
+    chmod +w $(dirname ${star_index})
+
+    mailme "${project}: creating STAR index in ${star_index}"
+    mkdir ${star_index}
+
+    cmd="STAR --runMode genomeGenerate --genomeDir ${star_index} --genomeFastaFiles ${igenome}/Sequence/WholeGenomeFasta/genome.fa --runThreadN 10"
+    #eval $cmd
+    #STAR --runMode genomeGenerate --genomeDir ${star_index} --genomeFastaFiles ${igenome}/Sequence/Chromosomes/*.fa --runThreadN 10
+    mysub "${project}_star_index" "$cmd" -n 5 -R span[hosts=1] -q medium  | blockScript
+
+    ## prevent further modification
+    chmod -w $(dirname ${star_index})
+
+    mailme "created star index for $igenome"
+}
+export -f dge_create_star_index
+
+
+dge_star_counts2matrix(){
+echo '
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.13/R/core_commons.R")
+
+## STAR count file format is
+#column 1: gene ID
+#column 2: counts for unstranded RNA-seq
+#column 3: counts for the 1st read strand aligned with RNA (htseq-count option -s yes)
+
+exprCounts <- list.files(".", "ReadsPerGene.out.tab") %>% ldply(function(countFile){
+    read.delim(countFile, header=F) %>%
+        select(V1, V2) %>%
+        set_names("gene_id", "num_alignments") %>%
+        filter(!str_detect(gene_id, "^N_")) %>%
+        mutate(sample=trim_ext(countFile, ".ReadsPerGene.out.tab"))
+}, .progress="text")
+
+countMatrix <- spread(exprCounts, sample, num_alignments)
+
+write.delim(countMatrix, "star_count_matrix.txt")
+' | R --vanilla -q
+
+}
+export -f dge_star_counts2matrix
--- a/dge_workflow/star_align.sh
+++ b/dge_workflow/star_align.sh
@@ -23,12 +23,6 @@ eval "$(echo  "$usage" | ~/bin/docopts/docopts -h - : "$@")"

 #eval "$(echo "$usage" | ~/bin/docopts/docopts -h -  : /projects/bioinfo/igenomes/Mus_musculus/Ensembl/GRCm38 /projects/bioinfo/holger/projects/florio_11b_2nd_batch/lanereps_pooled/arhgap11b_1.fastq.gz fastq2)"

-for fastqFile in ${fastq_files[@]} ; do
-    echo processing $fastqFile
-done
-#
-#echo $igenome
-

 ## extract all configuration parameters
 fastqFiles=${fastq_files[@]}
@@ -52,18 +46,8 @@ fi

 ## build index if not present
 if [ ! -f "${star_index}/SA" ]; then
-chmod +w $(dirname ${star_index})
-
-mailme "${project}: creating STAR index for $igenome"
-mkdir ${star_index}
-
-cmd="STAR --runMode genomeGenerate --genomeDir ${star_index} --genomeFastaFiles ${igenome}/Sequence/WholeGenomeFasta/genome.fa --runThreadN 10"
-#eval $cmd
-#STAR --runMode genomeGenerate --genomeDir ${star_index} --genomeFastaFiles ${igenome}/Sequence/Chromosomes/*.fa --runThreadN 10
-mysub "${project}_star_index" "$cmd" -n 5 -R span[hosts=1] -q medium  | blockScript
-
-## prevent modification
-chmod -w $(dirname ${star_index})
+    echo "Missing STAR for ${star_index}/SA; use 'dge_create_star_index ${igenome}' to create it"; exit 1;
+#    dge_create_star_index ${igenome}'
 fi


@@ -73,7 +57,7 @@ echo "running STAR using igenome '${igenome}' for the following files"
 #fastqFiles=$(ls $baseDir/treps_pooled/*fastq.gz)

 for fastqFile in $fastqFiles ; do
-    echo "submitting tophat job for $fastqFile"
+    echo "submitting STAR job for $fastqFile"

    # DEBUG fastqFile=/projects/bioinfo/holger/projects/florio_11b_2nd_batch/lanereps_pooled/arhgap11b_1.fastq.gz
    fastqBaseName=$(basename ${fastqFile%%.fastq.gz})
@@ -82,15 +66,19 @@ for fastqFile in $fastqFiles ; do
    ## uniquely mapping reads only:   http:/seqanswers.com/forums/showthread.php?s=93da11109ae12a773a128a637d69defe&t=3464
    ###     tophat -p6  -G $gtfFile -g1 -o test $bowtie_gindex $fastqFile

-    ## note --outSAMstrandField intronMotif is required for cuffdiff compatiblity (xs flag)
+    ## params:
+    # --outSAMstrandField intronMotif is required for cuffdiff compatiblity (xs flag)
+    # --outFilterIntronMotifs RemoveNoncanonicalUnannotated --outFilterType BySJout to get rid of artifical junctions
+    # --quantMode GeneCounts see https://groups.google.com/forum/#!searchin/rna-star/GeneCounts/rna-star/gZRJx3ElRNo/p5FjBYKuY00J
+
    mysub "${project}__star__${fastqBaseName}" "
-    STAR --genomeDir $star_index --readFilesIn $fastqFile --runThreadN 6 --readFilesCommand zcat --outFileNamePrefix ${fastqBaseName}. --outSAMtype BAM SortedByCoordinate --outSAMstrandField intronMotif --sjdbGTFfile $gtfFile
+    STAR --genomeDir $star_index --readFilesIn $fastqFile --runThreadN 6 --readFilesCommand zcat --outFileNamePrefix ${fastqBaseName}. --outSAMtype BAM SortedByCoordinate --outSAMstrandField intronMotif --sjdbGTFfile $gtfFile --outFilterIntronMotifs RemoveNoncanonicalUnannotated --outFilterType BySJout --quantMode GeneCounts
    mv ${fastqBaseName}.Aligned.sortedByCoord.out.bam ${fastqBaseName}.bam
    samtools index ${fastqBaseName}.bam
-    " -n 5 -R span[hosts=1] -q short | joblist .tophatjobs
+    " -n 5 -R span[hosts=1] -q medium | joblist .starjobs
 done

-wait4jobs .tophatjobs
+wait4jobs .starjobs


 rm -rf *STARgenome *.Log.progress.out _STARtmp *.SJ.out.tab *Log.out