Skip to content
Snippets Groups Projects
Commit 8be8257b authored by Melanie Schneider's avatar Melanie Schneider
Browse files

Merge branch 'master' of ssh://git-srv1/local/git/bioinformatics

parents c8946c90 d883b42a
No related branches found
No related tags found
No related merge requests found
......@@ -120,6 +120,9 @@ fastqFiles=$(ls $baseDir/trimmed/*.fastq.gz)
dge_tophat_se -i $igenome $fastqFiles 2>&1 | tee mapped.log
## also create bigwig files
# dge_bigwig ${igenome}/Sequence/Bowtie2Index/genome.fa.fai $(find . -name "*.bam" | grep -v unmapped | xargs echo) &
mailme "$project: mapping done"
......
......@@ -12,7 +12,6 @@ source <(curl https://raw.githubusercontent.com/holgerbrandl/datautils/v1.12/R/s
## todo tweak this to work on bioinfo as well
export PATH=/projects/bioinfo/holger/bin/bowtie2-2.2.2:$PATH
export PATH=/projects/bioinfo/holger/bin/tophat-2.0.13.Linux_x86_64:$PATH
export PATH=/home/brandl/bin/STAR/STAR-STAR_2.4.1d/source:$PATH
export PATH=/home/brandl/bin/cufflinks-2.2.1.Linux_x86_64:$PATH
export PATH=/sw/apps/python/current/bin:$PATH
export PATH=/home/brandl/bin/deepTools/bin:$PATH
......@@ -21,6 +20,9 @@ export PATH=/projects/bioinfo/holger/bin/bedtools-2.23.0/bin/:$PATH
export PATH=/home/brandl/bin/spinr:$PATH
export PATH=/home/brandl/bin/subread-1.4.6-p3-Linux-x86_64/bin:$PATH
#export PATH=/home/brandl/bin/STAR/STAR-STAR_2.4.1d/source:$PATH
export PATH=/home/brandl/bin/STAR/STAR_2.4.2a/source:$PATH
# which tophat; which bowtie2; which cuffdiff
......@@ -201,6 +203,46 @@ mailme "$project: bamcorrelate done in $(pwd)"
export -f dge_bam_correlate
dge_bigwig(){
usage="Usage: dge_bigwig <genome_fai> [<bam_file>]+"
if [ $# -lt 2 ]; then
echo ${usage} >&2 ; return;
fi
#bamFiles=$(find . -name "*.bam" | grep -v unmapped | xargs echo)
genomeFai=$1
bamFiles="${@:2}"
if [ ! -f "${genomeFai}" ]; then
echo "could not find fai index $1! ${usage}" >&2 ; return;
fi
## add bigwig to PATH
export PATH=~/bin:${PATH}
if [ -z "$(which wigToBigWig 2>/dev/null)" ]; then
echo "could not find wigToBigWig in PATH! ${usage}" >&2 ; #return;
fi
## create big wig files
for bamFile in $bamFiles; do
sample=$(basename $bamFile .bam)
echo "converting $bamFile to bigwig format"
mysub "${project}__bw__${sample}" "genomeCoverageBed -split -bg -ibam $bamFile -g ${genomeFai} | wigToBigWig -clip stdin ${genomeFai} ${sample}.bw" -q short | joblist .bigwig
done
wait4jobs .bigwig
}
export -f dge_bigwig
## http://stackoverflow.com/questions/6916856/can-bash-show-a-functions-definition
#type dge_bigwig
## Merge technical replicatews
dge_merge_treps(){
......@@ -356,3 +398,67 @@ mailme "$project: cuffdiff done in $(pwd)"
}
export -f dge_cuffdiff
# Create a star index for a given igenome
dge_create_star_index(){
if [ $# -ne 1 ]; then
echo "Usage: dge_create_star_index <igenome>" >&2 ; return;
fi
igenome=$1
if [ ! -d "$igenome" ] | [ ! -d "${igenome}/Sequence" ]; then
echo "igenome directory '$igenome' does not exist" >&2 ; return;
fi
export star_index="${igenome}/Sequence/StarIndex"
## stop if index exists already
if [ -d "$star_index" ]; then
echo "Error: Index directory ${star_index} already exists." >&2 ; return;
fi
chmod +w $(dirname ${star_index})
mailme "${project}: creating STAR index in ${star_index}"
mkdir ${star_index}
cmd="STAR --runMode genomeGenerate --genomeDir ${star_index} --genomeFastaFiles ${igenome}/Sequence/WholeGenomeFasta/genome.fa --runThreadN 10"
#eval $cmd
#STAR --runMode genomeGenerate --genomeDir ${star_index} --genomeFastaFiles ${igenome}/Sequence/Chromosomes/*.fa --runThreadN 10
mysub "${project}_star_index" "$cmd" -n 5 -R span[hosts=1] -q medium | blockScript
## prevent further modification
chmod -w $(dirname ${star_index})
mailme "created star index for $igenome"
}
export -f dge_create_star_index
dge_star_counts2matrix(){
echo '
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.13/R/core_commons.R")
## STAR count file format is
#column 1: gene ID
#column 2: counts for unstranded RNA-seq
#column 3: counts for the 1st read strand aligned with RNA (htseq-count option -s yes)
exprCounts <- list.files(".", "ReadsPerGene.out.tab") %>% ldply(function(countFile){
read.delim(countFile, header=F) %>%
select(V1, V2) %>%
set_names("gene_id", "num_alignments") %>%
filter(!str_detect(gene_id, "^N_")) %>%
mutate(sample=trim_ext(countFile, ".ReadsPerGene.out.tab"))
}, .progress="text")
countMatrix <- spread(exprCounts, sample, num_alignments)
write.delim(countMatrix, "star_count_matrix.txt")
' | R --vanilla -q
}
export -f dge_star_counts2matrix
......@@ -23,12 +23,6 @@ eval "$(echo "$usage" | ~/bin/docopts/docopts -h - : "$@")"
#eval "$(echo "$usage" | ~/bin/docopts/docopts -h - : /projects/bioinfo/igenomes/Mus_musculus/Ensembl/GRCm38 /projects/bioinfo/holger/projects/florio_11b_2nd_batch/lanereps_pooled/arhgap11b_1.fastq.gz fastq2)"
for fastqFile in ${fastq_files[@]} ; do
echo processing $fastqFile
done
#
#echo $igenome
## extract all configuration parameters
fastqFiles=${fastq_files[@]}
......@@ -52,18 +46,8 @@ fi
## build index if not present
if [ ! -f "${star_index}/SA" ]; then
chmod +w $(dirname ${star_index})
mailme "${project}: creating STAR index for $igenome"
mkdir ${star_index}
cmd="STAR --runMode genomeGenerate --genomeDir ${star_index} --genomeFastaFiles ${igenome}/Sequence/WholeGenomeFasta/genome.fa --runThreadN 10"
#eval $cmd
#STAR --runMode genomeGenerate --genomeDir ${star_index} --genomeFastaFiles ${igenome}/Sequence/Chromosomes/*.fa --runThreadN 10
mysub "${project}_star_index" "$cmd" -n 5 -R span[hosts=1] -q medium | blockScript
## prevent modification
chmod -w $(dirname ${star_index})
echo "Missing STAR for ${star_index}/SA; use 'dge_create_star_index ${igenome}' to create it"; exit 1;
# dge_create_star_index ${igenome}'
fi
......@@ -73,7 +57,7 @@ echo "running STAR using igenome '${igenome}' for the following files"
#fastqFiles=$(ls $baseDir/treps_pooled/*fastq.gz)
for fastqFile in $fastqFiles ; do
echo "submitting tophat job for $fastqFile"
echo "submitting STAR job for $fastqFile"
# DEBUG fastqFile=/projects/bioinfo/holger/projects/florio_11b_2nd_batch/lanereps_pooled/arhgap11b_1.fastq.gz
fastqBaseName=$(basename ${fastqFile%%.fastq.gz})
......@@ -82,15 +66,19 @@ for fastqFile in $fastqFiles ; do
## uniquely mapping reads only: http:/seqanswers.com/forums/showthread.php?s=93da11109ae12a773a128a637d69defe&t=3464
### tophat -p6 -G $gtfFile -g1 -o test $bowtie_gindex $fastqFile
## note --outSAMstrandField intronMotif is required for cuffdiff compatiblity (xs flag)
## params:
# --outSAMstrandField intronMotif is required for cuffdiff compatiblity (xs flag)
# --outFilterIntronMotifs RemoveNoncanonicalUnannotated --outFilterType BySJout to get rid of artifical junctions
# --quantMode GeneCounts see https://groups.google.com/forum/#!searchin/rna-star/GeneCounts/rna-star/gZRJx3ElRNo/p5FjBYKuY00J
mysub "${project}__star__${fastqBaseName}" "
STAR --genomeDir $star_index --readFilesIn $fastqFile --runThreadN 6 --readFilesCommand zcat --outFileNamePrefix ${fastqBaseName}. --outSAMtype BAM SortedByCoordinate --outSAMstrandField intronMotif --sjdbGTFfile $gtfFile
STAR --genomeDir $star_index --readFilesIn $fastqFile --runThreadN 6 --readFilesCommand zcat --outFileNamePrefix ${fastqBaseName}. --outSAMtype BAM SortedByCoordinate --outSAMstrandField intronMotif --sjdbGTFfile $gtfFile --outFilterIntronMotifs RemoveNoncanonicalUnannotated --outFilterType BySJout --quantMode GeneCounts
mv ${fastqBaseName}.Aligned.sortedByCoord.out.bam ${fastqBaseName}.bam
samtools index ${fastqBaseName}.bam
" -n 5 -R span[hosts=1] -q short | joblist .tophatjobs
" -n 5 -R span[hosts=1] -q medium | joblist .starjobs
done
wait4jobs .tophatjobs
wait4jobs .starjobs
rm -rf *STARgenome *.Log.progress.out _STARtmp *.SJ.out.tab *Log.out
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment