bioinfo_utils.sh 7.99 KB
Newer Older
Holger Brandl's avatar
Holger Brandl committed
1

2
## Tophat Mapping Report from the logs
Holger Brandl's avatar
Holger Brandl committed
3 4
TophatMappingReport(){
echo '
5 6
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.9/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.9/R/ggplot_commons.R")
7 8 9


parseAlgnSummary_T2_0_11 <- function(alignSummary){
10
    #alignSummary="/projects/bioinfo/brandl/projects/marta_rnaseq/human_leipzig/mapping/S5382_aRG_1b_rep1/align_summary.txt"
11 12 13 14 15
    algnData <- readLines(alignSummary)

    data.frame(
        condition=basename(dirname(alignSummary)),
        num_reads=as.numeric(str_match(algnData[2], " ([0-9]*$)")[,2]),
Holger Brandl's avatar
Holger Brandl committed
16 17
        mapped_reads=as.numeric(str_match(algnData[3], ":[ ]*([0-9]*) ")[,2][1])
    ) %>% transform(mapping_efficiency=100*mapped_reads/num_reads)
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
}


algnSummary <- ldply(list.files(".", "align_summary.txt", full.names=TRUE, recursive=T), parseAlgnSummary_T2_0_11, .progress="text")
write.delim(algnSummary, file="tophat_mapping_stats.txt")

scale_fill_discrete <- function (...){ scale_color_brewer(..., type = "seq", palette="Set1", "fill", na.value = "grey50") }


projectName=basename(dirname(getwd()))

devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/mdreport/master/R/mdreport-package.r")

md_new(paste("Mapping Summary for ", projectName))


md_plot(ggplot(algnSummary, aes(condition, mapping_efficiency)) + geom_bar(stat="identity") +coord_flip() + ylim(0,100) + ggtitle("mapping efficiency"))

md_plot(ggplot(algnSummary, aes(condition, num_reads)) + geom_bar(stat="identity") + coord_flip() + ggtitle("read counts") +scale_y_continuous(labels=comma))

md_plot(ggplot(algnSummary, aes(condition, mapped_reads)) + geom_bar(stat="identity") + coord_flip() + ggtitle("alignments counts") +scale_y_continuous(labels=comma))


#ggplot(melt(algnSummary), aes(condition, value)) + geom_bar(stat="identity") +facet_wrap(~variable, scales="free") + ggtitle("mapping summary") + scale_y_continuous(labels=comma)  + theme(axis.text.x=element_text(angle=90, hjust=0))
#ggsave2(w=10, h=10, p="mapstats")


md_report("tophat_mapping_report", open=F)
Holger Brandl's avatar
Holger Brandl committed
46 47
' | R -q --vanilla
}
Holger Brandl's avatar
Holger Brandl committed
48 49
export -f TophatMappingReport

Holger Brandl's avatar
Holger Brandl committed
50

Holger Brandl's avatar
Holger Brandl committed
51
## replaced with cs_bowtie_qc
Holger Brandl's avatar
Holger Brandl committed
52
#### Bowtie Mapping Report from the logs
Holger Brandl's avatar
Holger Brandl committed
53 54 55
#Bowtie2MappingReport(){
#
#echo '
56 57
#devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.9/R/core_commons.R")
#devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.9/R/ggplot_commons.R")
Holger Brandl's avatar
Holger Brandl committed
58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88
#
#logSuffix=".logs"
#parseAlgnSummary <- function(alignSummary){
#    #alignSummary="./H2Az_Rep1_Lane1_Lib4454.bowtie.log"
#    algnData <- readLines(alignSummary)
#
#    data.frame(
#        condition=trimEnd(basename(alignSummary), logSuffix),
#        num_reads=as.numeric(str_split_fixed(algnData[3], " ", 2)[1]),
#        unique_mappers=as.numeric(str_split_fixed(str_trim(algnData[6]), " ", 2)[1]),
#        mapping_efficiency=as.numeric(str_replace(str_split_fixed(algnData[8], " ", 2)[1], "%", "")),
#        multi_mappers=as.numeric(str_split_fixed(str_trim(algnData[7]), " ", 2)[1])
#    )
#}
#
#mapStats <- ldply(list.files(".", logSuffix, full.names=TRUE, recursive=T), parseAlgnSummary, .progress="text")
#write.delim(mapStats, file="mapStats.txt")
#
#ggplot(melt(mapStats), aes(condition, value)) + geom_bar(stat="identity") +facet_wrap(~variable, scales="free") + ggtitle("mapping summary") + scale_y_continuous(labels=comma)  + theme(axis.text.x=element_text(angle=90, hjust=0))
#ggsave2(w=10, h=10, p="mapstats")
#
#ggplot(mapStats, aes(condition, mapping_efficiency)) + geom_bar(stat="identity") +coord_flip() + ylim(0,100) + ggtitle("mapping efficiency")
#ggsave2(p="mapstats")
#ggplot(mapStats, aes(condition, num_reads)) + geom_bar(stat="identity") + coord_flip() + ggtitle("read counts")
#ggsave2(p="mapstats")
#
#ggplot(mapStats, aes(condition, unique_mappers)) + geom_bar(stat="identity") + coord_flip() + ggtitle("unique alignment") + scale_fill_discrete()
#ggsave2(p="mapstats")
#' | R --vanilla
#}
#export -f Bowtie2MappingReport
Holger Brandl's avatar
Holger Brandl committed
89 90


Holger Brandl's avatar
Holger Brandl committed
91 92 93 94 95 96

### Create a cuffdb on a network of lustre file-systen
MakeCuffDB() {
    if [ $# -ne 2 ]; then echo "Usage: MakeCuffDB <gtffile> <genomebuild>"; return; fi

echo '
97
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.9/R/core_commons.R")
Holger Brandl's avatar
Holger Brandl committed
98 99 100 101 102 103 104 105

require.auto(cummeRbund)

createCuffDbTrickyDisk <- function(dbDir, gtfFile, genome, ...){
    tmpdir <- tempfile()
    system(paste("cp -r", dbDir, tmpdir))
    oldWD <- getwd()
    setwd(tmpdir)
106
    cuff <- readCufflinks(rebuild=T, gtf=gtfFile, genome=genome, ...)
Holger Brandl's avatar
Holger Brandl committed
107 108 109 110 111 112 113 114

    system(paste("cp cuffData.db", dbDir))
    system(paste("rm -r", tmpdir))

    setwd(oldWD)
    return(cuff)
}

115
#gtfFile="mm10_ensGene_pc.gtf"; genomeBuild="mm10"
Holger Brandl's avatar
Holger Brandl committed
116 117 118 119 120 121 122
gtfFile=commandArgs(TRUE)[1]
genomeBuild=commandArgs(TRUE)[2]

createCuffDbTrickyDisk(getwd(), gtfFile, genomeBuild)
' | R -q --no-save --no-restore  --args $1 $2
}
export -f MakeCuffDB
Holger Brandl's avatar
Holger Brandl committed
123

Holger Brandl's avatar
Holger Brandl committed
124
CountSeqs(){
Holger Brandl's avatar
Holger Brandl committed
125 126 127
    for fasta in $*; do
        grep ">" $fasta | wc -l | sed 's/^/'$fasta':\t/g'
    done
Holger Brandl's avatar
Holger Brandl committed
128 129
}
export -f CountSeqs
Holger Brandl's avatar
Holger Brandl committed
130

Holger Brandl's avatar
Holger Brandl committed
131 132 133 134 135 136 137

CountFastqGzReads() {
    zcat $1 | grep "^+$" | wc -l  | sed -e s/^/num_reads,/;
};
export -f CountFastqGzReads


Holger Brandl's avatar
Holger Brandl committed
138
IndexBams(){
139 140
    for bamFile in $@; do
        sem -j5 samtools index $bamFile;
Holger Brandl's avatar
Holger Brandl committed
141 142 143
    done
    sem -no-notice --wait
}
Holger Brandl's avatar
Holger Brandl committed
144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185
export -f IndexBams


##http://www.biostars.org/p/16471/
## estimate blast progress for fasta-query files. Result files are assumed to have the fasta id in column 1
BlastProgress(){
    if [ $# -eq 0 ]; then echo "Usage: BlastProgress <blast_query_fasta>+"; return; fi


   for query in $* ; do
        if [ ! -f $query ]; then
            >&2 echo "Error: $query is not a file"
            return;
        fi

        blast=$query.blast.out

        if [ ! -f $blast ]; then
            >&2 echo "Warning: Could not find blast output file '$blast'"
#            return;
        fi

    #    echo "the blast out is: "$blast
        #echo "the fasta query is: "$query

        #curquery=$(tail -1 $blast | cut -f 1)
        # http://tldp.org/LDP/abs/html/fto.html
        if [ -s $blast ]; then
    #        echo "file exists and has non-zero size"
            curquery=$(tail -1 $blast | cut -f 1)
            curline=$(grep -n $curquery"$" $query |  cut -f 1 -d ':')
        else
    #        echo "file does not yet exist or is empty"
            curline=0
        fi

        nblines=$(wc -l $query | cut -f 1 -d " ")
        percent=$(echo "($curline/$nblines) *100" | bc -l | cut -c 1-4)
        echo "Approximately $percent % of $query were processed."
    done
}
export -f BlastProgress
Holger Brandl's avatar
Holger Brandl committed
186 187


188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224
blast_progress(){
kscript - $* <<"EOF"
//DEPS de.mpicbg.scicomp:kutils:0.3
//KOTLIN_OPTS -J-Xmx5g

//kotlinc -J-Xmx5g -cp  $(expandcp.kts de.mpicbg.scicomp:kutils:0.3)

import de.mpicbg.scicomp.bioinfo.openFasta
import java.io.File
import kotlin.system.exitProcess

if(args.size == 0 ){
    System.err.println("Usage: blast_progres <fasta> <blastresults>")
    exitProcess(-1)
}

//args
val fastaFile= File(args[0])
val blastResults= File(args[1])

//val blastResults=File("dd_Smes_v1__vs__gencodeV25P.csv"); val fastaFile = File("../dd_Smes_v1.fasta")

val fastaIds = openFasta(fastaFile).map { it.id }

val procIds = blastResults.useLines { it.map{ it.split("\t")[0]}.distinct().toList()}

//todo part of kutils0.4
fun Double.format(digits: Int) = java.lang.String.format("%.${digits}f", this)


val pcDone = procIds.size.toDouble()/fastaIds.size
println("Approximately ${pcDone.format(2)} % of ${fastaIds.size} sequences were processed by blast.")
EOF
}



Holger Brandl's avatar
Holger Brandl committed
225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244
## just retains sequences whose id is in id-file (format: 1id per line)
FilterFastaByIDFile(){
python -c '
from Bio import SeqIO
import sys

#http://stackoverflow.com/questions/3925614/how-do-you-read-a-file-into-a-list-in-python
with open(sys.argv[1]) as f:
    some_list = f.read().splitlines()

for record in SeqIO.parse(sys.stdin, "fasta"):
#    recordID=record.description.split(" ")[1]
    recordID=record.id
#    print "processing" + recordID

    # http://stackoverflow.com/questions/3437059/does-python-have-a-string-contains-method
    if recordID in some_list: print record.format("fasta")
' $1;
}
export -f FilterFastaByIDFile