From 6f23dc09668af37ba81324f1a86252962f9d7c4e Mon Sep 17 00:00:00 2001 From: Holger Brandl <brandl@mpi-cbg.de> Date: Wed, 28 Feb 2018 10:50:07 +0100 Subject: [PATCH] added workflow and region model --- .gitignore | 53 ++++ README.md | 20 +- ortho_model_hsap_ppan_ptro.fixed.txt | 35 +++ quantify_offtargets.sh | 412 +++++++++++++++++++++++++++ 4 files changed, 514 insertions(+), 6 deletions(-) create mode 100644 .gitignore create mode 100644 ortho_model_hsap_ppan_ptro.fixed.txt create mode 100644 quantify_offtargets.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a696e38 --- /dev/null +++ b/.gitignore @@ -0,0 +1,53 @@ +# Created by .ignore support plugin (hsz.mobi) +### JetBrains template +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff: +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/dictionaries + +# Sensitive or high-churn files: +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.xml +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml + +# Gradle: +.idea/**/gradle.xml +.idea/**/libraries + +# CMake +cmake-build-debug/ +cmake-build-release/ + +# Mongo Explorer plugin: +.idea/**/mongoSettings.xml + +## File-based project format: +*.iws + +## Plugin-specific files: + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + diff --git a/README.md b/README.md index dece18f..0099e81 100644 --- a/README.md +++ b/README.md @@ -6,22 +6,30 @@ This repo contains scripts used to validate the genomic qPCR data from the paper Marta Florio, Michael Heide, Anneline Pinson, Holger Brandl, Mareike Albert, Sylke Winkler, Pauline Wimberger, Wieland B. Huttner and Michael Hiller<br> ## Materials & Methods summary taken from the publication: -Paired-End data were trimmed using cutadapt (v1.15; -m 20 -q 25 -a file:${Ill_ADAPTERS} -A file:${Ill_ADAPTERS}) and mapped with STAR (v2.5.2b; ---alignSJoverhangMin 100 ---outFilterType BySJout ---sjdbGTFfile ${gtfFile}). bedtools intersect (v2.25.0) was used to determine the number of overlapping alignments at each locus of interest, and samtools flagstat was used to determine the library size. Final data integration and visualization was implemented using R. + +Paired-End data were trimmed using cutadapt (v1.15; `-m 20 -q 25 -a file:${Ill_ADAPTERS} -A file:${Ill_ADAPTERS}`) and mapped with STAR (v2.5.2b; `---alignSJoverhangMin 100 ---outFilterType BySJout ---sjdbGTFfile ${gtfFile}`). bedtools intersect (v2.25.0) was used to determine the number of overlapping alignments at each locus of interest, and samtools flagstat was used to determine the library size. Final data integration and visualization was implemented using R. We share these computational protocols in the spirit of open data and reproducible research. So feel welcome to provide comments, report errors, or suggest improvements. -## Contained Workflow +## Workflow + +[`quantify_offtargets.sh`](./quantify_offtargets.sh) contains all performed steps -**This is currently a place holder and the workflow will be added within the next few days** +1. Data DL, QC, and Trimming +2. Alignment & Locus count intersection +3. Off-target ratio calcuation and reporting + +The used region model is also indluded under [ortho_model_hsap_ppan_ptro.fixed.txt](./ortho_model_hsap_ppan_ptro.fixed.txt) ## Usage & Disclaimer -The scripts are provided as is, without the intention that an interested reader will able to run them directly. We share our protocol here, not a ready-to-use tool. Still, we think they should provide enough technical detail to allow replicating our analysis. +The scripts are provided as is under [MIT License](https://opensource.org/licenses/MIT), without the intention that an interested reader will able to run them directly. We share our protocol here, not a ready-to-use tool. Still, we think they should provide enough technical detail to allow replicating our analysis. Not detailed out in this repo are the steps to prepare a linux environment to include the listed tools and dependencies. Required tools include recent versions of R, samtools, STAR, bedtools, as well as: -* https://git.mpi-cbg.de/bioinfo/ngs_tools - +* https://git.mpi-cbg.de/bioinfo/ngs_tools which is a collection of deep-sequencies utilites. The version used to build the data was `6747f0add32ba9bc41a3cd04de72dad69afdbb6d` +* https://github.com/holgerbrandl/joblist which an HPC task manager +* [rend.R](https://github.com/holgerbrandl/datautils/tree/master/tools/rendr) which is a wrapper around `knitr` ## Support diff --git a/ortho_model_hsap_ppan_ptro.fixed.txt b/ortho_model_hsap_ppan_ptro.fixed.txt new file mode 100644 index 0000000..c1b29a6 --- /dev/null +++ b/ortho_model_hsap_ppan_ptro.fixed.txt @@ -0,0 +1,35 @@ +external_gene_name ensembl_gene_id chromosome_name start_position end_position species +ARHGAP11A ENSG00000198826 15 32615144 32639949 hsap +ARHGAP11B ENSG00000187951 15 30624494 30772993 hsap +FAM72A ENSG00000196550 1 206186179 206204414 hsap +FAM72B ENSG00000188610 1 121167646 121185539 hsap +FAM72C ENSG00000263513 1 143955364 143971965 hsap +FAM72D ENSG00000215784 1 145096000 145112696 hsap +GTF2H2 ENSG00000145736 5 71032670 71067689 hsap +GTF2H2B ENSG00000226259 5 70415352 70448015 hsap +GTF2H2C ENSG00000183474 5 69560208 69594723 hsap +SMN1 ENSG00000172062 5 70925030 70953942 hsap +SMN2 ENSG00000205571 5 70049612 70078522 hsap +STX12 ENSG00000117758 1 27773183 27824452 hsap +ARHGAP11A ENSPTRG00000006867 15 29379131 29403889 ptro +ARHGAP11B NA NA NA NA ptro +FAM72A ENSPTRG00000023098 1 185286022 185302699 ptro +FAM72B NA NA NA NA ptro +FAM72C NA NA NA NA ptro +FAM72D NA NA NA NA ptro +GTF2H2 ENSPTRG00000016953 5 45635974 45669662 ptro +GTF2H2C NA NA NA NA ptro +SMN1 ENSPTRG00000016955 5 45526056 45552822 ptro +SMN2 NA NA NA NA ptro +STX12 ENSPTRG00000000416 1 27781808 27833481 ptro +ARHGAP11A NA 15 29947016 29972748 ppan +ARHGAP11B NA NA NA NA ppan +FAM72A NA 1 186075381 186091543 ppan +FAM72B NA NA NA NA ppan +FAM72C NA NA NA NA ppan +FAM72D NA NA NA NA ppan +GTF2H2 NA 5 45985077 46028618 ppan +GTF2H2C NA NA NA NA ppan +SMN1 NA 5 45876144 45903268 ppan +SMN2 NA NA NA NA ppan +STX12 ENSPPAG00000041257 1 28076625 28130032 ppan \ No newline at end of file diff --git a/quantify_offtargets.sh b/quantify_offtargets.sh new file mode 100644 index 0000000..6d124e3 --- /dev/null +++ b/quantify_offtargets.sh @@ -0,0 +1,412 @@ +export PRJ_NAME="winkler_offtargets" +# screen -R ${PRJ_NAME} + + +export PRJ_DATA=/Volumes/furiosa/bioinfo/data/winkler_offtargets +export BIO_BIN_BASE=~/bin +export NGS_TOOLS="/Volumes/furiosa/bioinfo/brandl/scripts/ngs_tools" +export PRJ_SCRIPTS="/Volumes/furiosa/bioinfo/brandl/scripts/ngs_tools" + +ls "${PRJ_DATA}" "${NGS_TOOLS}" "${PRJ_SCRIPTS}" >/dev/null || { echo "not all project resources are well defined" 1>&2; exit 1; } + + +source ${NGS_TOOLS}/dge_workflow/dge_utils.sh +export PATH=${NGS_TOOLS}/dge_workflow:$PATH + +export JL_FORCE_LOCAL=true + + +######################################################################################################################## +### Fetch the data + +mcdir ${PRJ_DATA}/originals + +## read in credentials for data download +source /net/fileserver-nfs/stornext/snfs4/projects/bioinformatics/tools/bioinfo_resources/bioinfo_secrets.sh +wget -nc --user="${winkler_offtargets_deepseq_user}" --password="${winkler_offtargets_deepseq_pw}" -r --no-directories --no-check-certificate -A "*fastq.gz" https://projects.biotec.tu-dresden.de/ngs-filesharing/sylkew + + +mailme "${PRJ_NAME}: fastq download done" + +### Basic QC +dge_fastqc $(ls *fastq.gz) & + + +######################################################################################################################## +### + +mcdir ${PRJ_DATA}/ortho_loci + +rendr_snippet "build_ortho_loci" <<"EOF" +devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.46/R/core_commons.R") +devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.46/R/bio/bioinfo_commons.R") + + +# sort /Users/brandl/Library/Preferences/IntelliJIdea2017.3/scratches/scratch_68.txt | uniq +orthos=c( +"Arhgap11A", +"Arhgap11B", +"FAM72A", "FAM72B", "FAM72C", "FAM72D", +"GTF2H2" , "GTF2H2C", +"smn1", "smn2", +"stx12" +) %>% toupper() + + +#' ## Hsap +hsapGeneInfo = quote({ + mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl", host = "aug2017.archive.ensembl.org", path = "/biomart/martservice", archive = FALSE) + c("ensembl_gene_id", "external_gene_name", "chromosome_name", "start_position", "end_position") %>% + biomaRt::getBM(mart = mart) %>% tbl_df +}) %>% cache_it("hsapGeneInfo") + + +hsapGeneInfo %>% ggplot(aes(chromosome_name)) + geom_bar() + coord_flip() + +## filter away odd non-primary contigs +hsapGeneInfo %<>% filter(!str_detect(chromosome_name, "^(CHR_|GL|KI)")) +hsapGeneInfo %>% count(chromosome_name) %>% print_all + +hsapModel = left_join(data_frame(external_gene_name=orthos), hsapGeneInfo) %T>% { print(knitr::kable(.))} +hsapModel %>% transmute(chromosome_name, start_position, end_position, ensembl_gene_id, score=".", strand=".", external_gene_name) %>% + write_bed(file="hsapiens_orthogenes.ens_aug2017.bed") + + +#' ## Chimpanzee + +ptroGeneInfo = quote({ + mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = "ptroglodytes_gene_ensembl", host = "aug2017.archive.ensembl.org", path = "/biomart/martservice", archive = FALSE) + c("ensembl_gene_id", "external_gene_name", "chromosome_name", "start_position", "end_position") %>% + biomaRt::getBM(mart = mart) %>% tbl_df +}) %>% cache_it("ptroGeneInfo") + + +ptroGeneInfo %>% ggplot(aes(chromosome_name)) + geom_bar() + coord_flip() +ptroGeneInfo %>% count(chromosome_name) %>% print_all + +## filter away odd non-primary contigs +ptroGeneInfo %<>% filter(!str_detect(chromosome_name, "^(CHR_|GL|AACZ)")) +ptroGeneInfo %>% count(chromosome_name) %>% print_all + + +ptroModel = left_join(data_frame(external_gene_name=orthos), ptroGeneInfo) %T>% { print(knitr::kable(.))} +# ptroModel %>% transmute(chromosome_name, start_position, end_position, ensembl_gene_id, score=".", strand=".", external_gene_name) %>% +# filter(!is.na(start_position)) %>% +# write_bed(file="ptroglodytes_orthogenes.ens_aug2017.bed") + + + +#' ## Bonobo + +ppanGeneInfo = quote({ + mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = "ppaniscus_gene_ensembl", host = "dec2017.archive.ensembl.org", path = "/biomart/martservice", archive = FALSE) + # mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = "ppaniscus_gene_ensembl", host = "ensembl.org", path = "/biomart/martservice", archive = FALSE) + # biomaRt::listDatasets(biomaRt::useMart("ensembl")) %>% as_df %>% tbl_df %>% search_df("pan") + # mart = biomaRt::useDataset("ppaniscus_gene_ensembl", mart = biomaRt::useMart("ensembl")) + + c("ensembl_gene_id", "external_gene_name", "chromosome_name", "start_position", "end_position") %>% + biomaRt::getBM(mart = mart) %>% tbl_df +}) %>% cache_it("ppanGeneInfo") + + +ppanGeneInfo %>% ggplot(aes(chromosome_name)) + geom_bar() + coord_flip() + +## filter away odd non-primary contigs +ppanGeneInfo %<>% filter(!str_detect(chromosome_name, "^(KQ|AJ)")) +ppanGeneInfo %>% count(chromosome_name) %>% print_all + + +ppanModel = left_join(data_frame(external_gene_name=orthos), ppanGeneInfo) %T>% { print(knitr::kable(.))} +# ppanModel %>% transmute(chromosome_name, start_position, end_position, ensembl_gene_id, score=".", strand=".", external_gene_name) %>% +# filter(!is.na(start_position)) %>% +# write_bed(file="ppaniscus_orthogenes.ens_dec2017.bed") + +bind_rows( + hsapModel %<>% mutate(species="hsap"), + ptroModel %<>% mutate(species="ptro"), + ppanModel %<>% mutate(species="ppan"), +) %>% write_tsv("ortho_model_hsap_ppan_ptro.txt") + + +session::save.session(".build_ortho_loci.dat"); +# session::restore.session(".build_ortho_loci.dat"); +EOF + +## manually fix the sheet and build bed references for each species + +Rscript - <<"EOF" +devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.46/R/core_commons.R") +devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.46/R/bio/bioinfo_commons.R") + +orthoModel = read_tsv("ortho_model_hsap_ppan_ptro.fixed.txt") + +orthoModel %>% + filter(species=="hsap") %>% + transmute(chromosome_name, start_position, end_position, ensembl_gene_id, score=".", strand=".", external_gene_name) %>% + filter(!is.na(start_position)) %>% + write_bed(file="hsapiens_orthogenes.ens_aug2017.bed") + +orthoModel %>% + filter(species=="ptro") %>% + transmute(chromosome_name, start_position, end_position, ensembl_gene_id, score=".", strand=".", external_gene_name) %>% + filter(!is.na(start_position)) %>% + write_bed(file="ptroglodytes_orthogenes.ens_aug2017.bed") + +orthoModel %>% + filter(species=="ppan") %>% + transmute(chromosome_name, start_position, end_position, ensembl_gene_id, score=".", strand=".", external_gene_name) %>% + filter(!is.na(start_position)) %>% + write_bed(file="ppaniscus_orthogenes.ens_dec2017.bed") +EOF + +wc -l *bed + +######################################################################################################################## +### read trimming + + +mcdir ${PRJ_DATA}/trimmed_reads + +#ls ${PRJ_DATA}/originals/*_R1.fastq.gz +#zcat ${PRJ_DATA}/originals/L25039_Track-56814_R1.fastq.gz | head -n 40000 | tail -n 400 > r1_sample.fastq +#zcat ${PRJ_DATA}/originals/L25039_Track-56814_R2.fastq.gz | head -n 40000 | tail -n 400 > r2_sample.fastq + + +#export Ill_ADAPTERS=/projects/bioinfo/common/cutadapt_patterns.fasta +export Ill_ADAPTERS=${NGS_TOOLS}/dge_workflow/common_adapters.fasta +#ll ${Ill_ADAPTERS} + +# http://cutadapt.readthedocs.io/en/stable/guide.html#trimming-paired-end-reads +# http://tucf-genomics.tufts.edu/documents/protocols/TUCF_Understanding_Illumina_TruSeq_Adapters.pdf + +for r1Fastq in $(ls ${PRJ_DATA}/originals/*_R1.fastq.gz) ; do + # DEBUG r1Fastq=${PRJ_DATA}/originals/L25039_Track-56814_R1.fastq.gz + r2Fastq=$(echo $r1Fastq | sed 's/_R1.fastq/_R2.fastq/g') + +# echo "trimming $r1Fastq" + + #todo use a more specific trimming model (trim just correct part on each side without using reverse complements + jl submit -t 5 -j .cajobs -n "${PRJ_NAME}__ca__$(basename $r1Fastq .fastq.gz)" "~/.local/bin/cutadapt --cores=10 -m 20 -q 25 -a file:${Ill_ADAPTERS} -A file:${Ill_ADAPTERS} -o $(basename $r1Fastq .fastq.gz)_ca.fastq.gz -p $(basename $r2Fastq .fastq.gz)_ca.fastq.gz $r1Fastq $r2Fastq" +# echo "cutadapt -b file:$Ill_ADAPTERS -m 20 -q 25 -o $caFastq $fastqFile" + +# ~/.local/bin/cutadapt --cores=10 -m 20 -q 25 -a file:${Ill_ADAPTERS} -A file:${Ill_ADAPTERS} -o $(basename $r1Fastq .fastq.gz)_ca.fastq.gz -p $(basename $r2Fastq .fastq.gz)_ca.fastq.gz $r1Fastq $r2Fastq +done + + +jl wait --report --email .cajobs + +dge_fastqc $(ls *fastq.gz) & + +## todo fix this +#rend.r --toc ${NGS_TOOLS}/dge_workflow/cutadapt_summary.R . + +rename _ca "" *fastq.gz + +######################################################################################################################## +### Data Renaming + +mcdir ${PRJ_DATA}/trimmed_renamed + +echo ' +devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.46/R/core_commons.R") + +sheetFile <- "../originals/sylkew-FC_M01995_0117-2018-1-30.xls" + +#sampleInfo = read_excel(sheetFile) %>% select(SampleName, SampleID) %>% dput +sampleInfoFixed = data_frame( + SampleName = c("s6-SY", "s2-sy", "s5-SY", "s3-SY", "s4-SY", "s1-SY"), + SampleID = c("S26470", "S26466", "S26469", "S26467", "S26468", "S26465"), + SampleSpecies=c("ptro", "ppan", "ppan", "ptro", "hsap", "hsap") +) %>% arrange(SampleName) + +sampleInfoFixed %>% knitr::kable() + +sampleSheet <- read_excel(sheetFile, "Fastqfiles") %>% + select(File, SampleName) %>% + left_join(sampleInfoFixed) %>% + mutate(rWhat=str_extract(File, "_R[12]") ) %>% + mutate(new_sample_name=paste0(tolower(SampleName) %>% str_split_fixed("-", 2) %>% get_col(1), "_", SampleSpecies, rWhat)) + + +write_tsv(sampleSheet, path="renaming_scheme.txt") + + +sampleSheet %>% mutate( + zcat=paste("cp", paste0("../trimmed_renamed/", File), paste0(new_sample_name, ".fastq.gz")) + ) %$% + zcat %>% + write_lines("lane_merge.cmd") +' | R --vanilla -q + +cat lane_merge.cmd | while read line; do + eval ${line} +# jl submit -j .repmerge "$line" +done + +jl wait --email --report + +# +######################################################################################################################### +##### flash the reads +#mcdir ${PRJ_DATA}/flashed +# +#export PATH=/home/brandl/bin/FLASH-1.2.11:${PATH} +#which flash +# +#for file in $(ls ${PRJ_DATA}/trimmed_reads_2/*_R1.fastq.gz); do +# prefix="${file%%_R1*}" +# short_prefix="${prefix##*/}" +# #echo $short_prefix +# flash ${prefix}_R1.fastq.gz ${prefix}_R2.fastq.gz -M 100 --output-prefix=${short_prefix} 2>&1 | tee ${short_prefix}_flash.log +## mv ${short_prefix}* ../3_flash_joined/ +#done +# +# + +######################################################################################################################## +### Alignment Human + +mcdir ${PRJ_DATA}/alignments/homo_sapiens + + +#export IGENOME=/projects/bioinfo/igenomes/Homo_sapiens/Ensembl_v88_custom/GRCh38 +export IGENOME=/projects/bioinfo/igenomes/Homo_sapiens/Ensembl_v88_custom/GRCh38 +ls "${IGENOME}" >/dev/null || { echo "hsap ignome is missing" 1>&2; exit 1; } + +# https://github.com/alexdobin/STAR/issues/346 +#export STAR_PARAMS=" --outFilterMatchNminOverLread 0.3 --outFilterScoreMinOverLread 0.3" + +export STAR_PARAMS="--alignSJoverhangMin 100 --outFilterType BySJout" +star_align.kts ${IGENOME} $(ls ${PRJ_DATA}/trimmed_renamed/*hsap_R1.fastq.gz) 2>&1 | tee star_algin.log + +/Volumes/furiosa/bioinfo/data/winkler_offtargets/ +#ll ${IGENOME}/Sequence/WholeGenomeFasta/genome.fa.fai +dge_bigwig ${IGENOME}/Sequence/WholeGenomeFasta/genome.fa.fai $(ls *bam) & + + +make_igv_session hg38 ${PRJ_DATA}/ortho_loci/hsapiens_orthogenes.ens_aug2017.bed $(ls *bw) $(ls *bam) $(ls ${PRJ_DATA}/offtarget_ratios/peak_regions/*hsap.bed) $(ls ${PRJ_DATA}/ian_data/alignments/*bam)> ortho_pools.igv.xml + +igv ortho_pools.igv.xml + +######################################################################################################################## +### Alignment Pan troglodytes (Chimpanzee) + +mcdir ${PRJ_DATA}/alignments/pan_troglodytes + +export IGENOME=/projects/bioinfo/igenomes/Pan_troglodytes/Ensembl_81/CHIMP2.1.4 +ls "${IGENOME}" >/dev/null || { echo "chimp ignome is missing" 1>&2; exit 1; } + +export STAR_PARAMS="--alignSJoverhangMin 100 --outFilterType BySJout" +star_align.kts ${IGENOME} $(ls ${PRJ_DATA}/trimmed_renamed/*ptro_R1.fastq.gz) 2>&1 | tee star_algin.log + +dge_bigwig ${IGENOME}/Sequence/WholeGenomeFasta/genome.fa.fai $(ls *bam) & + + +######################################################################################################################## +### Alignment Pan paniscus (Bonobo) + +mcdir ${PRJ_DATA}/alignments/pan_paniscus + +export IGENOME=/projects/bioinfo/igenomes/Pan_paniscus/Ensembl_v91_custom/panpan1.1 +ls "${IGENOME}" >/dev/null || { echo "bonobo ignome is missing" 1>&2; exit 1; } + + +export STAR_PARAMS="--alignSJoverhangMin 100 --outFilterType BySJout" +star_align.kts ${IGENOME} $(ls ${PRJ_DATA}/trimmed_renamed/*ppan_R1.fastq.gz) 2>&1 | tee star_algin.log + +dge_bigwig ${IGENOME}/Sequence/WholeGenomeFasta/genome.fa.fai $(ls *bam) & + +######################################################################################################################## +### offtarget ratios + +mcdir ${PRJ_DATA}/offtarget_ratios + +bamFiles="$(ls ${PRJ_DATA}/alignments/homo_sapiens/*bam) $(ls ${PRJ_DATA}/alignments/pan_troglodytes/*bam) $(ls ${PRJ_DATA}/alignments/pan_paniscus/*bam)" + +echo $bamFiles + + + +mkdir peak_regions +for bamFile in $bamFiles; do + # DEBUG bamFile=/projects/bioinfo/data/winkler_offtargets/alignments/homo_sapiens/L25039_Track-56814.bam + echo processing $bamFile + ## build list of coverage islands + # https://www.biostars.org/p/75207/ + #bedtools genomecov -ibam L25044_Track-56819.bam -bg | awk '$4>3' | mergeBed + +# bedtools merge -c 1 -o count -i | awk '$4>3' + bedtools merge -c 1 -o count -i $bamFile | awk '$4>5' > peak_regions/$(basename $bamFile .bam).bed & +done +wait + +wc -l peak_regions/*bed + + +## libary size +# from ${NGS_TOOLS}/chipseq_workflow/chipseq_utils.sh + +rm -f algn_counts.txt +for bamFile in $bamFiles; do +( + echo "processing $bamFile" + samtools flagstat $bamFile | head -n1 | cut -f1 -d' ' | sed -e 's/$/\t'$(basename $bamFile .bam)'/' >> algn_counts.txt +) & +done +wait + + +mkdir ortho_ovlp_counts + +#intersectBed -c -a peak_clusters.bed -b H3HA*narrowPeak > H3HA_merge_ovlp_counts.txt + +#hsap +for bamFile in $(ls ${PRJ_DATA}/alignments/homo_sapiens/*bam); do + intersectBed -split -c -a ${PRJ_DATA}/ortho_loci/hsapiens_orthogenes.ens_aug2017.bed -b ${bamFile} > ortho_ovlp_counts/$(basename ${bamFile} .bam).ovlp_counts.bed +done + + +#ptro +for bamFile in $(ls ${PRJ_DATA}/alignments/pan_troglodytes/*bam); do + intersectBed -split -c -a ${PRJ_DATA}/ortho_loci/ptroglodytes_orthogenes.ens_aug2017.bed -b ${bamFile} > ortho_ovlp_counts/$(basename ${bamFile} .bam).ovlp_counts.bed +done + + +#ppan +for bamFile in $(ls ${PRJ_DATA}/alignments/pan_paniscus/*bam); do + intersectBed -split -c -a ${PRJ_DATA}/ortho_loci/ppaniscus_orthogenes.ens_dec2017.bed -b ${bamFile} > ortho_ovlp_counts/$(basename ${bamFile} .bam).ovlp_counts.bed +done + + +rendr_snippet "ortho_offtarget_ratios" <<"EOF" +devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.46/R/core_commons.R") + +libSizes = read.delim("algn_counts.txt", header=F) %>% set_names(c("library_size", "sample_name")) + +#+ include=FALSE +ovlpCounts = list.files("ortho_ovlp_counts", ".*.ovlp_counts.bed", full.names=T, recursive=T) %>% map_df(function(countsFile){ + # DEBUG countsFile = "ortho_ovlp_counts/s1_hsap.ovlp_counts.bed" + read_tsv(countsFile, col_names=FALSE) %>% select(-X5, -X6) %>% + set_names("chromosome_name", "gene_start", "gene_end", "ensembl_gene_id", "external_gene_name", "num_alignments") %>% + mutate(sample_name=basename(countsFile) %>% trim_ext(".ovlp_counts.bed")) +}) %>% separate(sample_name, c('sample_id', 'species'), sep="_", remove=F) + +#+ +ovlpCounts %>% ggplot(aes(external_gene_name, num_alignments)) + facet_wrap(~species) + geom_bar(stat="identity") + coord_flip() + +ovlpCounts %<>% left_join(libSizes) + +write_tsv(ovlpCounts, "ovlpCounts.txt") + +offtargetSummary = ovlpCounts %>% + group_by(species, sample_name) %>% + summarize(target_total=sum(num_alignments)) %>% + left_join(libSizes) %>% + mutate(target_signal_prop=target_total/library_size) + +write_tsv(offtargetSummary, "offtargetSummary.txt") + +ggplot(offtargetSummary, aes(sample_name,target_signal_prop)) + geom_bar(stat="identity") + facet_wrap(~species, scales="free_x") + ggtitle("pcr primer specificity") + +EOF + -- GitLab