From a3b972c6e48c6cc0be21303562eb94c48a73ddae Mon Sep 17 00:00:00 2001 From: Incardona Pietro <incardon@mpi-cbg.de> Date: Tue, 23 Jan 2024 18:42:09 +0100 Subject: [PATCH] Moving on on reverse calling --- example/Sequencing/CNVcaller/Makefile | 20 +- example/Sequencing/CNVcaller/functions.hpp | 3860 ++++++++++++++++++++ example/Sequencing/CNVcaller/main.cu | 45 +- example/Sequencing/CNVcaller/testing.cpp | 46 + 4 files changed, 3937 insertions(+), 34 deletions(-) create mode 100644 example/Sequencing/CNVcaller/functions.hpp create mode 100644 example/Sequencing/CNVcaller/testing.cpp diff --git a/example/Sequencing/CNVcaller/Makefile b/example/Sequencing/CNVcaller/Makefile index 821467801..69d8526cd 100644 --- a/example/Sequencing/CNVcaller/Makefile +++ b/example/Sequencing/CNVcaller/Makefile @@ -25,28 +25,24 @@ else endif -OBJ = main.o -OBJ_CN2 = main_CN2.o -OBJ_DEPTH = main_depth.o +OBJ = main.o sm_classes/SequentialImplementation.o sm_classes/Framework.o +OBJ_TESTING = testing.o sm_classes/SequentialImplementation.o sm_classes/Framework.o all: %.o: %.cu - $(CUDA_CC) -O0 -Wno-deprecated-enum-enum-conversion -Wno-interference-size -g $(CUDA_OPTIONS) -DSE_CLASS1 -c --std=c++20 -I/home/i-bird/openfpm_dependencies_new/EIGEN/ -Iseqan3/include/ -Iseqan3/submodules/sdsl-lite/include -Iseqan3/submodules/cereal/include -DSEQAN3_HAS_ZLIB=1 -DSEQAN3_HAS_BZIP2=1 -o $@ $< $(INCLUDE_PATH_NVCC) + $(CUDA_CC) -O0 -Wno-deprecated-enum-enum-conversion -Wno-volatile -Wno-interference-size -g $(CUDA_OPTIONS) -DSE_CLASS1 -c --std=c++20 -I/home/i-bird/openfpm_dependencies_new/EIGEN/ -Iseqan3/include/ -Iseqan3/submodules/sdsl-lite/include -Iseqan3/submodules/cereal/include -DSEQAN3_HAS_ZLIB=1 -DSEQAN3_HAS_BZIP2=1 -o $@ $< $(INCLUDE_PATH_NVCC) %.o: %.cpp - $(CC) -O0 $(OPT) -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CUDA_CC) -O0 -Wno-deprecated-enum-enum-conversion -Wno-volatile -Wno-interference-size -g $(CUDA_OPTIONS) -DSE_CLASS1 -c --std=c++20 -I/home/i-bird/openfpm_dependencies_new/EIGEN/ -Iseqan3/include/ -Iseqan3/submodules/sdsl-lite/include -Iseqan3/submodules/cereal/include -DSEQAN3_HAS_ZLIB=1 -DSEQAN3_HAS_BZIP2=1 -o $@ $< $(INCLUDE_PATH_NVCC) CNV_caller: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT) -lz -lbz2 -pthread -CNV_caller_CN2: $(OBJ_CN2) - $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT) -lz -lbz2 -pthread - -CNV_caller_depth: $(OBJ_DEPTH) - $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT) -lz -lbz2 -pthread +testing: $(OBJ_TESTING) + $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT) -lz -lbz2 -pthread -lboost_unit_test_framework -all: CNV_caller CNV_caller_CN2 CNV_caller_depth +all: CNV_caller testing run: CNV_caller mpirun --oversubscribe -np 2 ./CNV_caller @@ -54,5 +50,5 @@ run: CNV_caller .PHONY: clean all run clean: - rm -f *.o *~ core CNV_caller + rm -f *.o *~ sm_classes/*.o core CNV_caller diff --git a/example/Sequencing/CNVcaller/functions.hpp b/example/Sequencing/CNVcaller/functions.hpp new file mode 100644 index 000000000..59210a3c3 --- /dev/null +++ b/example/Sequencing/CNVcaller/functions.hpp @@ -0,0 +1,3860 @@ +#include <Eigen/Dense> +#include "Vector/map_vector.hpp" +#include "VCluster/VCluster.hpp" +#include <boost/math/distributions/poisson.hpp> +#include <gsl/gsl_math.h> +#include <gsl/gsl_vector.h> +#include <gsl/gsl_multimin.h> +#include <boost/math/special_functions/factorials.hpp> +#include <boost/math/distributions/normal.hpp> +#include <htslib/sam.h> +#include <htslib/vcf.h> +#include <seqan3/io/sequence_file/all.hpp> +#include 
"hash_map/hopscotch_map.h" +#include "hash_map/hopscotch_set.h" +#include "sm_classes/SequentialImplementation.h" + +#define NO_PRINT + +constexpr int chromosome = 0; +constexpr int start_interval = 1; +constexpr int stop_interval = 2; +constexpr int Num_bases = 3; + +namespace CNVcaller +{ + +short int chr_to_chr_int(const char * chr) { + short int chr_int; + char chr_base[3]; + + if (chr[0] == 'c' && chr[1] == 'h' && chr[2] == 'r') { + chr_base[0] = chr[3]; + chr_base[1] = chr[4]; + chr_base[2] = 0; + } + + if (chr[0] == 'X') { + chr_int = 22; + } else if (chr[0] == 'Y') { + chr_int = 23; + } else { + chr_int = (short int)atoi(chr); + if (chr_int == 0) {return -1;} + chr_int -= 1; + } + + return chr_int; +} + +std::string chr_int_to_chr(int chr) { + + if (chr == 23) {return "X";} + if (chr == 24) {return "Y";} + if (chr == 25) {return "MT";} + + return std::to_string(chr); +} + +template<typename T> T CN_factor(const T & CN) { + if (CN > 1.9) { + if (CN <= 2.0) + {return 1.0 - 5.0*(CN-1.9);} + else if (CN > 2.0) { + if (CN < 2.1) { + return 0.5 + 5.0*(CN - 2.0); + } + } + } + + + return 1.0; +} + +template<typename T> T CN_offset(const T & CN) { + return (CN - 2.0f)*(CN - 2.0f); +} + +// template<bool print_terms,int dim, int N_int_opt, int N_sample_opt> +// auto model_function(int selected_interval, +// Eigen::VectorXd & vars, +// Box<dim,double> & domain, +// openfpm::vector<int> & chr_counts, +// openfpm::vector<int> & counts, +// int chromosome, +// float reg) { + +// double val = 0.0; + +// for (int j = 0 ; j < N_sample_opt ; j++) { +// for (int i = selected_interval,k=0 ; i < selected_interval + N_int_opt ; i++, k++ ) { +// if (vars(k) < 0.0 || vars(N_int_opt+j) < 0.0) { +// val += domain.getHigh(0)*domain.getHigh(N_int_opt)*chr_counts.get(chromosome*N_sample_opt+j); +// if (print_terms) {std::cout << "OUT OF BOUND " << vars(k) << std::endl;} +// } else { +// val += fabs(chr_counts.get(chromosome*N_sample_opt+j)*vars(k)*vars(N_int_opt+j) - counts.get(i*N_sample_opt + j))*CN_factor(vars(N_int_opt+j)) + 2.0f*CN_offset(vars(N_int_opt+j)); +// if (print_terms) {std::cout << "External: " << vars(k)*chr_counts.get(chromosome*N_sample_opt+j)*vars(N_int_opt+j) - counts.get(i*N_sample_opt + j) << "*" << CN_factor(vars(N_int_opt+j)) << "+" << 2.0f*CN_offset(vars(N_int_opt+j)) << std::endl;} +// } + + +// // near integer +// int near_int = std::round(vars(N_int_opt+j)); + +// // Penalization for CN not being an integer +// val += fabs(vars(N_int_opt+j) - near_int) * reg; + +// // Penalization for not being near the other regions for fishing +// //val += penalization_fishing(vars(k),fishing_left,fishing_right); +// } +// } + +// // Penalization for out of bound parameters +// for (int i = 0 ; i < N_sample_opt + N_int_opt ; i++) { +// if (vars(i) < domain.getLow(i)) { +// val += (domain.getLow(i) - vars(i))*20000.0; +// } +// if (vars(i) > domain.getHigh(i)) { +// val += (vars(i) - domain.getHigh(i))*20000.0; +// } +// } + +// // Penalization for majority of samples CN bigger than two +// auto majority = (float)N_sample_opt/2.0; +// for (int i = 0 ; i < N_sample_opt ; i++) { +// majority -= (vars(N_int_opt+i) - 2); +// } +// if (majority < 0) { +// majority = fabs(majority); +// } else { +// majority = 0.0; +// } +// val += majority*1000.0; + + +// //std::cout << "VALUE " << val << " " << vars(0) << " " << vars(1) << " " << vars(2) << vars(3) << " " << vars(4) << " " << vars(5) << " || " << vars(6) << " " << vars(7) << std::endl; + +// #ifdef DEBUG_PRINT +// std::cout << "VALUE " << val; +// 
for (int i = 0 ; i < N_sample_opt + N_int_opt; i++) { +// std::cout << vars(i) +// } +// std::cout << std::endl; +// #endif + +// return val; +// } + + +// Function to calculate the volume of the Poisson distribution in an interval [start, end] +double poisson_distribution(double lambda, int ele) { + if (lambda == 0) { + if (ele == 0) {return 1.0;} + else {return std::numeric_limits<double>::min();} + } + if (lambda < 0.0) {return 1e-13;} + + boost::math::poisson_distribution<double> pd((double)lambda); + + return boost::math::pdf(pd,ele); +} + +// Function to calculate the volume of the Poisson distribution in an interval [start, end] +double poisson_distribution_volume(int lambda, int start, int end) { + // We will assume that the range is reasonably small so that the calculation remains efficient. + double volume = 0.0; + for (int i = start; i <= end; ++i) { + // Calculating the probability of exactly i events + volume += poisson_distribution(lambda,i); + } + return volume; +} + +template<bool print_terms,int dim, int N_int_opt, int N_sample_opt> +auto model_function(int selected_interval, + Eigen::VectorXd & vars, + Box<dim,double> & domain, + openfpm::vector<int> & chr_counts, + openfpm::vector<int> & counts, + int chromosome, + float reg) { + + double val = 0.0; + + for (int j = 0 ; j < N_sample_opt ; j++) { + for (int i = selected_interval,k=0 ; i < selected_interval + N_int_opt ; i++, k++ ) { + if (vars(k) < 0.0 || vars(N_int_opt+j) < 0.0) { + val += 1000000.0; + if (print_terms) {std::cout << "OUT OF BOUND " << vars(k) << std::endl;} + } else { + if (chr_counts.get(chromosome*N_sample_opt+j)*vars(k)*vars(N_int_opt+j) > 1000000 || + chr_counts.get(chromosome*N_sample_opt+j)*vars(k)*vars(N_int_opt+j) < 0.0) + { + val += 1000.0; + if (print_terms) {std::cout << "External: " << 1000.0 << std::endl;} + } + else { + auto pv = poisson_distribution(chr_counts.get(chromosome*N_sample_opt+j)*vars(k)*vars(N_int_opt+j),counts.get(i*N_sample_opt + j)); + if (pv == 0) { + val += 1000.0; + if (print_terms) {std::cout << "External: " << 1000.0 << std::endl;} + } else { + val += -std::log(pv)*CN_factor(vars(N_int_opt+j)) + 2.0f*CN_offset(vars(N_int_opt+j)); + if (print_terms) {std::cout << "External: " << -std::log(poisson_distribution(chr_counts.get(chromosome*N_sample_opt+j)*vars(k)*vars(N_int_opt+j),counts.get(i*N_sample_opt + j))) << "*" << CN_factor(vars(N_int_opt+j)) << "+" << 2.0f*CN_offset(vars(N_int_opt+j)) << std::endl;} + } + } + } + + + // near integer + int near_int = std::round(vars(N_int_opt+j)); + + // Penalization for CN not being an integer + val += fabs(vars(N_int_opt+j) - near_int) * reg; + + // Penalization for not being near the other regions for fishing + //val += penalization_fishing(vars(k),fishing_left,fishing_right); + } + } + + // Penalization for out of bound parameters + for (int i = 0 ; i < N_sample_opt + N_int_opt ; i++) { + if (vars(i) < domain.getLow(i)) { + val += (domain.getLow(i) - vars(i))*20000.0; + } + if (vars(i) > domain.getHigh(i)) { + val += (vars(i) - domain.getHigh(i))*20000.0; + } + } + + // Penalization for majority of samples CN bigger than two + auto majority = (float)N_sample_opt/2.0; + for (int i = 0 ; i < N_sample_opt ; i++) { + majority -= (vars(N_int_opt+i) - 2); + } + if (majority < 0) { + majority = fabs(majority); + } else { + majority = 0.0; + } + val += majority*1000.0; + + + //std::cout << "VALUE " << val << " " << vars(0) << " " << vars(1) << " " << vars(2) << vars(3) << " " << vars(4) << " " << vars(5) << " || " << vars(6) << " " << 
vars(7) << std::endl; + + #ifdef DEBUG_PRINT + std::cout << "VALUE " << val; + for (int i = 0 ; i < N_sample_opt + N_int_opt; i++) { + std::cout << vars(i) + } + std::cout << std::endl; + #endif + + return val; +} + + +template<unsigned int N_int_opt, unsigned int N_sample_opt> +auto model_function_confidence(int selected_interval, + int selected_sample, + Eigen::VectorXd & solution, + openfpm::vector<int> & chr_counts, + openfpm::vector<int> & counts, + int chromosome) { + + double prob = 1.0; + int j = selected_sample; + + auto solution_cn = std::round(solution(N_int_opt + selected_sample)); + + if (solution_cn == 2) {return -1.0;} + + auto estimated_cn2 = solution(0)*chr_counts.get(chromosome*N_sample_opt+j)*2; + auto estimated_solution_cn = solution(0)*chr_counts.get(chromosome*N_sample_opt+j)*solution_cn; + auto estimated_solution_cn_next = solution(0)*chr_counts.get(chromosome*N_sample_opt+j)*(solution_cn+1); + + + // the start is given when the external energy function transition + + int start = 0; + int end = 0; + + // solution of the equation: fabs(start - estimated_cn2)*CN_factor(2) = fabs(estimated_solution_cn - start)*CN_factor(solution_cn) + if (solution_cn > 2) { + // solution of the equation: (start - estimated_cn2)*CN_factor(2) = (estimated_solution_cn - start)*CN_factor(solution_cn); + start = (estimated_solution_cn*CN_factor(solution_cn) + estimated_cn2*CN_factor(2)) / (CN_factor(2) + CN_factor(solution_cn)); + end = (estimated_solution_cn_next*CN_factor(solution_cn+1) + estimated_cn2*CN_factor(2)) / (CN_factor(2) + CN_factor(solution_cn+1)); + std::cout << "START: " << start << " END: " << end << std::endl; + } else { + // solution of the equation: (estimated_cn2 - start)*CN_factor(2) = (start - estimated_solution_cn)*CN_factor(solution_cn); + start = (estimated_solution_cn*CN_factor(solution_cn-1) + estimated_cn2*CN_factor(2)) / (CN_factor(2) + CN_factor(solution_cn-1)); + end = (estimated_solution_cn_next*CN_factor(solution_cn) + estimated_cn2*CN_factor(2)) / (CN_factor(2) + CN_factor(solution_cn)); + std::cout << "START: " << start << " END: " << end << std::endl; + } + + // the stop is given when the external energy function + + //auto prop = poisson_distribution_volume(estimated_cn2,start,end); + + return -1.0; +} + +template<bool print_terms,int dim, int N_int_opt, int N_sample_opt> +auto model_function_CN2(int selected_interval, + Eigen::VectorXd & vars, + Box<dim,double> & domain, + openfpm::vector<int> & chr_counts, + openfpm::vector<int> & counts, + int chromosome, + float reg) { + + double val = 0.0; + + for (int j = 0 ; j < N_sample_opt ; j++) { + for (int i = selected_interval,k=0 ; i < selected_interval + N_int_opt ; i++, k++ ) { + if (vars(k) < 0.0) { + val += domain.getHigh(0)*6.0*chr_counts.get(chromosome*N_sample_opt+j); + if (print_terms) {std::cout << "OUT OF BOUND" << std::endl;} + } else { + val += fabs(vars(k)*chr_counts.get(chromosome*N_sample_opt+j)*2.0f - counts.get(i*N_sample_opt + j))*CN_factor(2.0f) + 2.0f*CN_offset(2.0f); + if (print_terms) {std::cout << "External: " << vars(k)*chr_counts.get(chromosome*N_sample_opt+j)*2.0f - counts.get(i*N_sample_opt + j) << "*" << CN_factor(2.0f) << "+" << 2.0f*CN_offset(2.0f) << std::endl;} + } + + // Penalization for not being near the other regions for fishing + //val += penalization_fishing(vars(k),fishing_left,fishing_right); + } + } + + return val; +} + +void add_to_interval(openfpm::vector<int> & base_count, + openfpm::vector<aggregate<short int,int,int>> & intervals, + int interval, + int sample, + 
int N_samples, + int pos, + int num) { + + int overlap_start = std::max(pos,intervals.template get<start_interval>(interval)); + int overlap_stop = std::min(pos+num,intervals.template get<stop_interval>(interval)); + + base_count.get<0>(N_samples*interval + sample) += std::max(overlap_stop - overlap_start,0); +} + +void add_to_interval_hist(openfpm::vector<openfpm::vector<int>> & histogram, + openfpm::vector<aggregate<short int,int,int>> & intervals, + int interval, + int sample, + int N_samples, + int pos, + int num) { + + int overlap_start = std::max(pos,intervals.template get<start_interval>(interval)); + int overlap_stop = std::min(pos+num,intervals.template get<stop_interval>(interval)); + + for (int i = overlap_start-intervals.template get<start_interval>(interval) ; i < overlap_stop - overlap_start; i++) { + histogram.get(interval).get(i)++; + } + +} + +void find_interval(openfpm::vector<aggregate<short int,int,int>> & intervals, short int chrom_t, int start_t , int & interval) { + if (interval != -1 && + intervals.get<chromosome>(interval) == chrom_t && + start_t < intervals.get<stop_interval>(interval) && + start_t >= intervals.get<start_interval>(interval) ) + {return;} + + interval = 0; + int end = intervals.size() - 1; + + while (end != interval) { + int mid; + if (end - interval == 1) {mid = end; interval = mid;} + else + {mid = (end - interval) / 2 + interval;} + short int mid_chromosome = intervals.get<chromosome>(mid); + int mid_start = intervals.get<start_interval>(mid); + int mid_stop = intervals.get<stop_interval>(mid); + + short int i_chromosome = intervals.get<chromosome>(interval); + int i_start = intervals.get<start_interval>(interval); + int i_stop = intervals.get<stop_interval>(interval); + + if (chrom_t < mid_chromosome) {end = mid; continue;} + if (chrom_t > mid_chromosome) {interval = mid; continue;} + + if (start_t < mid_start) {end = mid; continue;} + if (start_t >= mid_stop) {interval = mid; continue;} + + interval = mid; + return; + } + + interval = -1; + return; +} + +constexpr int READ_NUM_BASES = 0; +constexpr int READ_DEPTH = 1; + +void read_sample_num_bases(std::string bam_file, + openfpm::vector<int> & base_count, + openfpm::vector<int> & chr_count, + openfpm::vector<aggregate<short int,int,int>> & intervals, + int sample, + int N_samples) { + + samFile *fp_in = hts_open(bam_file.c_str(),"r"); //open bam file + bam_hdr_t *bamHdr = sam_hdr_read(fp_in); //read header + bam1_t *aln = bam_init1(); //initialize an alignment + + while(sam_read1(fp_in,bamHdr,aln) > 0){ + + int32_t pos = aln->core.pos +1; //left most position of alignment in zero based coordianate (+1) + if (aln->core.tid == -1) + {continue;} + char *chr = bamHdr->target_name[aln->core.tid] ; //contig name (chromosome) + uint32_t len = aln->core.l_qseq; //length of the read. 
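/*
   Illustrative sketch, not code from this patch: the loop below decodes the CIGAR string with
   the standard htslib accessors. Each CIGAR element is a packed 32-bit word; bam_cigar_op()
   and bam_cigar_oplen() extract the operation and its length, and bam_cigar_opchr() maps the
   operation to its usual character code. Assuming a valid bam1_t *aln, a minimal decoding
   loop looks like:

       uint32_t *cig = bam_get_cigar(aln);                      // same address as aln->data + aln->core.l_qname
       for (uint32_t k = 0; k < aln->core.n_cigar; ++k) {
           char op    = bam_cigar_opchr(bam_cigar_op(cig[k]));  // 'M', 'I', 'D', 'S', 'N', ...
           int  oplen = bam_cigar_oplen(cig[k]);                // bases covered by this operation
           printf("%c%d ", op, oplen);
       }

   In the loop below only 'M' segments contribute bases to the target intervals; every other
   operation just advances the reference position pos.
*/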
+ int n_cigar = aln->core.n_cigar; // number of cigar operations + uint32_t * cigar = (uint32_t *)(aln->data + aln->core.l_qname); + + short int chr_int = chr_to_chr_int(chr); + if (chr_int == -1) {continue;} + + chr_count.get(N_samples*chr_int + sample) += len; + + int j = 0; + int interval = 0; + int interval_end = 0; + + + + for (int k = 0 ; k < n_cigar ; k++) { + int op = bam_cigar_opchr(bam_cigar_op(cigar[k])); + int len = bam_cigar_oplen(cigar[k]); + find_interval(intervals,chr_int,pos,interval); + find_interval(intervals,chr_int,pos+len,interval_end); + if (op == 'M') { + if (interval != -1) { + add_to_interval(base_count,intervals,interval,sample,N_samples,pos,len); + } + if (interval_end != -1) { + add_to_interval(base_count,intervals,interval_end,sample,N_samples,pos,len); + } + } + else if (op == 'D' || op == 'N') {pos += len;} + else { + pos += len; + } + } + } + + bam_destroy1(aln); + sam_close(fp_in); +} + +std::string basename(const std::string &path) { + size_t found = path.find_last_of("/"); + if (found != std::string::npos) { + return path.substr(found + 1); + } + return path; +} + +bool create_histogram(openfpm::vector<openfpm::vector<int>> & histogram, + openfpm::vector<aggregate<short int,int,int>> & intervals, + int chromosome) { + bool empty = true; + histogram.resize(intervals.size()); + for (int i = 0 ; i < intervals.size(); i++) { + if (intervals.get<0>(i) != chromosome) {continue;} + + histogram.get(i).resize(intervals.get<2>(i) - intervals.get<1>(i)); + empty = false; + } + + return empty; +} + +struct params_sum_poisson { + int N_int_opt; + int N_sample_opt; + int interval; + int chromosome; + openfpm::vector<int> *CNs; + openfpm::vector<int> * chr_counts; + openfpm::vector<int> * counts; +}; + +/* Paraboloid centered on (p[0],p[1]), with + scale factors (p[2],p[3]) and minimum p[4] */ + +double sum_poisson_f (const gsl_vector *v, void *params_) +{ + struct params_sum_poisson * p = static_cast<struct params_sum_poisson *>(params_); + openfpm::vector<int> * CNs = static_cast<openfpm::vector<int> *>(p->CNs); + openfpm::vector<int> * chr_counts = static_cast<openfpm::vector<int> *>(p->chr_counts); + openfpm::vector<int> * counts = static_cast<openfpm::vector<int> *>(p->counts); + + int N_int_opt = p->N_int_opt; + int N_sample_opt = p->N_sample_opt; + int interval = p->interval; + int chromosome = p->chromosome; + + double x; + double y = 0.0; + + x = gsl_vector_get(v, 0); + + for (int i = 0 ; i < CNs->size() ; i++) { + auto pv = poisson_distribution(x*CNs->get(i)*chr_counts->get(chromosome*N_sample_opt + i),counts->get(N_sample_opt*interval + i)); + y += -std::log(pv)*CN_factor((double)CNs->get(i)) + 2.0f*CN_offset((double)CNs->get(i)); + } + return y; +} + +/* The gradient of f, df = (df/dx, df/dy). 
*/ +void sum_poisson_df (const gsl_vector *v, void *params_, + gsl_vector *df) +{ + struct params_sum_poisson * p = static_cast<struct params_sum_poisson *>(params_); + openfpm::vector<int> * CNs = static_cast<openfpm::vector<int> *>(p->CNs); + openfpm::vector<int> * chr_counts = static_cast<openfpm::vector<int> *>(p->chr_counts); + openfpm::vector<int> * counts = static_cast<openfpm::vector<int> *>(p->counts); + + int N_int_opt = p->N_int_opt; + int N_sample_opt = p->N_sample_opt; + int interval = p->interval; + int chromosome = p->chromosome; + + double x; + double y = 0.0; + + x = gsl_vector_get(v, 0); + + for (int i = 0 ; i < CNs->size() ; i++) { + y += -1.0 / poisson_distribution(x*CNs->get(i)*chr_counts->get(chromosome*N_sample_opt + i),counts->get(N_sample_opt*interval + i)) * \ + poisson_distribution(x*CNs->get(i)*chr_counts->get(chromosome*N_sample_opt + i),counts->get(N_sample_opt*interval + i)) / x * (counts->get(N_sample_opt*interval + i) - x*CNs->get(i)*chr_counts->get(chromosome*N_sample_opt + i)) * \ + (CN_factor((double)CNs->get(i)) + 2.0f*CN_offset((double)CNs->get(i))); + } + + gsl_vector_set(df, 0, y); +} + +/* Compute both f and df together. */ +void sum_poisson_fdf (const gsl_vector *x, void *params, + double *f, gsl_vector *df) +{ + *f = sum_poisson_f(x, params); + sum_poisson_df(x, params, df); +} + + +double calculateMean(const openfpm::vector<double>& data) { + double sum = 0.0; + for (int i = 0 ; i < data.size(); i++) { + sum += data.get(i); + } + return sum / data.size(); +} + +double calculateStandardDeviation(const openfpm::vector<double>& data) { + double mean = calculateMean(data); + double sumSquaredDifferences = 0.0; + + for (int i = 0 ; i < data.size(); i++ ) { + double difference = data.get(i) - mean; + sumSquaredDifferences += difference * difference; + } + + double standardDev = std::sqrt(sumSquaredDifferences / (data.size() - 1)); + + return standardDev; +} + + +openfpm::vector<double> centroids; +openfpm::vector<int> nPoints; +openfpm::vector<double> sumX; + +struct min_d { + double dist; + int cluster; +}; + +openfpm::vector<min_d> minDist; + +double kmean_1d_clustering_mean_bigger(openfpm::vector<double> & points, int k, int epochs = 100) { + int n = points.size(); + + centroids.clear(); + nPoints.clear(); + sumX.clear(); + minDist.clear(); + + // Randomly initialise centroids + // The index of the centroid within the centroids vector + // represents the cluster label. 
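/*
   Usage sketch (illustrative, not code from this patch): optimize_fast() below calls this
   routine as kmean_1d_clustering_mean_bigger(Eff_samples, 2) to split the per-sample
   efficiency ratios into a majority cluster and an outlier cluster and keep the mean of the
   larger one. A toy call, assuming nothing beyond an openfpm::vector<double>:

       openfpm::vector<double> eff;
       double ratios[] = {0.51, 0.49, 0.50, 0.52, 1.05};   // one clear outlier
       for (double v : ratios) { eff.add(v); }
       double bait_eff = kmean_1d_clustering_mean_bigger(eff, 2);
       // with a sane random initialisation, bait_eff ends up near 0.50,
       // the mean of the four-point cluster
*/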
+ + minDist.resize(points.size()); + for (int i = 0; i < k; ++i) { + centroids.add(points.get(rand() % n)); + } + centroids.sort(); + + for (int i = 0 ; i < minDist.size() ; i++) { + minDist.get(i).dist = std::numeric_limits<double>::max(); + } + + for (int i = 0; i < epochs; ++i) { + // For each centroid, compute distance from centroid to each point + // and update point's cluster if necessary + for (int c = 0; c < centroids.size() ; ++c) { + for (int it = 0; it != points.size(); ++it) { + double p = points.get(it); + double dist = fabs(centroids.get(c) - p); + if (dist < minDist.get(it).dist) { + minDist.get(it).dist = dist; + minDist.get(it).cluster = c; + } + } + } + + nPoints.clear(); + sumX.clear(); + // Create vectors to keep track of data needed to compute means + for (int j = 0; j < k; ++j) { + nPoints.add(0); + sumX.add(0.0); + } + + // Iterate over points to append data to centroids + for (int it = 0; it < points.size() ; ++it) { + int clusterId = minDist.get(it).cluster; + nPoints.get(clusterId) += 1; + sumX.get(clusterId) += points.get(it); + + minDist.get(it).dist = std::numeric_limits<double>::max(); // reset distance + } + // Compute the new centroids + for (int c = 0; c < centroids.size() ; ++c) { + centroids.get(c) = sumX.get(c) / nPoints.get(c); + } + } + + // return the mean of the bigger cluster + + int biggest_k = 0; + int b_c = 0; + for (int i = 0 ; i < k; i++) { + if (nPoints.get(i) >= biggest_k) { + biggest_k = nPoints.get(i); + b_c = i; + } + } + + return centroids.get(b_c); +} + +template<unsigned int N_int_opt, unsigned int N_sample_opt> +void optimize_fast(std::ofstream & output, + Box<N_sample_opt+N_int_opt,double> & domain, + double target, + openfpm::vector<double> & sol, + openfpm::vector<double> & confidence, + openfpm::vector<int> & counts, + openfpm::vector<int> & chr_count, + openfpm::vector<openfpm::vector<double>> & window_noise, + openfpm::vector<int> & window_noise_pointer, + int interval, + int chromosome, + int n_intervals) { + + constexpr int dim = N_int_opt + N_sample_opt; + + sol.resize(N_int_opt + N_sample_opt); + confidence.resize(2*(N_int_opt + N_sample_opt)); + openfpm::vector<int> CNs; + openfpm::vector<aggregate<float[6]>> CNs_lhood; + + openfpm::vector<double> Eff_samples; + + Eff_samples.resize(N_sample_opt); + CNs.resize(N_sample_opt); + CNs_lhood.resize(N_sample_opt); + + // First we estimate the efficiency of the bait on CN=2 + double val_prev = 0.0; + + // more than N_sample_opt/2 samples must have depth higher than 30 + int ns_big_30 = 0; + int chr_count_zero = 0; + + // we calculate the best lamda for the sample + for (int j = 0 ; j < N_sample_opt ; j++) { + Eff_samples.get(j) = counts.get(interval*N_sample_opt + j) / 2.0 / chr_count.get(chromosome*N_sample_opt+j); + if (counts.get(interval*N_sample_opt + j) > 15) { + ns_big_30++; + } + if (chr_count.get(chromosome*N_sample_opt+j) == 0) { + chr_count_zero++; + } + } + + if (ns_big_30 < N_sample_opt / 2 || chr_count_zero != 0) { + sol.get(0) = -1.0; + for (int i = 0 ; i < N_sample_opt ; i++) { + sol.get(i+1) = 2; + } + return; + } + + // Take the nearest N_sample_opt / 2 + Eff_samples.sort(); + + // find the smallest in right-left NN distance + double distance = 10000000.0; + int best_eff = -1; + for (int i = 1 ; i < Eff_samples.size() - 1 ; i++) { + if (Eff_samples.get(i) < 0.0001) {continue;} + double distance_left = Eff_samples.get(i) - Eff_samples.get(i-1); + double distance_right = Eff_samples.get(i+1) - Eff_samples.get(i); + if (distance_right + distance_left < distance ) { + 
best_eff = i; + distance = distance_right + distance_left; + } + } + + // Calculate the efficency as average of the near N_samples_opt / 2 +/* int left_pointer = best_eff - 1; + int right_pointer = best_eff + 1; + double accumulate = Eff_samples.get(best_eff); + for (int k = 0 ; k < N_sample_opt / 2; k++) { + if (Eff_samples.get(best_eff) - Eff_samples.get(left_pointer) < Eff_samples.get(right_pointer) - Eff_samples.get(best_eff)) { + accumulate += Eff_samples.get(left_pointer); + left_pointer -= 1; + } else { + accumulate += Eff_samples.get(right_pointer); + right_pointer += 1; + } + }*/ + +// accumulate /= right_pointer - left_pointer - 1; + + double accumulate = kmean_1d_clustering_mean_bigger(Eff_samples,2); + + if (accumulate > 5.0) { + sol.get(0) = -1.0; + for (int i = 0 ; i < N_sample_opt ; i++) { + sol.get(i+1) = 2; + } + return; + } + + #ifndef NO_PRINT + std::cout << std::endl; + #endif + + // using this efficency we find the best CN for all the samples + for (int j = 0 ; j < N_sample_opt ; j++) { + int best_CN = 0; + double best_pv = 5000.0; + for (int CN = 0 ; CN < 6 ; CN++) { + auto pv = poisson_distribution(chr_count.get(chromosome*N_sample_opt+j)*accumulate*CN,counts.get(interval*N_sample_opt + j)); + pv = -std::log(pv)*CN_factor((double)CN) + 2.0f*CN_offset((double)CN); + if (pv < best_pv) { + best_pv = pv; + best_CN = CN; + } + } + CNs.get(j) = best_CN; + } + + gsl_vector * x = gsl_vector_alloc (1); + // while is unstable + for (int opt_s = 0 ; opt_s < 10 ; opt_s++) { + const gsl_multimin_fdfminimizer_type * T; + gsl_multimin_fdfminimizer *s; + + // Finaly we optimize the efficency of the bait as summation of poisson + gsl_multimin_function_fdf poisson; + + struct params_sum_poisson p; + p.N_sample_opt = N_sample_opt; + p.N_int_opt = N_int_opt; + + p.CNs = &CNs; + p.chr_counts = &chr_count; + p.counts = &counts; + p.interval = interval; + p.chromosome = chromosome; + + poisson.n = 1; /* number of function components */ + poisson.f = &sum_poisson_f; + poisson.df = &sum_poisson_df; + poisson.fdf = &sum_poisson_fdf; + poisson.params = (void *)&p; + + gsl_vector_set (x, 0, accumulate); + + T = gsl_multimin_fdfminimizer_conjugate_fr; + s = gsl_multimin_fdfminimizer_alloc (T, 1); + + gsl_multimin_fdfminimizer_set (s, &poisson, x, 0.01, 1e-4); + + int status = 0; + + int iter = 0; + do + { + iter++; + status = gsl_multimin_fdfminimizer_iterate (s); + + if (status) + break; + + status = gsl_multimin_test_gradient (s->gradient, 1e-3); + } + while (status == GSL_CONTINUE && iter < 100); + + accumulate = gsl_vector_get (s->x, 0); + + gsl_multimin_fdfminimizer_free (s); + + bool is_stable = true; + // using this efficency we find the best CN for all the samples + for (int j = 0 ; j < N_sample_opt ; j++) { + int best_CN = 0; + double best_pv = 5000.0; + for (int CN = 0 ; CN < 6 ; CN++) { + auto pv = poisson_distribution(chr_count.get(chromosome*N_sample_opt+j)*accumulate*CN,counts.get(interval*N_sample_opt + j)); + pv = -std::log(pv)*CN_factor((double)CN) + 2.0f*CN_offset((double)CN); + CNs_lhood.get<0>(j)[CN] = pv; + if (pv < best_pv) { + best_pv = pv; + best_CN = CN; + } + } + if (CNs.get(j) != best_CN) { + CNs.get(j) = best_CN; + is_stable = false; + } + } + + if (is_stable == true) {break;} + } + + gsl_vector_free(x); + + output << interval << "\t"; + for (int i = 0 ; i < N_sample_opt ; i++) { + output << CNs_lhood.get<0>(i)[0]; + for (int j = 1 ; j < 6 ; j++) { + output << ":" << CNs_lhood.get<0>(i)[j]; + } + output << "\t"; + } + output << "\n"; + + sol.get(0) = accumulate; + 
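/*
   Note restating the scoring used above (formulas taken from model_function() and
   sum_poisson_f(), no new behaviour): for sample j the expected interval depth is

       lambda_j = efficiency * chr_count(chromosome, j) * CN_j

   and every candidate copy number CN_j in 0..5 is scored with

       -log Poisson(count_j ; lambda_j) * CN_factor(CN_j) + 2 * CN_offset(CN_j)

   The loop above alternates between picking the best integer CN per sample for the current
   efficiency and refitting the efficiency with the GSL conjugate-gradient minimiser for the
   fixed CNs, stopping when no CN changes (or after 10 rounds). The block just above writes
   one row per interval to the CN-probability TSV: the interval index followed, for each
   sample, by the six colon-separated scores for CN = 0..5.
*/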
confidence.get(0) = 0; + #ifndef NO_PRINT + std::cout << "CN_cont: "; + #endif + for (int i = 0 ; i < CNs.size() ; i++) { + sol.get(i+N_int_opt) = CNs.get(i); + + // Confidence is calculated as the probability of the number CN=2, multiplied by the number of intervals + if (CNs.get(i) != 2) { + double p_CN = poisson_distribution(chr_count.get(chromosome*N_sample_opt+i)*accumulate*CNs.get(i),counts.get(interval*N_sample_opt + i)); + double p_CN2 = poisson_distribution(chr_count.get(chromosome*N_sample_opt+i)*accumulate*2.0,counts.get(interval*N_sample_opt + i)); + confidence.get(2*(i+N_int_opt)) = p_CN2 / (p_CN + p_CN2) * n_intervals; + + confidence.get(2*(i+N_int_opt) + 1) = 0.0; + #ifndef NO_PRINT + std::cout << CN_cont << "(" << window_noise_pointer.get(i) << ")" << " "; + #endif + } + else { + double CN_cont = (double)counts.get(interval*N_sample_opt + i) / (double)chr_count.get(chromosome*N_sample_opt+i) / accumulate; + window_noise.get(i).get(window_noise_pointer.get(i)) = CN_cont; + #ifndef NO_PRINT + std::cout << CN_cont << "(" << window_noise_pointer.get(i) << ")" << " "; + #endif + double error = calculateStandardDeviation(window_noise.get(i)); + confidence.get(2*(i+N_int_opt)) = 0.0; + confidence.get(2*(i+N_int_opt)+1) = 0.0; + window_noise_pointer.get(i) = (window_noise_pointer.get(i) + 1)%30; + } + } + #ifndef NO_PRINT + std::cout << std::endl; + #endif +} + + +void read_sample_depth(std::string bam_file, + openfpm::vector<int> & base_count, + openfpm::vector<int> & chr_count, + openfpm::vector<aggregate<short int,int,int>> & intervals, + int sample, + int N_samples) { + + + // for all chromosomes + for (int i = 0 ; i < 25 ; i++) { + samFile *fp_in = hts_open(bam_file.c_str(),"r"); //open bam file + bam_hdr_t *bamHdr = sam_hdr_read(fp_in); //read header + bam1_t *aln = bam_init1(); //initialize an alignment + + openfpm::vector<openfpm::vector<int>> intervals_hist; + intervals_hist.resize(intervals.size()); + bool empty = create_histogram(intervals_hist,intervals,i); + + if (empty == true) {continue;} + + while(sam_read1(fp_in,bamHdr,aln) > 0){ + + int32_t pos = aln->core.pos +1; //left most position of alignment in zero based coordianate (+1) + if (aln->core.tid == -1) + {continue;} + char *chr = bamHdr->target_name[aln->core.tid] ; //contig name (chromosome) + uint32_t len = aln->core.l_qseq; //length of the read. 
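/*
   Usage sketch for find_interval(), which the CIGAR loop below relies on (illustrative, not
   code from this patch): the routine binary-searches the sorted (chromosome, start, stop)
   intervals, reuses the passed-in index as a cache of the previous hit, and writes -1 into it
   when the position is not covered by any target interval:

       int iv = 0;                                   // cached index from the previous lookup
       find_interval(intervals, chr_int, pos, iv);
       if (iv != -1) {
           // pos lies in [ intervals.get<start_interval>(iv), intervals.get<stop_interval>(iv) )
       }
*/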
+ int n_cigar = aln->core.n_cigar; // number of cigar operations + uint32_t * cigar = (uint32_t *)(aln->data + aln->core.l_qname); + + short int chr_int = chr_to_chr_int(chr); + if (chr_int == -1) {continue;} + if (chr_int != i) {continue;}; + + chr_count.get(N_samples*chr_int + sample) += len; + + int j = 0; + int interval = 0; + int interval_end = 0; + + for (int k = 0 ; k < n_cigar ; k++) { + int op = bam_cigar_opchr(bam_cigar_op(cigar[k])); + int len = bam_cigar_oplen(cigar[k]); + find_interval(intervals,chr_int,pos,interval); + find_interval(intervals,chr_int,pos+len,interval_end); + if (op == 'M') { + if (interval != -1) { + add_to_interval_hist(intervals_hist,intervals,interval,sample,N_samples,pos,len); + } + if (interval_end != -1) { + add_to_interval_hist(intervals_hist,intervals,interval_end,sample,N_samples,pos,len); + } + } + else if (op == 'D' || op == 'N') {pos += len;} + else { + pos += len; + } + } + } + + bam_destroy1(aln); + sam_close(fp_in); + + // Now reconstruct max depth for each interval + + for (int j = 0 ; j < intervals.size() ; j++) { + if (intervals.get<0>(j) != i) {continue;} + + int max_depth = 0; + + for (int k = 0; k < intervals_hist.get(j).size() ; k++) { + if (intervals_hist.get(j).get(k) > max_depth) { + max_depth = intervals_hist.get(j).get(k); + } + } + base_count.get(N_samples*j + sample) = max_depth; + } + } +} + +void read_sample(std::string bam_file, + openfpm::vector<int> & base_count, + openfpm::vector<int> & chr_count, + openfpm::vector<aggregate<short int,int,int>> & intervals, + int sample, + int N_samples, + int options = READ_NUM_BASES) { + + if (options == READ_NUM_BASES) { + read_sample_num_bases(bam_file,base_count,chr_count,intervals,sample,N_samples); + } else { + read_sample_depth(bam_file,base_count,chr_count,intervals,sample,N_samples); + } + +} + +void initialize_counts(openfpm::vector<int> & counts, + openfpm::vector<int> & chr_counts, + openfpm::vector<aggregate<short int,int,int>> & intervals, + openfpm::vector<std::string> & files_bams, + int N_samples, + int N_chr) { + + counts.resize(N_samples*intervals.size()); + chr_counts.resize(N_chr*N_samples); + + for (int i = 0 ; i < files_bams.size() ; i++) { + std::cout << "READING: " << files_bams.get(i) << std::endl; + read_sample(files_bams.get(i),counts,chr_counts,intervals,i,N_samples); + } + + // count by depth + int count = 0; + for (int j = 0 ; j < intervals.size() ; j++) { + for (int i = 0 ; i < N_samples ; i++) { + counts.get(N_samples*j + i) /= intervals.get<2>(j) - intervals.get<1>(j); + } + } + + // Calculate the number of bases in the interval + + int n_bases_chr[] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + for (int i = 0 ; i < intervals.size() ; i++) { + n_bases_chr[intervals.get<0>(i)] += intervals.get<2>(i) - intervals.get<1>(i); + } + + //int chromosome = 4; + // Divide chromosome to get max_count + + for (int c = 0 ; c < N_chr ; c++) { + std::cout << "NBASES: " << n_bases_chr[c] << std::endl; + for (int j = 0 ; j < N_samples ; j++) { + chr_counts.get(c*N_samples+j)/=(double)n_bases_chr[c]; + } + } +} + +template<unsigned int N_sample_opt, unsigned int N_int_opt> +void initialize_counts_chr_counts_int(openfpm::vector<int> & chr_counts_int, + openfpm::vector<int> & counts, + openfpm::vector<aggregate<float[N_sample_opt+2]>> & solutions, + openfpm::vector<aggregate<short int,int,int>> & intervals) { + int N_samples = N_sample_opt; + + // Tranform count to absolute count + // count by depth + int count = 0; + for (int j = 0 ; j < 
intervals.size() ; j++) { + for (int i = 0 ; i < N_samples ; i++) { + counts.get(N_samples*j + i) *= (intervals.get<2>(j) - intervals.get<1>(j)); + } + } + + + chr_counts_int.resize(N_samples*intervals.size()); + + for (int j = 0 ; j < intervals.size() ; j++) { + + if (j == 215684) { + int debug = 0; + debug++; + } + + for (int i = 0 ; i < N_samples ; i++) { + int loc_chr_count = 0; + double norm = 0.0; + int n_bases = 0.0; + int chromosome = intervals.get<0>(j); + + int n_backward_interval = 0; + int backward_pointer = 1; + + while (n_backward_interval < 2 && j - backward_pointer >= 0) { + bool valid = true; + for (int k = 0 ; k < N_sample_opt ; k++) { + valid &= (solutions.template get<0>(j-backward_pointer)[k+1] == 2); + } + if (intervals.get<0>(j-backward_pointer) == chromosome && valid) { + n_backward_interval++; + loc_chr_count += counts.get(N_samples*(j-backward_pointer) + i) * 0.5 / n_backward_interval; + norm += 0.50 / n_backward_interval; + n_bases += (intervals.get<2>(j-backward_pointer) - intervals.get<1>(j-backward_pointer))*0.5; + } + backward_pointer++; + } + + int n_forward_interval = 0; + int forward_pointer = 1; + + while (n_forward_interval < 2 && j + forward_pointer < intervals.size()) { + bool valid = true; + for (int k = 0 ; k < N_sample_opt ; k++) { + valid &= (solutions.template get<0>(j+forward_pointer)[k+1] == 2); + } + if (intervals.get<0>(j+forward_pointer) == chromosome && valid) { + n_forward_interval++; + loc_chr_count += counts.get(N_samples*(j+forward_pointer) + i) * 0.5 / n_forward_interval; + norm += 0.50 / n_forward_interval; + n_bases += (intervals.get<2>(j+forward_pointer) - intervals.get<1>(j+forward_pointer))*0.5 / n_forward_interval; + } + forward_pointer++; + } + // Normalization 0.5+0.5+0.25+0.25 + loc_chr_count /= norm; + chr_counts_int.get(j*N_samples + i) = (double)loc_chr_count / n_bases; + } + } + + // Retransform count + for (int j = 0 ; j < intervals.size() ; j++) { + for (int i = 0 ; i < N_samples ; i++) { + counts.get(N_samples*j + i) /= (intervals.get<2>(j) - intervals.get<1>(j)); + } + } +} + +std::ofstream create_call_tsv_file_with_header(std::string name, + openfpm::vector<std::string> & files_bams) { + std::ofstream fout; + + auto & v_cl = create_vcluster(); + + if (v_cl.rank() == 0) { + // Open a TSV file for writing + fout.open(name); + if (!fout.is_open()) { + std::cerr << "Failed to open the file for writing.\n"; + return fout; + } + fout << "N" << '\t' << "chromosome" << '\t' << "start" << '\t' << "stop"; + for (int j = 0 ; j < files_bams.size() ; j++) { + fout << '\t' << basename(files_bams.get(j)); + } + fout << '\t' << "value" << '\t' << "CALL" << std::endl; + } + + return fout; +} + +std::ofstream create_call_CN_prob(std::string name, + openfpm::vector<std::string> & files_bams) { + std::ofstream fout; + + auto & v_cl = create_vcluster(); + + if (v_cl.rank() == 0) { + // Open a TSV file for writing + fout.open(name); + if (!fout.is_open()) { + std::cerr << "Failed to open the file for writing.\n"; + return fout; + } + fout << "N" << '\t' << "chromosome" << '\t' << "start" << '\t' << "stop"; + for (int j = 0 ; j < files_bams.size() ; j++) { + fout << '\t' << basename(files_bams.get(j)); + } + } + + return fout; +} + +template<unsigned int N_int_opt, unsigned int N_sample_opt> +void initialize_domain(Box<N_int_opt+N_sample_opt,double> & domain) { + for (size_t i = 0 ; i < N_int_opt ; i++) + { + domain.setLow(i,0.0); + domain.setHigh(i,1.0); + } + + for (size_t i = N_int_opt ; i < N_int_opt + N_sample_opt ; i++) + { + 
domain.setLow(i,0.0); + domain.setHigh(i,6.0); + } +} + +template<unsigned int N_int_opt, unsigned int N_sample_opt> +int call_single_interval_mean_depth(openfpm::vector<aggregate<short int,int,int>> & intervals, + openfpm::vector<std::string> & files_bams, + openfpm::vector<int> & counts, + openfpm::vector<aggregate<float[N_sample_opt+2]>> & solutions, + openfpm::vector<aggregate<float[N_sample_opt]>> & confidence){ + auto & v_cl = create_vcluster(); + + constexpr int dim = N_sample_opt + N_int_opt; + Box<dim,double> domain; + + initialize_domain<N_int_opt,N_sample_opt>(domain); + + int N_samples = N_sample_opt; + int N_chr = 24; + + openfpm::vector<int> chr_counts; + + #ifndef SKIP_LOAD + + initialize_counts(counts,chr_counts,intervals,files_bams,N_samples,N_chr); + + counts.save("counts_vector"); + chr_counts.save("chr_counts_vector"); + intervals.save("intervals_vector"); + + #endif + + #ifdef SKIP_LOAD + + counts.load("counts_vector"); + chr_counts.load("chr_counts_vector"); + intervals.load("intervals_vector"); + + #endif + + solutions.resize(intervals.size()); + confidence.resize(intervals.size()); + + if (v_cl.rank() == 0) { + std::cout << "CALLING ON EVERY INTERVALS BY MEAN DEPTH" << std::endl; + } + + if (v_cl.rank() == 0) { + + for (int j = 0 ; j < N_chr ; j++) { + std::cout << "chr " << j << " "; + for (int i = 0 ; i < N_samples ; i++) { + std::cout << " " << chr_counts.get(N_samples*j + i); + } + std::cout << std::endl; + } + } + + int n_interval_to_process = 0; + for (int i = 0 ; i < intervals.size() ; i++) { + bool is_zero = true; + for (int j = 0 ; j < N_samples; j++) { + if (counts.get(N_samples*i + j) != 0) { + is_zero = false; + } + } + if (is_zero == true) {continue;} + n_interval_to_process++; + } + + std::ofstream fout = create_call_tsv_file_with_header("calls.tsv",files_bams); + std::ofstream fout_CN_prob = create_call_CN_prob("calls_CN_probs.tsv",files_bams); + + openfpm::vector<openfpm::vector<double>> window_noise; + openfpm::vector<int> window_noise_pointer; + window_noise.resize(N_sample_opt); + window_noise_pointer.resize(N_sample_opt); + for (int i = 0 ; i < N_sample_opt; i++) { + window_noise.get(i).resize(30); + window_noise_pointer.get(i) = 0; + } + int interval_processed = 0; + for (int i = 0 ; i < intervals.size() ; i++) { + if (v_cl.rank() == 0) { + #ifndef NO_PRINT + std::cout << "---------------------------------------------------------------------------------------------------" << std::endl; + #endif + } + float reg = 2000; + int selected_interval = i; + double fishing_left = 0.5; + double fishing_right = 0.5; + + int chromosome = intervals.template get<0>(i); + + auto lamb_eval = [&](Eigen::VectorXd & vars){ + return model_function<true,dim,N_int_opt,N_sample_opt>(selected_interval,vars,domain,chr_counts,counts,chromosome,reg); + }; + + auto lamb = [&](Eigen::VectorXd & vars){ + return model_function<false,dim,N_int_opt,N_sample_opt>(selected_interval,vars,domain,chr_counts,counts,chromosome,reg); + }; + + // Check there is data + + bool is_zero = true; + for (int j = 0 ; j < N_samples; j++) { + if (counts.get(N_samples*i + j) != 0) { + is_zero = false; + } + } + if (is_zero == true) {continue;} + + openfpm::vector<double> sol; + openfpm::vector<double> confidence; + if (v_cl.rank() == 0) { + #ifndef NO_PRINT + std::cout << "Processing interval: " << interval_processed << "/" << n_interval_to_process << std::endl; + std::cout << "Chromosome: " << intervals.get<0>(i)+1 << " start: " << intervals.get<1>(i) << " stop: " << intervals.get<2>(i) << 
std::endl; + #endif + } + if (v_cl.rank() == 0) { + int chromosome = intervals.template get<0>(i); + if (i == 110000) { + int debug = 0; + debug++; + } + optimize_fast<N_int_opt,N_sample_opt>(fout_CN_prob,domain,0.0,sol,confidence,counts,chr_counts,window_noise,window_noise_pointer,i,chromosome,intervals.size()); + #ifndef NO_PRINT + for (int k = 0 ; k < sol.size() ; k++) { + std::cout << sol.get(k) << '\t'; + } + #endif + Eigen::VectorXd vars; + vars.resize(sol.size()); + for (int s = 0 ; s < sol.size() ; s++) { + vars(s) = sol.get(s); + } + #ifndef NO_PRINT + if (vars(0) > 0.0) { + std::cout << " f: " << lamb(vars) << std::endl; + } else { + std::cout << " f: " << "TOO LOO QUALITY FOR CALL " << std::endl; + } + std::cout << std::endl; + #endif + } + //optimize(domain,0.0,sol,lamb); + if (v_cl.rank() == 0) { + #ifndef NO_PRINT + std::cout << std::endl; + #endif + } + interval_processed++; + bool is_call = false; + fout << i << '\t' << intervals.get<0>(i)+1 << '\t' << intervals.get<1>(i) << '\t' << intervals.get<2>(i); + for (int k = 0 ; k < sol.size() ; k++) { + solutions.template get<0>(i)[k] = sol.get(k); + if (k != 0 && std::round(solutions.template get<0>(i)[k]) != 2) { + is_call = true; + } + if (v_cl.rank() == 0) + {fout << '\t' << solutions.template get<0>(i)[k];} + } + Eigen::VectorXd vars; + vars.resize(sol.size()); + for (int s = 0 ; s < sol.size() ; s++) { + vars(s) = sol.get(s); + } + + // Print confidence + if (v_cl.rank() == 0) { + #ifndef NO_PRINT + std::cout << "CONFIDENCE: "; + for (int i = N_int_opt ; i < N_int_opt + N_sample_opt ; i++) { + std::cout << confidence.get(2*i) << "(" << confidence.get(2*i+1) << ")" << " "; + } + std::cout << std::endl; + #endif + } + + if (v_cl.rank() == 0) { + if (sol.get(0) >= 0.0) + {fout << '\t' << lamb(vars) << '\t' << i;} + else + {fout << '\t' << "NO QUALITY" << '\t' << i;} + } + if (v_cl.rank() == 0) { + if (is_call == true) { + #ifndef NO_PRINT + std::cout << "----------------------------------------------------------------------------------------" << std::endl; + std::cout << "-----------------------------------CALL-------------------------------------------------" << std::endl; + std::cout << "----------------------------------------------------------------------------------------" << std::endl; + #endif + fout << '\t' << "CALL" << std::endl; + } + else { + fout << '\t' << "." 
<< std::endl; + #ifndef NO_PRINT + std::cout << std::endl; + #endif + } + for (int j = 0 ; j < N_sample_opt ; j++) { + #ifndef NO_PRINT + std::cout << basename(files_bams.get(j)) << " " << solutions.template get<0>(i)[1+j] << " "; + #endif + } + } + fout << std::flush; + } + + return 0; +} + +template<unsigned int N_int_opt, unsigned int N_sample_opt> +int call_single_interval_mean_depth_local(openfpm::vector<aggregate<short int,int,int>> & intervals, + openfpm::vector<std::string> & files_bams, + openfpm::vector<int> & counts, + openfpm::vector<aggregate<float[N_sample_opt+2]>> & solutions, + openfpm::vector<aggregate<float[N_sample_opt+2]>> & solutions_out, + openfpm::vector<aggregate<float[N_sample_opt]>> & confidence){ + openfpm::vector<int> chr_counts; + solutions_out.resize(intervals.size()); + confidence.resize(intervals.size()); + + constexpr int dim = N_sample_opt + N_int_opt; + + int N_samples = N_sample_opt; + int N_chr = 24; + chr_counts.resize(N_chr*N_samples); + openfpm::vector<int> chr_counts_int; + + initialize_counts_chr_counts_int<N_sample_opt,N_int_opt>(chr_counts_int,counts,solutions,intervals); + + Box<dim,double> domain; + + initialize_domain<N_int_opt,N_sample_opt>(domain); + + auto & v_cl = create_vcluster(); + + if (v_cl.rank() == 0) { + std::cout << "CALLING ON EVERY INTERVALS BY MEAN DEPTH LOCAL" << std::endl; + } + + std::ofstream fout = create_call_tsv_file_with_header("calls_loc.tsv",files_bams); + std::ofstream fout_CN_prob = create_call_CN_prob("calls_CN_probs_loc.tsv",files_bams); + + openfpm::vector<openfpm::vector<double>> window_noise; + openfpm::vector<int> window_noise_pointer; + window_noise.resize(N_sample_opt); + window_noise_pointer.resize(N_sample_opt); + for (int i = 0 ; i < N_sample_opt; i++) { + window_noise.get(i).resize(30); + window_noise_pointer.get(i) = 0; + } + int interval_processed = 0; + for (int i = 0 ; i < intervals.size() ; i++) { + if (v_cl.rank() == 0) { + #ifndef NO_PRINT + std::cout << "---------------------------------------------------------------------------------------------------" << std::endl; + #endif + } + float reg = 2000; + int selected_interval = i; + double fishing_left = 0.5; + double fishing_right = 0.5; + + int chromosome = intervals.template get<0>(i); + + // Check there is data + + bool is_zero = true; + for (int j = 0 ; j < N_samples; j++) { + if (counts.get(N_samples*i + j) != 0) { + is_zero = false; + } + } + if (is_zero == true) {continue;} + + auto lamb = [&](Eigen::VectorXd & vars){ + return model_function<false,dim,N_int_opt,N_sample_opt>(selected_interval,vars,domain,chr_counts,counts,chromosome,reg); + }; + + openfpm::vector<double> sol; + openfpm::vector<double> confidence; + if (v_cl.rank() == 0) { + #ifndef NO_PRINT + std::cout << "Processing interval: " << interval_processed << std::endl; + std::cout << "Chromosome: " << intervals.get<0>(i)+1 << " start: " << intervals.get<1>(i) << " stop: " << intervals.get<2>(i) << std::endl; + #endif + } + if (v_cl.rank() == 0) { + int chromosome = intervals.template get<0>(i); + + // set chr_count_loc + for (int j = 0 ; j < N_sample_opt ; j++) { + chr_counts.get(chromosome*N_sample_opt + j) = chr_counts_int.get(N_sample_opt*i+j); + } + if (i == 53295) { + int debug = 0; + debug++; + } + optimize_fast<N_int_opt,N_sample_opt>(fout_CN_prob,domain,0.0,sol,confidence,counts,chr_counts,window_noise,window_noise_pointer,i,chromosome,intervals.size()); + for (int k = 0 ; k < sol.size() ; k++) { + #ifndef NO_PRINT + std::cout << sol.get(k) << '\t'; + #endif + } + 
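/*
   Note on this "local" pass (restates initialize_counts_chr_counts_int(), defined above):
   instead of the chromosome-wide rate, chr_counts is overwritten per interval with a local
   per-base rate built from up to two flanking intervals on each side that lie on the same
   chromosome and were called CN = 2 for every sample in the first pass. Their counts enter
   with weight 0.5 divided by their distance rank; the weighted sum is normalised by the total
   weight and by the weighted number of flanking bases, and the result replaces
   chr_counts.get(chromosome*N_sample_opt + j) just before optimize_fast() is called above.
*/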
Eigen::VectorXd vars; + vars.resize(sol.size()); + for (int s = 0 ; s < sol.size() ; s++) { + vars(s) = sol.get(s); + } + #ifndef NO_PRINT + if (vars(0) > 0.0) { + std::cout << " f: " << lamb(vars) << std::endl; + } else { + std::cout << " f: " << "TOO LOO QUALITY FOR CALL " << std::endl; + } + std::cout << std::endl; + #endif + } + v_cl.barrier(); + //optimize(domain,0.0,sol,lamb); + if (v_cl.rank() == 0) { + #ifndef NO_PRINT + std::cout << std::endl; + #endif + } + interval_processed++; + bool is_call = false; + fout << intervals.get<0>(i)+1 << '\t' << intervals.get<1>(i) << '\t' << intervals.get<2>(i); + for (int k = 0 ; k < sol.size() ; k++) { + solutions_out.template get<0>(i)[k] = sol.get(k); + if (k != 0 && std::round(solutions_out.template get<0>(i)[k]) != 2) { + is_call = true; + } + if (v_cl.rank() == 0) + {fout << '\t' << solutions_out.template get<0>(i)[k];} + } + Eigen::VectorXd vars; + vars.resize(sol.size()); + for (int s = 0 ; s < sol.size() ; s++) { + vars(s) = sol.get(s); + } + + // Print confidence + if (v_cl.rank() == 0) { + #ifndef NO_PRINT + std::cout << "CONFIDENCE: "; + for (int i = N_int_opt ; i < N_int_opt + N_sample_opt ; i++) { + std::cout << confidence.get(2*i) << "(" << confidence.get(2*i+1) << ")" << " "; + } + std::cout << std::endl; + #endif + } + + if (v_cl.rank() == 0) { + if (sol.get(0) >= 0.0) + {fout << '\t' << lamb(vars) << '\t' << i;} + else + {fout << '\t' << "NO QUALITY" << '\t' << i;} + } + if (v_cl.rank() == 0) { + if (is_call == true) { + #ifndef NO_PRINT + std::cout << "----------------------------------------------------------------------------------------" << std::endl; + std::cout << "-----------------------------------CALL-------------------------------------------------" << std::endl; + std::cout << "----------------------------------------------------------------------------------------" << std::endl; + #endif + fout << '\t' << "CALL" << std::endl; + } + else { + fout << '\t' << "." << std::endl; + #ifndef NO_PRINT + std::cout << std::endl; + #endif + } + #ifndef NO_PRINT + for (int j = 0 ; j < N_sample_opt ; j++) { + std::cout << basename(files_bams.get(j)) << " " << solutions_out.template get<0>(i)[1+j] << " "; + } + #endif + } + fout << std::flush; + } + + return 0; +} + + +std::vector<std::string> split(const std::string& input, const std::string& delimiter) { + std::vector<std::string> tokens; + size_t start = 0; + size_t end = input.find(delimiter); + + while (end != std::string::npos) { + tokens.push_back(input.substr(start, end - start)); + start = end + delimiter.length(); + end = input.find(delimiter, start); + } + + // Add the last token (or the only token if there's no delimiter) + tokens.push_back(input.substr(start)); + + return tokens; +} + +bool startsWith(const std::string& str, const std::string& prefix) { + if (str.length() < prefix.length()) { + return false; + } + return str.substr(0, prefix.length()) == prefix; +} + +void read_processed_intervals(std::string bed_f, openfpm::vector<aggregate<short int,int, int>> & intervals, openfpm::vector<float> & values, openfpm::vector<aggregate<float[16]>> & solutions) { + std::ifstream procIntervals(bed_f.c_str()); + + if (!procIntervals.is_open()) { + std::cerr << "Error: Unable to open the BED file." 
<< std::endl; + return; + } + + std::string line; + + int next = 0; + + char str[256]; + + while (std::getline(procIntervals, line)) { + if (line[0] == 0) {continue;} + if (line[0] == '-') {continue;} + + int chr_int; + int start; + int stop; + + if (startsWith(line,"Chromosome:")) { + std::string sstart = "start:"; + std::vector<std::string> v = split(line,sstart); + std::string schr = "Chromosome:"; + std::vector<std::string> v2 = split(v[0],schr); + std::string sstop = "stop:"; + std::vector<std::string> v3 = split(v[1],sstop); + + chr_int = std::atoi(v2[1].c_str())-1; + start = std::atoi(v3[0].c_str()); + stop = std::atoi(v3[1].c_str()); + + intervals.add(); + intervals.last().get<0>() = chr_int; + intervals.last().get<1>() = start; + intervals.last().get<2>() = stop; + } + + if (startsWith(line,"Best solution:")) { + std::vector<std::string> v = split(line,"solution:"); + std::vector<std::string> v3 = split(v[1],"with"); + float value = std::atof(v3[0].c_str()); + values.add(value); + } + + if (next == 1) + { + next = -1; + std::istringstream iss(line); + + solutions.add(); + + int i = 0; + while (iss) { + iss >> solutions.last().get<0>()[i]; + i++; + } + } + if (startsWith(line,"at")) { + next = 1; + } + } + + procIntervals.close(); +} + + + +struct BreakPoint { + short int chr; + int pos; + int count; + int depth = 0; + int operation = 0; + + BreakPoint() {} + + BreakPoint(const short int & chr, const int & pos, const int & count) + :chr(chr),pos(pos),count(count) + {} + + bool operator<(const BreakPoint & b) const { + if (chr < b.chr) return true; + if (chr > b.chr) return false; + if (pos < b.pos) return true; + + return false; + } +}; + +void break_intervals(openfpm::vector<aggregate<short int,int,int>> & intervals, openfpm::vector<aggregate<short int, int, int, int, int>> & breaks) { + int j = 0; + + if (breaks.size() == 0) {return;} + + openfpm::vector<openfpm::vector<aggregate<short int,int,int>>> intervals_tmp; + intervals_tmp.resize(intervals.size()); + + for (int i = 0 ; i < intervals.size() ; i++) { + intervals_tmp.get(i).add(intervals.get(i)); + } + + for (int i = 0 ; i < intervals.size() && j < breaks.size() ; ) { + + short int chrom_b = breaks.template get<0>(j); + int start_b = breaks.template get<1>(j); + + short int chrom_i = intervals.template get<0>(i); + int start_i = intervals.template get<1>(i); + int stop_i = intervals.template get<2>(i); + + if (chrom_b > chrom_i) { + i++; + continue; + } + + if (chrom_i > chrom_b) { + j++; + continue; + } + + if (start_b >= stop_i) { + i++; + continue; + } + + if (start_b <= start_i) { + j++; + continue; + } + + // if the breakpoint is 50 bases near to the start or stop of the interval skip it + //if (fabs(start_i - start_b) <= 50) {j++ ; continue;} + //if (fabs(start_b - stop_i) <= 50) {j++; continue;} + + int last = intervals_tmp.get(i).size() - 1; + short int chrom_b_last = intervals_tmp.get(i).get<0>(last); + int start_i_last = intervals_tmp.get(i).get<1>(last); + int stop_i_last = intervals_tmp.get(i).get<2>(last); + intervals_tmp.get(i).remove(last); + intervals_tmp.get(i).add(); + intervals_tmp.get(i).get<0>(last) = chrom_b_last; + intervals_tmp.get(i).get<1>(last) = start_i_last; + intervals_tmp.get(i).get<2>(last) = start_b; + intervals_tmp.get(i).add(); + intervals_tmp.get(i).get<0>(last+1) = chrom_b; + intervals_tmp.get(i).get<1>(last+1) = start_b; + intervals_tmp.get(i).get<2>(last+1) = stop_i_last; + j++; + } + + + intervals.clear(); + + for (int i = 0 ; i < intervals_tmp.size() ; i++) { + for (int j = 0 ; j < 
intervals_tmp.get(i).size() ; j++ ) { + intervals.add(intervals_tmp.get(i).get(j)); + } + } +} + + +void find_breakpoint(openfpm::vector<BreakPoint> & brk, short int chrom_t, int start_t , int stop_t, int & b1, int & b2) { + int i = 0; + int end = brk.size() - 1; + + while (end != i) { + int mid = (end - i) / 2 + i; + short int mid_chromosome = brk.get(mid).chr; + int mid_pos = brk.get(mid).pos; + + if (chrom_t < mid_chromosome) {end = mid-1; i = (end > i)?i:end ;continue;} + if (chrom_t > mid_chromosome) {i = mid+1; end = (end > i)?end:i ;continue;} + + if (start_t > mid_pos) {i = mid+1; end = (end > i)?end:i ;continue;} + if (start_t < mid_pos) {end = mid; ;continue;} + + break; + } + + b1 = i; + + i = 0; + end = brk.size() - 1; + + while (end != i) { + int mid = (end - i) / 2 + i; + short int mid_chromosome = brk.get(mid).chr; + int mid_pos = brk.get(mid).pos; + + if (chrom_t < mid_chromosome) {end = mid-1; i = (end > i)?i:end ;continue;} + if (chrom_t > mid_chromosome) {i = mid+1; (end > i)?end:i; continue;} + + if (stop_t > mid_pos) {i = mid + 1; end = (end > i)?end:i ; continue;} + if (stop_t < mid_pos) {end = mid; continue;} + + break; + } + + b2 = i; +} + +template <typename T, typename U> +struct PairHash { + std::size_t operator () (const std::pair<T, U> &p) const { + // Combine the two integers into a single hash value + std::size_t h1 = std::hash<T>{}(p.first); + std::size_t h2 = std::hash<U>{}(p.second); + return h1 ^ h2; // You can choose a different combining function if needed + } +}; + +void read_break_points(std::string & bam_file, + openfpm::vector<aggregate<short int,int,int, int, int>> & break_points_out, + int N_chr, + openfpm::vector<int> & chr_ins) { + + std::unordered_map<std::pair<short int, int>,int, PairHash<short,int>> break_points; + openfpm::vector<BreakPoint> brk; + + std::cout << "READING Breakpoint for: " << bam_file << std::endl; + + openfpm::vector<openfpm::vector<int>> chr_ins_tmp; + chr_ins.resize(N_chr); + chr_ins_tmp.resize(N_chr); + for (int i = 0 ; i < N_chr ; i++) { + chr_ins_tmp.get(i).reserve(33554432); + } + + // Read the presence of break points + + int actual_chr = -1; + + { + samFile *fp_in = hts_open(bam_file.c_str(),"r"); //open bam file + bam_hdr_t *bamHdr = sam_hdr_read(fp_in); //read header + bam1_t *aln = bam_init1(); //initialize an alignment + + while(sam_read1(fp_in,bamHdr,aln) > 0){ + + int32_t pos = aln->core.pos +1; //left most position of alignment in zero based coordianate (+1) + if (aln->core.tid == -1) + {continue;} + char *chr = bamHdr->target_name[aln->core.tid] ; //contig name (chromosome) + uint32_t len = aln->core.l_qseq; //length of the read. 
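/*
   Note on the pass below (restates the code, no new behaviour): candidate breakpoints are
   harvested from the CIGAR string. A soft clip ('S') votes for a breakpoint at its position;
   deletions and insertions ('D'/'I') of at least 50 bp vote for breakpoints at both of their
   ends. Votes accumulate in a hash map keyed by (chromosome, position), and only positions
   supported by at least 3 reads are promoted to BreakPoint entries and sorted.
*/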
+ int n_cigar = aln->core.n_cigar; // number of cigar operations + uint32_t * cigar = (uint32_t *)(aln->data + aln->core.l_qname); + + //if (aln->core.qual < 20) {continue;} + + short int chr_int = chr_to_chr_int(chr); + if (chr_int == -1) {continue;} + + if (abs(aln->core.isize) > 0) { + chr_ins_tmp.get(chr_int).add(abs(aln->core.isize)); + } + + if (chr_int > actual_chr) { + actual_chr = chr_int; + std::cout << "Reading chromosome: " << actual_chr+1 << std::endl; + } + + for (int k = 0 ; k < n_cigar ; k++) { + int op = bam_cigar_opchr(bam_cigar_op(cigar[k])); + int len = bam_cigar_oplen(cigar[k]); + if (op == 'S') { + break_points[std::make_pair(chr_int, pos)] += 1; + pos += len; + } else if (op == 'D') { + if (len < 50) {continue;} + break_points[std::make_pair(chr_int, pos)] += 1; + break_points[std::make_pair(chr_int, pos+len)] += 1; + pos += len; + } else if (op == 'I') { + if (len < 50) {continue;} + break_points[std::make_pair(chr_int, pos)] += 1; + break_points[std::make_pair(chr_int, pos+len)] += 1; + pos += len; + } + else { + pos += len; + } + } + } + + bam_destroy1(aln); + sam_close(fp_in); + } + + // sort all insert size eliminate the outliers (potential deletions) + for (int i = 0 ; i < N_chr ; i++) { + chr_ins_tmp.get(i).sort(); + int percent10 = chr_ins_tmp.get(i).size() * 0.10; + int percent90 = chr_ins_tmp.get(i).size() * 0.90; + + size_t tot_insert = 0; + int n_insert = 0; + // Average + for (int j = percent10 ; j < percent90 ; j++) { + tot_insert += chr_ins_tmp.get(i).get(j); + n_insert++; + } + + chr_ins.get(i) = tot_insert / n_insert; + } + + std::cout << "SORTING BREAKPOINT" << std::endl; + + for (auto it = break_points.begin() ; it != break_points.end() ; ++it) { + if ((*it).second < 3) {continue;} + brk.add(BreakPoint((*it).first.first,(*it).first.second,(*it).second)); + } + + brk.sort(); + + actual_chr = -1; + std::cout << "CALCULATING Breakpoint depth: " << bam_file << std::endl; + + { + // Calculate the depth on the break point + + samFile *fp_in = hts_open(bam_file.c_str(),"r"); //open bam file + bam_hdr_t *bamHdr = sam_hdr_read(fp_in); //read header + bam1_t *aln = bam_init1(); //initialize an alignment + + while(sam_read1(fp_in,bamHdr,aln) > 0){ + + int32_t pos = aln->core.pos +1; //left most position of alignment in zero based coordianate (+1) + if (aln->core.tid == -1) + {continue;} + char *chr = bamHdr->target_name[aln->core.tid] ; //contig name (chromosome) + uint32_t len = aln->core.l_qseq; //length of the read. 
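+			// Second pass: estimate the read depth at every retained breakpoint. For each
+			// aligned ('M') segment, find_breakpoint() binary-searches the sorted breakpoint
+			// vector for the overlapped range [b1,b2) and increments their depth counters;
+			// the depth is later used to drop breakpoints whose split-read support is small
+			// compared to the local coverage.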
+ int n_cigar = aln->core.n_cigar; // number of cigar operations + uint32_t * cigar = (uint32_t *)(aln->data + aln->core.l_qname); + + short int chr_int = chr_to_chr_int(chr); + if (chr_int == -1) {continue;} + + if (chr_int > actual_chr) { + actual_chr = chr_int; + std::cout << "Reading chromosome: " << actual_chr+1 << std::endl; + } + + // Fill depth + for (int k = 0 ; k < n_cigar ; k++) { + int op = bam_cigar_opchr(bam_cigar_op(cigar[k])); + int len = bam_cigar_oplen(cigar[k]); + if (op == 'M') { + int b1 = 0; + int b2 = 0; + find_breakpoint(brk,chr_int,pos,pos+len,b1,b2); + for (int i = b1 ; i < b2 ; i++) { + brk.get(i).depth++; + } + } + else { + pos += len; + } + } + } + + bam_destroy1(aln); + sam_close(fp_in); + } + + // valid break are all points + + // Copy to the output + + openfpm::vector<int> remove_list; + do { + remove_list.clear(); + for (int i = 0 ; i < brk.size() ; i++) { + int k = i-10; + k = (k < 0)?0:k; + + for (; k < brk.size() && fabs(brk.get(i).pos - brk.get(k).pos) < 100 ; k++) { + if (k == i) {continue;} + if (brk.get(i).count < brk.get(k).count) { + remove_list.add(i); + break; + } + } + } + + brk.remove(remove_list); + } while (remove_list.size() != 0); + + + openfpm::vector<int> id_to_remove; + // Filter breakpoints by depth + for (int i = 0 ; i < brk.size() ; ++i) { + if (brk.get(i).depth*0.1 >= 3 && brk.get(i).depth*0.1 > brk.get(i).count) { + id_to_remove.add(i); + } + } + brk.remove(id_to_remove); + + // Add events to brk + + { + samFile *fp_in = hts_open(bam_file.c_str(),"r"); //open bam file + bam_hdr_t *bamHdr = sam_hdr_read(fp_in); //read header + bam1_t *aln = bam_init1(); //initialize an alignment + + while(sam_read1(fp_in,bamHdr,aln) > 0){ + + int32_t pos = aln->core.pos +1; //left most position of alignment in zero based coordianate (+1) + if (aln->core.tid == -1) + {continue;} + char *chr = bamHdr->target_name[aln->core.tid] ; //contig name (chromosome) + uint32_t len = aln->core.l_qseq; //length of the read. 
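+			// Third pass: mark which breakpoints are backed by an actual split/indel event.
+			// Long (>= 50 bp) soft-clips and deletions set operation = 1 on every breakpoint
+			// falling inside the event interval (plus a 10 bp margin); alignments with
+			// mapping quality below 20 are ignored in this pass.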
+ int n_cigar = aln->core.n_cigar; // number of cigar operations + uint32_t * cigar = (uint32_t *)(aln->data + aln->core.l_qname); + + if (aln->core.qual < 20) {continue;} + + short int chr_int = chr_to_chr_int(chr); + if (chr_int == -1) {continue;} + + if (chr_int > actual_chr) { + actual_chr = chr_int; + std::cout << "Reading chromosome: " << actual_chr+1 << std::endl; + } + + for (int k = 0 ; k < n_cigar ; k++) { + int op = bam_cigar_opchr(bam_cigar_op(cigar[k])); + int len = bam_cigar_oplen(cigar[k]); + if (op == 'S') { + if (len < 50) {continue;} + // find breakpoints + int b1 = 0; + int b2 = 0; + find_breakpoint(brk,chr_int,pos,pos+len+10,b1,b2); + for (int i = b1 ; i < b2 ; i++) { + brk.get(i).operation = 1; + } + pos += len; + } else if (op == 'D') { + if (len < 50) {continue;} + // find breakpoints + int b1 = 0; + int b2 = 0; + find_breakpoint(brk,chr_int,pos,pos+len+10,b1,b2); + for (int i = b1 ; i < b2 ; i++) { + brk.get(i).operation = 1; + } + pos += len; + } else if (op == 'I') { + if (len < 50) {continue;} + pos += len; + } + else { + pos += len; + } + } + } + + bam_destroy1(aln); + sam_close(fp_in); + } + + for (int i = 0 ; i < brk.size() ; i++) { + break_points_out.add(); + break_points_out.last().get<0>() = brk.get(i).chr; + break_points_out.last().get<1>() = brk.get(i).pos; + break_points_out.last().get<2>() = brk.get(i).count; + break_points_out.last().get<3>() = brk.get(i).depth; + break_points_out.last().get<4>() = brk.get(i).operation; + } +} + +unsigned char get_bits_from_base_char(char base) { + if (base == 'A') return 0x00; + if (base == 'C') return 0x01; + if (base == 'T') return 0x02; + if (base == 'G') return 0x03; + if (base == 'N') return 0x04; + + std::cout << "ERROR BASE" << std::endl; + return 0xFF; +} + +void read_fasta(const std::string & filename, openfpm::vector<openfpm::vector<char>> & chr_sequences) { + std::cout << "Read reference file: " << filename << std::endl; + + seqan3::sequence_file_input fin{filename.c_str()}; + + using record_type = decltype(fin)::record_type; + + int chr_int = 0; + + // You can use a for loop: + for (auto & record : fin) + { + //std::cout << record << std::endl; + std::cout << record.id() << std::endl; + auto & seq = record.sequence(); + chr_sequences.add(); + auto & last_sequence = chr_sequences.last(); + last_sequence.resize(seq.size() / 2 + seq.size() % 2); + for (int j = 0 ; j < seq.size() ; j+=2) { + char bases2 = 0; + + if (chr_int == 0 && j == 4356168) { + int debug = 0; + debug++; + } + + char base = seqan3::to_char(seq[j]); + bases2 |= (base == 'A')?0x00:bases2; + bases2 |= (base == 'C')?0x01:bases2; + bases2 |= (base == 'T')?0x02:bases2; + bases2 |= (base == 'G')?0x03:bases2; + bases2 |= (base == 'N')?0x04:bases2; + + base = seqan3::to_char(seq[j+1]); + bases2 |= (base == 'A')?0x00:bases2; + bases2 |= (base == 'C')?0x10:bases2; + bases2 |= (base == 'T')?0x20:bases2; + bases2 |= (base == 'G')?0x30:bases2; + bases2 |= (base == 'N')?0x40:bases2; + + last_sequence.get(j / 2) = bases2; + } + chr_int++; + } +} + +struct call_vcf { + short int chromosome; + int start; + int stop; + int CN; + int num_region; + int precise_start; + int precise_stop; + float quality; + openfpm::vector<float> efficiency; + openfpm::vector<float> residual; +}; + +void read_CNV_hypothesys(const std::string & filename, openfpm::vector<call_vcf> & cnvs_to_test) { + // Initialize VCF header and writer + htsFile* vcfFile = hts_open(filename.c_str(), "r"); + if (vcfFile == nullptr) { + std::cerr << "Error: Cannot open output VCF file" << std::endl; 
+ return; + } + + // Read and print VCF records + bcf_hdr_t* header = bcf_hdr_read(vcfFile); + bcf1_t* record = bcf_init(); + + while (bcf_read(vcfFile, header, record) == 0) { + bcf_unpack(record, BCF_UN_ALL); + + for (int i = 1; i < record->n_allele; ++i) { + if (strcmp(record->d.allele[i],"<DEL>") == 0) { + struct call_vcf vcfc; + + vcfc.chromosome = chr_to_chr_int(bcf_hdr_id2name(header, record->rid)); + vcfc.start = record->pos-1; // Convert to 0 based index + vcfc.stop = record->pos + record->rlen-1; + vcfc.CN = 1; + + cnvs_to_test.add(vcfc); + } + + if (strcmp(record->d.allele[i],"<DUP>") == 0) { + struct call_vcf vcfc; + + vcfc.chromosome = chr_to_chr_int(bcf_hdr_id2name(header, record->rid)); + vcfc.start = record->pos - 1; // Convert to 0 based index + vcfc.stop = record->pos + record->rlen-1; + vcfc.CN = 3; + + cnvs_to_test.add(vcfc); + } + } + } + + // Clean up + bcf_hdr_destroy(header); + bcf_destroy(record); + bcf_close(vcfFile); +} + +struct CNV_event_link { + int call_id; + int padding_left; + int padding_right; + int pos_seed; + char path; +}; + + +void print_bases_size_t(size_t bases) { + for (int i = 0 ; i < 32 ; i++) { + if ((bases & 0x03) == 0) {std::cout << 'A';} + if ((bases & 0x03) == 1) {std::cout << 'C';} + if ((bases & 0x03) == 2) {std::cout << 'T';} + if ((bases & 0x03) == 3) {std::cout << 'G';} + bases = bases >> 2; + } + std::cout << std::endl; +} + +void print_1_base_right(char base) { + base = base >> 4; + + if ((base & 0x03) == 0) {std::cout << 'A';} + if ((base & 0x03) == 1) {std::cout << 'C';} + if ((base & 0x03) == 2) {std::cout << 'T';} + if ((base & 0x03) == 3) {std::cout << 'G';} +} + +void print_1_base_left(char base) { + base = base & 0x3; + + if ((base & 0x03) == 0) {std::cout << 'A';} + if ((base & 0x03) == 1) {std::cout << 'C';} + if ((base & 0x03) == 2) {std::cout << 'T';} + if ((base & 0x03) == 3) {std::cout << 'G';} +} + +void print_2_bases(char base) { + print_1_base_left(base); + print_1_base_right(base); +} + +char get_base_from_reference(openfpm::vector<openfpm::vector<char>> & reference, short int chr, int pos) { + int pos_r = pos / 2; + int pos_rest = pos % 2; + + char base_bit = (reference.get(chr).get(pos_r) >> pos_rest*4) & 0x7; + + if (base_bit == 0) return 'A'; + if (base_bit == 1) return 'C'; + if (base_bit == 2) return 'T'; + if (base_bit & 0x4) return 'N'; + + return 'G'; +} + +void collect_reads_for_call_evidence_forward(std::string & bam_file, + tsl::hopscotch_map<size_t, openfpm::vector<CNV_event_link>> & map, + openfpm::vector<call_vcf> & calls, + openfpm::vector<aggregate< // Calls + openfpm::vector<aggregate< // Sequences + openfpm::vector<aggregate<char,char>>,short int,int,int,int> + > + >> & seqs, + openfpm::vector<openfpm::vector<char>> & reference) { + + int n_reads = 0; + + char seq1[16384]; + char seq2[16384]; + char seq3[16384]; + + int actual_chr = -1; + + samFile *fp_in = hts_open(bam_file.c_str(),"r"); //open bam file + bam_hdr_t *bamHdr = sam_hdr_read(fp_in); //read header + bam1_t *aln = bam_init1(); //initialize an alignment + + while(sam_read1(fp_in,bamHdr,aln) > 0){ + + int32_t pos = aln->core.pos +1; //left most position of alignment in zero based coordianate (+1) + if (aln->core.tid == -1) + {continue;} + char *chr = bamHdr->target_name[aln->core.tid] ; //contig name (chromosome) + uint32_t len = aln->core.l_qseq; //length of the read. 
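+		// Forward pass: the read is folded into a rolling 2-bit-per-base 64-bit word, so the
+		// word always holds the last 32 bases and can be looked up directly in the seed map.
+		// With the encoding used in this file (A=00, C=01, T=10, G=11) the 4-mer "ACGT"
+		// encodes, for example, to 0b00011110 = 0x1E. On a seed hit the read is compared
+		// (Smith-Waterman, via SequentialImplementation) with the plain reference and with
+		// the reference carrying the candidate deletion; reads scoring above len-3 on either
+		// template are stored as evidence for the call.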
+ int n_cigar = aln->core.n_cigar; // number of cigar operations + uint32_t * cigar = (uint32_t *)(aln->data + aln->core.l_qname); + char * read_name = (char *)&aln->data[0]; + + uint8_t *q = bam_get_seq(aln); //quality string + uint32_t q2 = aln->core.qual ; + auto flag = aln->core.flag; + + short int chr_int = chr_to_chr_int(chr); + if (chr_int == -1) {continue;} + + if (chr_int > actual_chr) { + actual_chr = chr_int; + std::cout << "Reading forward chromosome: " << actual_chr+1 << std::endl; + } + + size_t bases_seq = 0; + for (int i = 0 ; i < len ; i++) { + bases_seq = bases_seq << 2; + char base = seq_nt16_str[bam_seqi(q,i)]; + bases_seq |= (base == 'A')?0x00:0x00; + bases_seq |= (base == 'C')?0x01:0x00; + bases_seq |= (base == 'T')?0x02:0x00; + bases_seq |= (base == 'G')?0x03:0x00; + +/* if (bases_seq == 0x3e79e6d9944cfe6f) { + int debug = 0; + debug++; + } + } + + bases_seq = 0; + int i = 0; + for(i=len-1; i>0 ; i--){ + bases_seq = bases_seq << 2; + char base = seq_nt16_str[bam_seqi(q,i)]; + bases_seq |= (base == 'A')?0x00:0x00; + bases_seq |= (base == 'C')?0x01:0x00; + bases_seq |= (base == 'T')?0x02:0x00; + bases_seq |= (base == 'G')?0x03:0x00; + + if (bases_seq == 0x3e79e6d9944cfe6f) { + int debug = 0; + debug++; + }*/ + + if (i >= 32) { + auto fnd = map.find(bases_seq); + if (fnd != map.end()) + { + //std::cout << "FOUND PATTERN " << bam_get_qname(aln) << std::endl; + + // This seed can be associated to multiple calls, go across all of them + for (int v = 0 ; v < fnd->second.size() ; v++) { + int cid = fnd->second.get(v).call_id; + //std::cout << calls.get(cid).chromosome << " START: " << calls.get(cid).start << " STOP: " << calls.get(cid).stop << " CN: " << calls.get(cid).CN << std::endl; + + //print_bases_size_t(bases_seq); + + // Check the pattern + if (fnd->second.get(v).path == 0) { + // check the reference + int start = calls.get(cid).start + fnd->second.get(v).padding_left-i; + int seed_pos = i-32; + int call_distance_from_start = calls.get(cid).start - start; + + if (call_distance_from_start > len) {continue;} + + n_reads++; + + if (n_reads % 1000 == 0) {std::cout << "READS to SAVE " << n_reads << " " << chr_int << ":" << pos << std::endl;} + + // Smith–Waterman algorithm to compare the read with the reference + // and the read with the reference with the deletion performed + + for (int j = 0 ; j < len ; j++) { + seq2[j] = seq_nt16_str[bam_seqi(q,j)]; + seq1[j] = get_base_from_reference(reference,calls.get(cid).chromosome,start+j); + if (start + j < calls.get(cid).start) { + seq3[j] = get_base_from_reference(reference,calls.get(cid).chromosome,start+j); + } else { + seq3[j] = get_base_from_reference(reference,calls.get(cid).chromosome,calls.get(cid).stop+j-call_distance_from_start); + } + } + seq1[len] = 0; + seq2[len] = 0; + seq3[len] = 0; + + SequentialImplementation sw1(seq1,seq2,1,0,-1); + SequentialImplementation sw2(seq1,seq3,1,0,-1); + + sw1.runAlgorithm(); + sw2.runAlgorithm(); + + int best_sw1 = 0; + { + auto & results = sw1.getResults(); + + for(size_t j=0;j<results[0].size();j++){ + // std::cout << "Match " << j+1 << " [Score: " << results[0][j].score << ", Start: " << results[0][j].start << ", Stop: " << results[0][j].stop << "]" << endl; + // std::cout << " D: " << results[0][j].result_pair.d << endl; + // std::cout << " Q: " << results[0][j].result_pair.q << endl; + // std::cout << "SCORE: " << results[0][j].score << std::endl; + if (results[0][j].score > best_sw1) { + best_sw1 = results[0][j].score; + } + } + } + + int best_sw2 = 0; + { + auto & results = 
sw2.getResults(); + + for(size_t j=0;j<results[0].size();j++){ + // std::cout << "Match " << j+1 << " [Score: " << results[0][j].score << ", Start: " << results[0][j].start << ", Stop: " << results[0][j].stop << "]" << endl; + // std::cout << " D: " << results[0][j].result_pair.d << endl; + // std::cout << " Q: " << results[0][j].result_pair.q << endl; + // std::cout << "SCORE: " << results[0][j].score << std::endl; + if (results[0][j].score > best_sw2) { + best_sw2 = results[0][j].score; + } + } + } + + if (best_sw1 > len-3 || best_sw2 > len-3) { + seqs.get<0>(cid).add(); + int last = seqs.get<0>(cid).size() - 1; + seqs.get<0>(cid).get<0>(last).resize(len); + for (int s = 0 ; s < len ; s++) { + seqs.get<0>(cid).get<0>(last).template get<0>(s) = seq2[s]; + } + + if (seqs.get<0>(cid).get<0>(last).size() < aln->core.l_qname) { + seqs.get<0>(cid).get<0>(last).resize(aln->core.l_qname); + } + for (int s = 0 ; s < aln->core.l_qname ; s++) { + seqs.get<0>(cid).get<0>(last).template get<1>(s) = read_name[s]; + } + seqs.get<0>(cid).get<0>(last).template get<1>(aln->core.l_qname) = 0; + + seqs.get<0>(cid).get<1>(last) = chr_int; + seqs.get<0>(cid).get<2>(last) = start; + seqs.get<0>(cid).get<3>(last) = flag; + seqs.get<0>(cid).get<4>(last) = seed_pos; + + std::cout << "READ: " << read_name << " " << seq2 << " seed " << bases_seq << " pos " << seed_pos << std::endl; + } + } + } + + break; + } + } + } + + } + + bam_destroy1(aln); + sam_close(fp_in); + +} + +void collect_reads_for_call_evidence_reverse(std::string & bam_file, + openfpm::vector<aggregate< // Calls + openfpm::vector<aggregate< // Sequences + openfpm::vector<aggregate<char,char>>,short int,int,int,int> + > + >> & seqs) { + + int actual_chr = -1; + + samFile *fp_in = hts_open(bam_file.c_str(),"r"); //open bam file + bam_hdr_t *bamHdr = sam_hdr_read(fp_in); //read header + bam1_t *aln = bam_init1(); //initialize an alignment + char * read_name = (char *)&aln->data[0]; + + // Remove duplicate sequences + + tsl::hopscotch_map<std::string, size_t> map_reads; + + for (int i = 0 ; i < seqs.size() ; i++) { + map_reads.clear(); + for (int j = 0 ; j < seqs.get<0>(i).size() ; j++) { + std::string str; + int k = 0; + while (seqs.get<0>(i).get<0>(j).get<1>(k) != 0) { + str.push_back(seqs.get<0>(i).get<0>(j).get<1>(k)); + k++; + } + + if (map_reads.find(str) != map_reads.end()) { + // found + seqs.get<0>(i).remove(j); + } + else { + map_reads[str] = 1; + } + } + } + + map_reads.clear(); + + for (int i = 0 ; i < seqs.size() ; i++) { + for (int j = 0 ; j < seqs.get<0>(i).size() ; j++) { + std::string str; + int k = 0; + while (seqs.get<0>(i).get<0>(j).get<1>(k) != 0) { + str.push_back(seqs.get<0>(i).get<0>(j).get<1>(k)); + k++; + } + + map_reads[str] = ((size_t)i << 32) + j; + } + } + + while(sam_read1(fp_in,bamHdr,aln) > 0){ + + int32_t pos = aln->core.pos +1; //left most position of alignment in zero based coordianate (+1) + if (aln->core.tid == -1) + {continue;} + char *chr = bamHdr->target_name[aln->core.tid] ; //contig name (chromosome) + uint32_t len = aln->core.l_qseq; //length of the read. 
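+		// Reverse pass: alignments are matched back to the stored evidence reads by name
+		// (map_reads packs the call index in the upper 32 bits and the read index in the
+		// lower 32 bits). When a record with a different FLAG is found (typically the mate,
+		// or another alignment of the same fragment), its sequence is copied into the second
+		// slot of the stored pair, so each evidence entry ends up carrying both sequences.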
+ int n_cigar = aln->core.n_cigar; // number of cigar operations + uint32_t * cigar = (uint32_t *)(aln->data + aln->core.l_qname); + char * read_name = (char *)&aln->data[0]; + auto flag = aln->core.flag; + + uint8_t *q = bam_get_seq(aln); //quality string + uint32_t q2 = aln->core.qual ; + + short int chr_int = chr_to_chr_int(chr); + if (chr_int == -1) {continue;} + + std::string str(read_name); + auto it = map_reads.find(str); + if (map_reads.find(str) != map_reads.end()) { + // found + + int j = it->second & 0xffffffff; + int i = it->second >> 32; + + // DEBUG + if (str == "A00684:261:HNJK2DRX2:1:2162:2880:9768") { + std::cout << "FOUND REVERSE: " << str << " i: " << i << " j:" << j << std::endl; + } + + if (seqs.get<0>(i).get<3>(j) != flag) { + if (seqs.get<0>(i).get<0>(j).size() < len) { + int old_len = seqs.get<0>(i).get<0>(j).size(); + seqs.get<0>(i).get<0>(j).resize(len); + seqs.get<0>(i).get<0>(j).get<0>(old_len) = -1; + } + + for (int k = 0 ; k < len ; k++) { + seqs.get<0>(i).get<0>(j).get<1>(k) = seq_nt16_str[bam_seqi(q,k)]; + } + } + } + + if (chr_int > actual_chr) { + actual_chr = chr_int; + std::cout << "Reading reverse chromosome: " << actual_chr+1 << std::endl; + } + } + + bam_destroy1(aln); + sam_close(fp_in); + +} + +void collect_reads_for_call_evidence(std::string & bam_file, + tsl::hopscotch_map<size_t, openfpm::vector<CNV_event_link>> & map, + openfpm::vector<call_vcf> & calls/**/, + openfpm::vector< + aggregate< // Calls + openfpm::vector< + aggregate< // Sequences + openfpm::vector< + aggregate<char,char> + >, + short int, + int, + int, + int + > + > + > + > & seqs, + openfpm::vector<openfpm::vector<char>> & reference) { + + // now we check all the read if they have a matching patterns + seqs.resize(calls.size()); + + collect_reads_for_call_evidence_forward(bam_file,map,calls,seqs,reference); + collect_reads_for_call_evidence_reverse(bam_file,seqs); + + // + std::cout << "COMPLETED " << bam_file << std::endl; +} + + + +void add_32bases_pattern(int chrom, + int start, + openfpm::vector<openfpm::vector<char>> & chr_sequences, + tsl::hopscotch_map<size_t, openfpm::vector<CNV_event_link>> & map, + int call_id, + int padding) { + + // Store the patterns related to the reference + int chr = chrom; + int pos = start / 2; + int pos_rest = (start % 2)*2; + + size_t bases_pattern = 0; + + int loop_size = 16; + auto & seq = chr_sequences.get(chr); + if (pos_rest != 0) { + char base_bit = seq.get(pos) >> 4; + if (base_bit == 4) {return;} + bases_pattern |= base_bit; + loop_size = 15; + } + + int k = 0; + for ( ; k < loop_size ; k++) { + bases_pattern = bases_pattern << 2; + char base_bit = seq.get(pos + k + (pos_rest != 0)) & 0x7; + if (base_bit == 4) {return;} + bases_pattern |= ((size_t)base_bit); + base_bit = seq.get(pos + k + (pos_rest != 0)) >> 4; + bases_pattern = bases_pattern << 2; + if (base_bit == 4) {return;} + bases_pattern |= ((size_t)base_bit); + } + + if (pos_rest != 0) { + bases_pattern = bases_pattern << 2; + char base_bit = seq.get(pos + k + 1) & 0x7; + if (base_bit == 4) {return;} + bases_pattern |= ((size_t)base_bit); + } + + openfpm::vector<CNV_event_link> & calls = map[bases_pattern]; + + struct CNV_event_link cel; + cel.call_id = call_id; + cel.padding_left = padding; + cel.padding_right = padding; + cel.path = 0x0; + cel.pos_seed = start; + calls.add(cel); +} + +void get_sequence(openfpm::vector<openfpm::vector<char>> & reference, short int chr , int start, int stop, int p2, int p3, char * seq) { + if (p2 == -1) { + for (int i = start ; i < stop ; i++) { 
+ seq[i-start] = get_base_from_reference(reference,chr,i); + } + seq[stop-start] = 0; + } else { + for (int i = start ; i < p2 ; i++) { + seq[i-start] = get_base_from_reference(reference,chr,i); + } + + for (int i = p3 ; i < stop ; i++) { + seq[i-p3 + (p2 - start)] = get_base_from_reference(reference,chr,i); + } + seq[stop - p3 + p2 - start] = 0; + } +} + +/*double calculate_event_probability(bam1_t * aln, + bam_hdr_t * bamHdr, + openfpm::vector<openfpm::vector<char>> & reference, + double reading_error, + int expected_insert_size, + openfpm::vector<int> & chr_ins) { + + int n_cigar = aln->core.n_cigar; // number of cigar operations + uint32_t * cigar = (uint32_t *)(aln->data + aln->core.l_qname); + int32_t pos = aln->core.pos +1; + uint8_t *q = bam_get_seq(aln); + + char *chr = bamHdr->target_name[aln->core.tid]; + + short int chr_int = chr_to_chr_int(chr); + if (chr_int == -1) {return 0.0;} + + double aligment_probability = 1.0; + int rpos = 0; + for (int k = 0 ; k < n_cigar ; k++) { + int op = bam_cigar_opchr(bam_cigar_op(cigar[k])); + int len = bam_cigar_oplen(cigar[k]); + if (op == 'M') { + for (int j = 0 ; j < len ; j++, rpos++) { + char base = seq_nt16_str[bam_seqi(q,rpos)]; + aligment_probability*=(get_base_from_reference(reference,chr_int,pos+j) == base)?(1.0-reading_error):reading_error; + } + } else { + for (int j = 0 ; j < len ; j++, rpos++) { + aligment_probability *= reading_error; + } + pos += len; + } + } + + return aligment_probability; +}*/ + +double calculate_event_probability_between_to_sequences(char * seq1, + char * seq2, + int len, + double reading_error) { + double align_probability = 1.0; + + for(size_t j=0;j<len;j++) { + align_probability += (seq1[j] == seq2[j])?(log(1.0-reading_error)):log(reading_error); + } + + return align_probability; +} + +double calculate_event_probability(char * seq1, + char * seq2, + char * seq3, + int len, + int expected_insert_size, + short int chr_int, + int pos, + double reading_error, + openfpm::vector<int> & chr_ins, + openfpm::vector<openfpm::vector<char>> & reference) { + + char seq4[16384]; + double align_probability = calculate_event_probability_between_to_sequences(seq1,seq2,len,reading_error); + + int max_i; + double max = -1.0e300; + for (int i = -4*chr_ins.get(chr_int) ; i < 4*chr_ins.get(chr_int) ; i++) { + // get the sequence + get_sequence(reference,chr_int,pos+i,pos+i+len,-1,-1,seq4); + + if (expected_insert_size != -1) { + double event_p = log(poisson_distribution(expected_insert_size,abs(i))) + calculate_event_probability_between_to_sequences(seq4,seq3,len,reading_error); + if (event_p > max) { + max = event_p; + max_i = i; + } + } else { + double event_p = log(poisson_distribution(chr_ins.get(chr_int),abs(i))) + calculate_event_probability_between_to_sequences(seq4,seq3,len,reading_error); + if (event_p > max) { + max = event_p; + max_i = i; + } + } + } + + std::cout << "MAX: " << max << " POS: " << pos << " i: " << max_i << std::endl; + return align_probability + max / 10.0; +} + +struct seed_pos{ + short int chr; + int pos; + int cid; +}; + +struct seed_id{ + size_t seed; + openfpm::vector<seed_pos> calls; + + bool operator<(const seed_id & tmp) const { + return seed < tmp.seed; + } +}; + +void get_reference_point(openfpm::vector<seed_id> & seed, + openfpm::vector<openfpm::vector<char>> & reference, + openfpm::vector<openfpm::vector<aggregate<short int, int, int>>> & un_seed) { + seed.sort(); + + int latest_n_base = 0; + size_t seq = 0; + for (int i = 0 ; i < reference.size() ; i++) { + for (int j = 0 ; j < 
reference.get(i).size() ; j++) { + + char base = get_base_from_reference(reference,i,j); + if (base == 'N'){ + latest_n_base = 0; + } + seq |= get_bits_from_base_char(base); + if (latest_n_base < 32) { + seq = seq << 2; + latest_n_base++; + continue; + } + + // search the sequence + int k = 0; + int end = seed.size() - 1; + + while (end != k) { + int mid = (end - k) / 2 + k; + size_t mid_seed = seed.get(mid).seed; + + if (seq > mid_seed) {k = mid+1; end = (end > k)?end:k ;continue;} + if (seq < mid_seed) {end = mid; ;continue;} + + k = mid; + break; + } + + // Found + if (seed.get(k).seed == seq) { + for (int m = 0 ; m < seed.get(k).calls.size() ; m++) { + int cid = seed.get(k).calls.get(m).cid; + un_seed.get(cid).add(); + int last = un_seed.get(cid).size()-1; + un_seed.get(cid).get<0>(last) = i; + un_seed.get(cid).get<1>(last) = j-31; + un_seed.get(cid).get<2>(last) = k; + } + } + + seq = seq << 2; + latest_n_base++; + } + } +} + +typedef openfpm::vector<aggregate<char,char,char>> Pair_read_type; +typedef aggregate<Pair_read_type,short int, int, int, int> Relevant_read_type; +typedef openfpm::vector<Relevant_read_type> Call_related_relevant_read_type; +typedef openfpm::vector<aggregate<Call_related_relevant_read_type>> Sample_call_related_relevant_read_type; +typedef openfpm::vector<aggregate<Sample_call_related_relevant_read_type>> All_samples_call_related_relevant_read_type; + +Eigen::ArrayXd Evidence_matrix_log(openfpm::vector<openfpm::vector<char>> & reference, + const Pair_read_type & seq, + openfpm::vector<aggregate<short int, int, int>> & un_seed, + call_vcf & call, + int padding, + short int chr, + int pos, + int distance_from_seed, + openfpm::vector<int> & chr_ins) { + Eigen::ArrayXd events; + events.resize((2*padding+1)*(2*padding+1) + un_seed.size()); + + // Event 1 reference + + char seq1[16384]; + char seq2[16384]; + char seq3[16384]; + + for (int i = 0 ; i < seq.size() ; i++) { + seq2[i] = seq.get<0>(i); + seq3[i] = seq.get<1>(i); + } + seq2[seq.size()] = 0; + seq3[seq.size()] = 0; + + //get_sequence(reference,chr,pos,pos+seq.size(),-1,-1,seq1); + //events(0) = calculate_event_probability(seq2,seq1,seq3,-1,chr,3e-3,chr_ins); + + std::cout << "Distance from seed: " << distance_from_seed << std::endl; + + // Event 2 break + + for (int i = -padding ; i <= padding ; i++) { + for (int j = -padding ; j <= padding ; j++) { + int id = (i+padding)*(2*padding+1) + (j+padding); + get_sequence(reference,chr,pos,pos+seq.size()+(call.stop+j-call.start-i),call.start+i,call.stop+j,seq1); + std::cout << "--------------- " << i << "," << j << " ---------------------------------" << std::endl; + events(id) = calculate_event_probability(seq2,seq1,seq3,seq.size(),call.stop-call.start,chr,pos,3e-3,chr_ins,reference); + + // Print + + for (int j = 0 ; j < seq.size() ; j++) { + if (j == distance_from_seed) { + std::cout << "*" << seq2[j]; + } else { + std::cout << seq2[j]; + } + } + std::cout << std::endl; + + for (int j = 0 ; j < seq.size() ; j++) { + if (j == distance_from_seed) { + std::cout << "*" << seq1[j]; + } else { + std::cout << seq1[j]; + } + } + std::cout << std::endl; + + std::cout << " Probability: " << events(id) << std::endl; + } + } + + // Event 3 unaligned + + // for all unaligned points (max 100) + for (int i = 0 ; i < un_seed.size() ; i++) { + get_sequence(reference,un_seed.get<0>(i),un_seed.get<1>(i)-distance_from_seed+1,un_seed.get<1>(i)-distance_from_seed+1+seq.size(),-1,-1,seq1); + int id = (2*padding+1)*(2*padding+1) + i; + std::cout << "------------------- un_seed: " << i 
<< " -----------------------------" << std::endl; + events(id) = calculate_event_probability(seq2,seq1,seq3,seq.size(),-1,chr,un_seed.get<1>(i)-distance_from_seed+1,3e-3,chr_ins,reference); + + // Print + + for (int j = 0 ; j < seq.size() ; j++) { + if (j == distance_from_seed) { + std::cout << "*" << seq2[j]; + } else { + std::cout << seq2[j]; + } + } + std::cout << std::endl; + + for (int j = 0 ; j < seq.size() ; j++) { + if (j == distance_from_seed) { + std::cout << "*" << seq1[j]; + } else { + std::cout << seq1[j]; + } + } + std::cout << std::endl; + + std::cout << " Probability: " << events(id) << std::endl; + } + + return events; +} + +// Function to generate a random permutation of numbers from 1 to N +void randomPermutation(int N, openfpm::vector<int> & permutation) { + permutation.resize(N); + + // Initialize the permutation with numbers from 1 to N + for (int i = 0; i < N; ++i) { + permutation.get(i) = i; + } + + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + + // Perform the Fisher-Yates shuffle + for (int i = N - 1; i > 0; --i) { + std::uniform_int_distribution<int> distribution(0, i); + int j = distribution(gen); + std::swap(permutation.get(i), permutation.get(j)); + } +} + +int ArgMax(const Eigen::Vector3d & v) { + int i_max = 0; + double v_max = v(0); + for (int i = 1 ; i < 3 ; i++) { + if (v_max > v(i) ) { + v_max = v(i); + i_max = i; + } + } + + return i_max; +} + + + +void CalculateViterbi(openfpm::vector<openfpm::vector<char>> & reference, + Call_related_relevant_read_type & seqs, + openfpm::vector<aggregate<short int, int, int>> & un_seed, + call_vcf & call, + int padding, + openfpm::vector<int> & chr_ins) { + // User viterbi algorithm to get the sequence of events + + // There are 6 events + + // 1 reference + // 2 Events <------ This is an array of events of size N + // 3 unaligned + + // The transition matrix is f where f is the frequency of the allele (because we do not do any assumption f = 0.5) + // 1 2.1 3 + // 1) 1/3*(1-f) 1/3N*f ... 1/3 + + // 2.1) 1/3 1/3N ... 1/3 + // 2.2) 1/3 1/3N ... 1/3 + + // 3) 1/3 1/3N ... 
1/3 + + // The evidence Matrix is the probability of the read to support that evidence + + // We apply Viterbi algorithm to the sequence of randomly picked reads as described here: + // https://ocw.mit.edu/courses/16-410-principles-of-autonomy-and-decision-making-fall-2010/55e488318190f148c4ed211e2c96bada_MIT16_410F10_lec20.pdf + // Once we have the sequence + // We sum all the probabilities of the trajectory generated by the Viterbi algorithm divided by the the number the events + + // An event probability higher than 0.5 is considered an event + + int N = (2*padding+1)*(2*padding+1)+un_seed.size(); + + Eigen::MatrixXd transition_log(N,N); + Eigen::VectorXd Initial_log(N); // <------ Initial state + + // Initial state + for (int i = 0 ; i < (2*padding+1)*(2*padding+1) ; i++) { + Initial_log(i) = std::log(1.0f/N); + } + for (int i = (2*padding+1)*(2*padding+1) ; i < (2*padding+1)*(2*padding+1)+un_seed.size() ; i++) { + Initial_log(i) = std::log(1.0f/N); + } + + // Fill transition matrix with uniform distribution + for (int i = 0 ; i < N ; i++) { + for (int j = 0 ; j < N ; j++) { + transition_log(i,j) = std::log(1.0 / ((2*padding+1)*(2*padding+1)+un_seed.size())); + } + } + + openfpm::vector<int> permutation; + + openfpm::vector<double> events; + events.resize((2*padding+1)*(2*padding+1)+un_seed.size()); + + // create a random permutation + randomPermutation(seqs.size(),permutation); + + openfpm::vector<Eigen::ArrayXd> delta_log; + openfpm::vector<Eigen::ArrayXd> Pre; + delta_log.resize(seqs.size()); + Pre.resize(seqs.size()); + + int s = permutation.get(0); + + int seed_pos = seqs.get<4>(s); + delta_log.get(0).resize(Initial_log.size()); + + // Carefull the Evidence matrix return the natural logarithm of the probability, initial contain the logarithm of the initial probability Initial_log = log(Initial) + // and delta_log contain the logarim of the delta log(delta) + // so the formula delta = Initial*Evidence_matrix in logaritm space become + // delta_log = log(Initial*Evidemce_matrix) + // delta_log = Initial_log + Evidence_matrix_log + delta_log.get(0) = Initial_log.array() + Evidence_matrix_log(reference,seqs.get<0>(s),un_seed,call,padding,seqs.get<1>(s),seqs.get<2>(s),seed_pos,chr_ins); + int x; + auto mC = delta_log.get(0).maxCoeff(&x); + std::cout << "BEST event: " << x << " " << mC << std::endl; + Pre.get(0).resize(Initial_log.size()); + Pre.get(0) = -1; + + for (int i = 1 ; i < seqs.size() ; i++) { + int s = permutation.get(i); + + Eigen::VectorXd next_transition(Initial_log.size()); + std::cout << "Initial Array: " << Initial_log.array() << std::endl; + std::cout << "Event probs: " << delta_log.get(i-1) << std::endl; + + // Here we tranform by the transition matrix (we do everything in logaritmic space) + // The formula below is the multiplication of the transition matrix by the previous delta (Viterbi) + // avoiding undeflow problems + for (int j = 0 ; j < N ; j++) { + + // First accumulation accum = transition(j,0)*delta(0) + // in log space + // accum_log = transition_log(j,0) + delta_log(0) + double accum_log = transition_log(j,0) + delta_log.get(i-1)(0); + + // we are trying to calculate accum = sum_over_k (transition(j,k)*delta(k)) + // in logarithmic space. 
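+			// (Pairwise log-sum-exp identity used below:
+			//    log(a + b) = max(log a, log b) + log1p(exp(-|log a - log b|)),
+			//  applied with a = accum and b = transition(j,k) * delta(k) at every iteration.)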
+ for (int k = 1 ; k < N ; k++) { // Classical Matix vector multiplication loop + + // accum = accum + transition(j,k)*delta(k) + // in log space: + // accum_log = log(accum + transition(j,k)*delta(k)) + // We split in two cases accum > transition(j,k)*delta(k), we also indicate accum -> a, transition(j,k) -> t, delta(k) -> d + // accum_log = log(a*(1+ t*d/a)) + // accum_log = log(a) + log(1+t*d/a) + // accum_log = log(a) + log(1+exp(log(t*d/a))) + // accum_log = log(a) + log(1+exp(log(t)+log(d)-log(a))) + // accum_log = log(a) + log(1+exp(-(log(a)-log(t)*log(d))) + + // In case t*d > accum we have + // accum_log = log(t*d) + log(1+exp(-(log(t)+log(d)-log(a))) + + // This can be rewritten in a single case as C math functions + + // accum_log = fmax(a,t_log+d_log) + log(1+exp(-fabs(t_log+d_log-a_log)))) + // or + // accum_log = fmax(a,t_log+d_log) + log1p(1+exp(-fabs(t_log+d_log-a_log)))) + + accum_log = fmax(accum_log,transition_log(j,k) + delta_log.get(i-1)(k)) + log1p(exp(-fabs(delta_log.get(i-1)(k) + transition_log(j,k) - accum_log))); + } + + std::cout << accum_log << std::endl; + + next_transition(j) = accum_log; + } + + int x; + double coeff = next_transition.maxCoeff(&x); + Eigen::ArrayXd eml = Evidence_matrix_log(reference,seqs.get<0>(s),un_seed,call,padding,seqs.get<1>(s),seqs.get<2>(s),seed_pos,chr_ins); + auto mC = eml.maxCoeff(&x); + std::cout << "Best event: " << x << " " << mC << std::endl; + std::cout << "Next transition: " << eml << std::endl; + delta_log.get(i) = coeff + eml; + + if (i+1 < seqs.size()) { + Pre.get(i+1) = x; + } + } + + int s_star = ArgMax(delta_log.last()); + + // backtrace to find the most probable path + + openfpm::vector<int> event; + openfpm::vector<Eigen::Array3d> probs; + + event.resize(seqs.size()); + probs.resize(seqs.size()); + + int q_k_p1 = s_star; + + for (int i = seqs.size()-1 ; i > 0 ; i--) { + event.get(i) = Pre.get(i)(q_k_p1); + q_k_p1 = event.get(i); + } +} + +void CNV_event_test(openfpm::vector<call_vcf> & calls, + openfpm::vector<openfpm::vector<char>> & reference, + openfpm::vector<std::string> & files_bams, + int padding, + openfpm::vector<aggregate<openfpm::vector<int>>> & chr_ins) { + char seq1[16384]; + char seq2[16384]; + char seq3[16384]; + + tsl::hopscotch_map<size_t, openfpm::vector<CNV_event_link>> map; + + map.reserve(calls.size()*2); + + for (int i = 0 ; i < calls.size() ; i ++) { + + if (i % 1000 == 0) { + std::cout << "Creating Map: " << i << "/" << calls.size() << std::endl; + } + + int offset = std::min(calls.get(i).start - 32 , 0); + + // Store the patterns related to the reference + for (int j = -padding ; j < padding ; j++ ) { + if (calls.get(i).start-32+j-offset < 0) {continue;} + + // DEBUG + char seq[64]; + get_sequence(reference,calls.get(i).chromosome,calls.get(i).start-32+j-offset,calls.get(i).start+j-offset,-1,-1,seq); + seq[33] = 0; + std::cout << "ADDING sequence at " << calls.get(i).chromosome << ":" << calls.get(i).start-32+j-offset << " SEQUENCE: " << seq << std::endl; + + + add_32bases_pattern(calls.get(i).chromosome,calls.get(i).start-32+j-offset, + reference,map,i,-32+j-offset); + } + + // Store the patterns related to the stop + for (int j = -padding ; j < padding ; j++ ) { + if (calls.get(i).stop+j+32 >= reference.get(calls.get(i).chromosome).size()) {continue;} + + // DEBUG + char seq[64]; + get_sequence(reference,calls.get(i).chromosome,calls.get(i).stop+j,calls.get(i).stop+j+32,-1,-1,seq); + seq[33] = 0; + std::cout << "ADDING sequence at " << calls.get(i).chromosome << ":" << calls.get(i).stop+j << 
" SEQUENCE: " << seq << std::endl; + + add_32bases_pattern(calls.get(i).chromosome,calls.get(i).stop+j, + reference,map,i,j); + } + } + + // Collect evidences for the SV calls + // Samples // calls // sequences // bases + + All_samples_call_related_relevant_read_type seqs; + +/* seqs.resize(files_bams.size()); + + for (int i = 0 ; i < files_bams.size() ; i++) { + collect_reads_for_call_evidence(files_bams.get(i),map,calls,seqs.get<0>(i),reference); + } + + std::cout << "Saving sequences" << std::endl; + seqs.save("seqs_call_vector"); + std::cout << "Saved sequences" << std::endl;*/ + + seqs.load("seqs_call_vector"); + + openfpm::vector<seed_id> seed; + openfpm::vector<openfpm::vector<aggregate<short int, int, int>>> un_seed; + + for (auto i = map.begin() ; i != map.end() ; i++) { + seed.add(); + int last = seed.size() - 1; + seed.get(last).seed = i->first; + for (int j = 0 ; j < i->second.size() ; j++) { + int cid = i->second.get(j).call_id; + + struct seed_pos tmp; + tmp.cid = cid; + tmp.chr = calls.get(i->second.get(j).call_id).chromosome;; + tmp.pos = i->second.get(j).pos_seed;; + + seed.get(last).calls.add(tmp); + } + } + + un_seed.resize(calls.size()); + get_reference_point(seed,reference,un_seed); + + for (int i = 0 ; i < seed.size() ; i++) { + for (int j = 0 ; j < seed.get(i).calls.size() ; j++) { + char seq[64]; + get_sequence(reference,seed.get(i).calls.get(j).chr,seed.get(i).calls.get(j).pos,seed.get(i).calls.get(j).pos+32,-1,-1,seq); + seq[33] = 0; + + std::cout << "SEED POS: " << seed.get(i).calls.get(j).chr << ":" << seed.get(i).calls.get(j).pos << " SEQUENCE: " << seq << " ID " << seed.get(i).seed << std::endl; + } + } + + // Check reference points + for (int cid = 0 ; cid < un_seed.size() ; cid++) { + std::cout << "CALL ID: " << cid << std::endl; + for (int j = 0 ; j < un_seed.get(cid).size() ; j++ ) { + char seq[64]; + get_sequence(reference,un_seed.get(cid).get<0>(j),un_seed.get(cid).get<1>(j),un_seed.get(cid).get<1>(j)+32,-1,-1,seq); + seq[33] = 0; + std::cout << " " << un_seed.get(cid).get<0>(j) << ":" << un_seed.get(cid).get<1>(j) << " SEQUENCE: " << seq << " ID " << seed.get(un_seed.get(cid).get<2>(j)).seed << " K:" << un_seed.get(cid).get<2>(j) << std::endl; + } + } + + double reading_error = 1e-2; + + for (int i = 0 ; i < seqs.size(); i++) { + for (int j = 0 ; j < seqs.get<0>(i).size() ; j++) { + CalculateViterbi(reference,seqs.get<0>(i).get<0>(j),un_seed.get(j),calls.get(i),padding,chr_ins.get<0>(i)); + } + } +} + +void read_bed_file(std::string bed_f, openfpm::vector<aggregate<short int,int,int>> & intervals,int pad) { + std::ifstream bedFile(bed_f.c_str()); + + if (!bedFile.is_open()) { + std::cerr << "Error: Unable to open the BED file." << std::endl; + return; + } + + std::string line; + + char str[256]; + + while (std::getline(bedFile, line)) { + std::istringstream iss(line); + + iss >> str; + int chr_int = chr_to_chr_int(str); + if (chr_int == -1) {continue;} + + intervals.add(); + intervals.last().get<0>() = chr_int; + iss >> intervals.last().get<1>(); + iss >> intervals.last().get<2>(); + + intervals.last().get<2>() += 1; + + if (intervals.last().get<2>() <= intervals.last().get<1>()) { + std::cout << "Bed file are interpreted as start (included) and stop (included), based on this standard the region " << chr_int + 1 << " " << intervals.last().get<1>() << " " << intervals.last().get<2>() << std::endl; + std::cout << "Note aside internally we use the more conventional (included) (excluded) it look like that famous enrichment kits has the above fetish ... 
or they do not know what they are doing" << std::endl; + } + } + + bedFile.close(); +} + + +template<typename T> T penalization_fishing(T & a, T left, T right) { + T min_distance = std::min(fabs(a-left)*1000,fabs(a-right)*1000); + + return min_distance; +} + + +std::string GRCh37_length[] = {"249250621", + "243199373", + "198022430", + "191154276", + "180915260", + "171115067", + "159138663", + "146364022", + "141213431", + "135534747", + "135006516", + "133851895", + "115169878", + "107349540", + "102531392", + "90354753", + "81195210", + "78077248", + "59128983", + "63025520", + "48129895", + "51304566", + "155270560", + "59373566", + "16569"}; + +std::string GRCh37_contig[] = {"1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "X", + "Y", + "MT"}; + +const int GRCh37_n_contig = 24; + +int set_header_assembly(bcf_hdr_t* header, htsFile* vcfFile, const std::string & assembly) { + if (assembly == "GRCh37") { + for (int i = 0 ; i < GRCh37_n_contig ; i++) { + // Create the header + bcf_hrec_t *hrec = (bcf_hrec_t *)calloc(1, sizeof(bcf_hrec_t)); + hrec->key = strdup("contig"); + if (bcf_hrec_add_key(hrec, "ID", strlen("ID")) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, GRCh37_contig[i].c_str(), strlen(GRCh37_contig[i].c_str()), 0) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_add_key(hrec, "length", strlen("length")) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, GRCh37_length[i].c_str(), strlen(GRCh37_length[i].c_str()), 0) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_add_key(hrec, "assembly", strlen("assembly")) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, "b37", strlen("b37"), 0) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hdr_add_hrec(header,hrec) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + } + } + + return 0; +} + +int set_header_format(bcf_hdr_t* header, htsFile* vcfFile) { + + // Set GT + { + bcf_hrec_t *hrec = (bcf_hrec_t *)calloc(1, sizeof(bcf_hrec_t)); + hrec->key = strdup("FORMAT"); + if (bcf_hrec_add_key(hrec, "ID", strlen("ID")) < 0) { + std::cerr << "Error: Cannot create VCF header FORMAT" << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, "GT", strlen("GT"), 0) < 0) { + std::cerr << "Error: Cannot create VCF header FORMAT" << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_add_key(hrec, "Number", strlen("Number")) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, "1", strlen("1"), 0) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." 
<< std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_add_key(hrec, "Type", strlen("Type")) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, "String", strlen("String"), 0) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_add_key(hrec, "Description", strlen("Description")) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, "Segment genotype", strlen("Segment genotype"), 0) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hdr_add_hrec(header,hrec) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + } + + // Set QS + + { + bcf_hrec_t *hrec = (bcf_hrec_t *)calloc(1, sizeof(bcf_hrec_t)); + hrec->key = strdup("FORMAT"); + if (bcf_hrec_add_key(hrec, "ID", strlen("ID")) < 0) { + std::cerr << "Error: Cannot create VCF header FORMAT" << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, "QS", strlen("QS"), 0) < 0) { + std::cerr << "Error: Cannot create VCF header FORMAT" << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_add_key(hrec, "Number", strlen("Number")) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, "1", strlen("1"), 0) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_add_key(hrec, "Type", strlen("Type")) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, "Float", strlen("Float"), 0) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_add_key(hrec, "Description", strlen("Description")) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, "Quality of the call, number of expected false positives", strlen("Quality of the call, number of expected false positives"), 0) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hdr_add_hrec(header,hrec) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + } + + // Set the CN + + { + bcf_hrec_t *hrec = (bcf_hrec_t *)calloc(1, sizeof(bcf_hrec_t)); + hrec->key = strdup("FORMAT"); + if (bcf_hrec_add_key(hrec, "ID", strlen("ID")) < 0) { + std::cerr << "Error: Cannot create VCF header FORMAT" << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, "CN", strlen("CN"), 0) < 0) { + std::cerr << "Error: Cannot create VCF header FORMAT" << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_add_key(hrec, "Number", strlen("Number")) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." 
<< std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, "1", strlen("1"), 0) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_add_key(hrec, "Type", strlen("Type")) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, "Integer", strlen("Integer"), 0) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_add_key(hrec, "Description", strlen("Description")) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, "Segment most-likely copy-number call", strlen("Segment most-likely copy-number call"), 0) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hdr_add_hrec(header,hrec) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + } + + // Set END + + { + bcf_hrec_t *hrec = (bcf_hrec_t *)calloc(1, sizeof(bcf_hrec_t)); + hrec->key = strdup("INFO"); + if (bcf_hrec_add_key(hrec, "ID", strlen("ID")) < 0) { + std::cerr << "Error: Cannot create VCF header FORMAT" << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, "END", strlen("END"), 0) < 0) { + std::cerr << "Error: Cannot create VCF header FORMAT" << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_add_key(hrec, "Number", strlen("Number")) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, "1", strlen("1"), 0) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_add_key(hrec, "Type", strlen("Type")) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, "Integer", strlen("Integer"), 0) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_add_key(hrec, "Description", strlen("Description")) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, "End coordinate of the variant", strlen("End coordinate of the variant"), 0) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hdr_add_hrec(header,hrec) < 0) { + std::cerr << "Error: Cannot create VCF header contigs." 
<< std::endl; + hts_close(vcfFile); + return 1; + } + } + + return 0; +} + + +int set_header_info(bcf_hdr_t* header, htsFile* vcfFile) { + + // Set PS_START + { + bcf_hrec_t *hrec = (bcf_hrec_t *)calloc(1, sizeof(bcf_hrec_t)); + hrec->key = strdup("INFO"); + if (bcf_hrec_add_key(hrec, "ID", strlen("ID")) < 0) { + std::cerr << "Error: Cannot create VCF header INFO" << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, "PS_START", strlen("PS_START"), 0) < 0) { + std::cerr << "Error: Cannot create VCF header INFO" << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_add_key(hrec, "Number", strlen("Number")) < 0) { + std::cerr << "Error: Cannot create VCF header INFO." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, "1", strlen("1"), 0) < 0) { + std::cerr << "Error: Cannot create VCF header INFO." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_add_key(hrec, "Type", strlen("Type")) < 0) { + std::cerr << "Error: Cannot create VCF header INFO." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, "Integer", strlen("Integer"), 0) < 0) { + std::cerr << "Error: Cannot create VCF header INFO." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_add_key(hrec, "Description", strlen("Description")) < 0) { + std::cerr << "Error: Cannot create VCF header INFO." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, "Indicate if a split-read has been found to identify the precise starting point of the event", + strlen("Indicate if a split-read has been found to identify the precise starting point of the event"), 0) < 0) { + std::cerr << "Error: Cannot create VCF header INFO." << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hdr_add_hrec(header,hrec) < 0) { + std::cerr << "Error: Cannot create VCF header INFO." 
<< std::endl; + hts_close(vcfFile); + return 1; + } + } + + // Set PS_STOP + + { + bcf_hrec_t *hrec = (bcf_hrec_t *)calloc(1, sizeof(bcf_hrec_t)); + hrec->key = strdup("INFO"); + if (bcf_hrec_add_key(hrec, "ID", strlen("ID")) < 0) { + std::cerr << "Error: Cannot create VCF header INFO" << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, "PS_STOP", strlen("PS_STOP"), 0) < 0) { + std::cerr << "Error: Cannot create VCF header INFO" << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_add_key(hrec, "Number", strlen("Number")) < 0) { + std::cerr << "Error: Cannot create VCF header INFO" << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, "1", strlen("1"), 0) < 0) { + std::cerr << "Error: Cannot create VCF header INFO" << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_add_key(hrec, "Type", strlen("Type")) < 0) { + std::cerr << "Error: Cannot create VCF header INFO" << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, "Integer", strlen("Integer"), 0) < 0) { + std::cerr << "Error: Cannot create VCF header INFO" << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_add_key(hrec, "Description", strlen("Description")) < 0) { + std::cerr << "Error: Cannot create VCF header INFO" << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hrec_set_val(hrec, hrec->nkeys-1, "Indicate if a split-read has been found to identify the precise stop point of the event", + strlen("Indicate if a split-read has been found to identify the precise stop point of the event"), 0) < 0) { + std::cerr << "Error: Cannot create VCF header INFO" << std::endl; + hts_close(vcfFile); + return 1; + } + + if (bcf_hdr_add_hrec(header,hrec) < 0) { + std::cerr << "Error: Cannot create VCF header INFO" << std::endl; + hts_close(vcfFile); + return 1; + } + } + + return 0; +} + +template<unsigned int N_sample_opt> +int create_vcf_calls(openfpm::vector<openfpm::vector<call_vcf>> & calls, openfpm::vector<std::string> & files_bams) { + + for (int i = 0 ; i < calls.size() ; i++) { + std::string str; + openfpm::vector<call_vcf> & cvcf = calls.get(i); + + std::string sample_name = split(basename(files_bams.get(i).c_str()),".")[0]; + + // Initialize VCF header and writer + htsFile* vcfFile = hts_open(std::string(sample_name + ".vcf").c_str(), "w"); + if (vcfFile == nullptr) { + std::cerr << "Error: Cannot open output VCF file" << std::endl; + return 1; + } + + // Create a VCF header + bcf_hdr_t* header = bcf_hdr_init("w"); + if (header == nullptr) { + std::cerr << "Error: Cannot create VCF header." 
<< std::endl; + hts_close(vcfFile); + return 1; + } + + bcf_hdr_add_sample(header,sample_name.c_str()); + + set_header_assembly(header,vcfFile,"GRCh37"); + set_header_format(header,vcfFile); + set_header_info(header,vcfFile); + + // Add sample information to the header if needed + // bcf_hdr_add_sample(header, "Sample1"); + + // Write the VCF header to the output file + if (bcf_hdr_write(vcfFile, header) != 0) { + std::cerr << "Error: Cannot write VCF header" << std::endl; + bcf_hdr_destroy(header); + hts_close(vcfFile); + return 1; + } + + // Create a VCF record + bcf1_t* record = bcf_init1(); + if (record == nullptr) { + std::cerr << "Error: Cannot create VCF record" << std::endl; + bcf_hdr_destroy(header); + hts_close(vcfFile); + return 1; + } + + + for (int j = 0 ; j < cvcf.size() ; j++) { + std::stringstream ss; + + + ss << chr_int_to_chr(cvcf.get(j).chromosome+1) << "\t"; + ss << cvcf.get(j).start << "\t"; + ss << "CNV_" << cvcf.get(j).chromosome << "_" << cvcf.get(j).start << "_" << cvcf.get(j).stop << "_" << cvcf.get(j).CN << "\t"; + ss << "N" << "\t"; + if (cvcf.get(j).CN < 2) { + ss << "<DEL>" << "\t"; + } else { + ss << "<DUP>" << "\t"; + } + + // QUALITY + ss << cvcf.get(j).quality << "\t"; + + // FILTER + ss << "." << "\t"; + + // INFO + std::string info; + if (cvcf.get(j).precise_start == true) { + info += "PS_START=1"; + } + if (cvcf.get(j).precise_stop == true) { + if (info.size() != 0) { + info += ";"; + } + ss << "PS_STOP=1"; + } + if (info.size() != 0) { + info += ";"; + } + info += "END=" + std::to_string(cvcf.get(j).stop); + + if (info.size() != 0) { + ss << info << "\t"; + } else { + ss << "." << "\t"; + } + + ss << "GT:CN:QS" << "\t"; + if (cvcf.get(j).CN == 0) { + ss << "1/1"; + } else if (cvcf.get(j).CN == 1) { + ss << "0/1"; + } else { + ss << "./."; + } + ss << ":" << cvcf.get(j).CN << ":" << cvcf.get(j).quality; + kstring_t s; + ks_initialize(&s); + kputs(ss.str().c_str(),&s); + vcf_parse(&s,header,record); + + if (bcf_write1(vcfFile, header, record) != 0) { + std::cerr << "Error: Cannot write VCF record" << std::endl; + } + } + + // Clean up resources + bcf_destroy1(record); + bcf_hdr_destroy(header); + hts_close(vcfFile); + } + + + + return 0; +} + +} \ No newline at end of file diff --git a/example/Sequencing/CNVcaller/main.cu b/example/Sequencing/CNVcaller/main.cu index 3af9f7b43..c7d13604e 100644 --- a/example/Sequencing/CNVcaller/main.cu +++ b/example/Sequencing/CNVcaller/main.cu @@ -38,7 +38,7 @@ int main(int argc, char* argv[]) // Read bed file for interval std::cout << "READING: Bed file" << std::endl; - read_bed_file("/nvme/bams/Twist_Comprehensive_Exome_Merged_Probes_hg19_liftover_sorted.bed",intervals,0); + CNVcaller::read_bed_file("/nvme/bams/Twist_Comprehensive_Exome_Merged_Probes_hg19_liftover_sorted.bed",intervals,0); openfpm::vector<std::string> files_bams; @@ -54,31 +54,37 @@ int main(int argc, char* argv[]) files_bams.add("/nvme/bams/batch_1/DE07NGSUKBD138165_96729.bam"); files_bams.add("/nvme/bams/batch_1/DE12NGSUKBD138172_95188.bam"); - openfpm::vector<openfpm::vector<char>> reference; - //read_fasta("/nvme/databases/human_g1k_v37_decoy.fasta",reference); - reference.load("reference_vector"); - openfpm::vector<call_vcf> calls_to_test; - read_CNV_hypothesys("/nvme/databases/GRCh37_hg19_variants_2020-02-25.vcf.gz",calls_to_test); - openfpm::vector<openfpm::vector<seqs_call>> seqs; - CNV_event_test(calls_to_test,reference,seqs,files_bams,6); - - return 0; + openfpm::vector<aggregate<openfpm::vector<int>>> chr_ins; int N_samples = 
 
     int N_samples = files_bams.size();
+    int N_chr = 24;
+
 #ifndef SKIP_LOAD
 
     // Read break points
-    read_splits.resize(N_samples);
+    /* read_splits.resize(N_samples);
+    chr_ins.resize(N_samples);
     for (int i = 0 ; i < files_bams.size() ; i++) {
-        read_break_points(files_bams.get(i),read_splits.get(i));
+        CNVcaller::read_break_points(files_bams.get(i),read_splits.get(i),N_chr,chr_ins.get<0>(i));
         std::cout << "Break intervals: " << intervals.size() << std::endl;
-        break_intervals(intervals,read_splits.get(i));
+        CNVcaller::break_intervals(intervals,read_splits.get(i));
         std::cout << "New intervals: " << intervals.size() << std::endl;
-    }
+    }*/
 #endif
 
+    chr_ins.load("chr_ins_vector");
+    openfpm::vector<openfpm::vector<char>> reference;
+    //read_fasta("/nvme/databases/human_g1k_v37_decoy.fasta",reference);
+    reference.load("reference_vector");
+    openfpm::vector<CNVcaller::call_vcf> calls_to_test;
+    CNVcaller::read_CNV_hypothesys("/nvme/databases/GRCh37_hg19_variants_2020-02-25_one_CNV.vcf.gz",calls_to_test);
+    //openfpm::vector<openfpm::vector<CNVcaller::seqs_call>> seqs;
+    CNVcaller::CNV_event_test(calls_to_test,reference,files_bams,6,chr_ins);
+
+    return 0;
+
     constexpr int N_int_opt = 1;
     constexpr int N_sample_opt = 10;
 
@@ -88,21 +94,16 @@ int main(int argc, char* argv[])
     openfpm::vector<aggregate<float[N_sample_opt]>> confidence2;
     openfpm::vector<int> counts;
 
-    call_single_interval_mean_depth<N_int_opt,N_sample_opt>(intervals,files_bams,counts,solutions,confidence);
-    call_single_interval_mean_depth_local<N_int_opt,N_sample_opt>(intervals,files_bams,counts,solutions,solutions2,confidence2);
+    CNVcaller::call_single_interval_mean_depth<N_int_opt,N_sample_opt>(intervals,files_bams,counts,solutions,confidence);
+    CNVcaller::call_single_interval_mean_depth_local<N_int_opt,N_sample_opt>(intervals,files_bams,counts,solutions,solutions2,confidence2);
 
-    openfpm::vector<openfpm::vector<call_vcf>> calls;
+    openfpm::vector<openfpm::vector<CNVcaller::call_vcf>> calls;
     calls.resize(N_sample_opt);
 
     // Get only overlapping and consistent calls
     for (int j = 0 ; j < N_samples ; j++) {
         for (int i = 0 ; i < solutions.size() ; i++) {
-            if (intervals.get<1>(i) == 109650540){
-                int debug = 0;
-                debug++;
-            }
-
             if (solutions.get<0>(i)[j+1] != 2 && solutions2.get<0>(i)[j+1] == solutions.get<0>(i)[j+1]) {
                 calls.get(j).add();
                 calls.get(j).last().chromosome = intervals.get<0>(i);
diff --git a/example/Sequencing/CNVcaller/testing.cpp b/example/Sequencing/CNVcaller/testing.cpp
new file mode 100644
index 000000000..2382594dc
--- /dev/null
+++ b/example/Sequencing/CNVcaller/testing.cpp
@@ -0,0 +1,46 @@
+#define BOOST_TEST_DYN_LINK
+#include <boost/test/unit_test.hpp>
+#include "functions.hpp"
+
+BOOST_AUTO_TEST_SUITE( CNV_caller_test_suite )
+
+BOOST_AUTO_TEST_CASE( CNV_caller_test_add_32bases_pattern )
+{
+    openfpm::vector<openfpm::vector<char>> reference;
+    tsl::hopscotch_map<size_t, openfpm::vector<CNVcaller::CNV_event_link>> map;
+
+    reference.load("reference_vector");
+
+    // chromosome 0
+    add_32bases_pattern(0,44605910,reference,map,-1,-2);
+
+    BOOST_REQUIRE_EQUAL(map.size(),1);
+    auto i = map.begin();
+    BOOST_REQUIRE_EQUAL(i->first >> 62,0);
+    BOOST_REQUIRE_EQUAL(i->first & 0x3,3);
+
+
+
+    map.clear();
+
+    add_32bases_pattern(0,44605911,reference,map,-1,-2);
+
+    BOOST_REQUIRE_EQUAL(map.size(),1);
+    i = map.begin();
+    BOOST_REQUIRE_EQUAL(i->first >> 62,3);
+    BOOST_REQUIRE_EQUAL(i->first & 0x03,2);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+
+// initialization function:
+bool init_unit_test()
+{
+    return true;
+}
+
+// entry point:
+int main(int argc, char* argv[])
+{
+    return boost::unit_test::unit_test_main( &init_unit_test, argc, argv );
+}
\ No newline at end of file
--
GitLab