diff --git a/configure b/configure index 71ff09bff618a95464435628f077fb9b2a1c8e7e..17ada5078e9b3ff4169102070147ed39640a243a 100755 --- a/configure +++ b/configure @@ -596,10 +596,10 @@ cd build ## remove enerything echo "Calling cmake ../. $conf_options" -printf "cmake ../. $conf_options" > cmake_build_options +printf "cmake ../. $conf_options" > cmake_build_options rm ../openfpm_devices/error_code rm ../error_code -DYLD_LIBRARY_PATH=$ld_lib_pathopt cmake ../. $conf_options +DYLD_LIBRARY_PATH=$ld_lib_pathopt cmake ../. --trace $conf_options if [ $? != 0 ]; then #ok something went wrong the install script analyze the return code to potentially fix the problem automatically # Read the error code and exit with that diff --git a/example/Sequencing/CNVcaller/functions.hpp b/example/Sequencing/CNVcaller/functions.hpp index 3cabc347ced20818faad61e81f1deb2289f5142e..f58cf4c855ed388eebd1be3f1d27fd92be5e9519 100644 --- a/example/Sequencing/CNVcaller/functions.hpp +++ b/example/Sequencing/CNVcaller/functions.hpp @@ -2431,7 +2431,7 @@ void collect_reads_for_call_evidence_forward(std::string & bam_file, } if (seqs.get<0>(cid).get<0>(last).size() < aln->core.l_qname) { - seqs.get<0>(cid).get<0>(last).resize(aln->core.l_qname); + seqs.get<0>(cid).get<0>(last).resize(aln->core.l_qname+1); } for (int s = 0 ; s < aln->core.l_qname ; s++) { seqs.get<0>(cid).get<0>(last).template get<1>(s) = read_name[s]; @@ -2469,7 +2469,8 @@ void collect_reads_for_call_evidence_forward(std::string & bam_file, } void collect_reads_for_call_evidence_reverse(std::string & bam_file, - Sample_call_related_relevant_read_type & seqs) { + Sample_call_related_relevant_read_type & seqs, + int num_calls) { int actual_chr = -1; @@ -2508,6 +2509,9 @@ void collect_reads_for_call_evidence_reverse(std::string & bam_file, } } + openfpm::vector<openfpm::vector<size_t>> read_to_calls; + read_to_calls.resize(num_calls); + map_reads.clear(); for (int i = 0 ; i < seqs.size() ; i++) { @@ -2519,7 +2523,20 @@ void collect_reads_for_call_evidence_reverse(std::string & bam_file, k++; } - map_reads[str] = ((size_t)i << 32) + j; + // if the entry str in map_reads already exists it means that the read has been selected by two calls + + if (map_reads.find(str) != map_reads.end()) { + // found + size_t id = map_reads[str]; + + read_to_calls.get(id).add(((size_t)i << 32) + j); + } + else { + read_to_calls.add(); + read_to_calls.last().add(((size_t)i << 32) + j); + + map_reads[str] = read_to_calls.size()-1; + } } } @@ -2536,7 +2553,7 @@ void collect_reads_for_call_evidence_reverse(std::string & bam_file, auto flag = aln->core.flag; uint8_t *q = bam_get_seq(aln); //quality string - uint32_t q2 = aln->core.qual ; + uint32_t q2 = aln->core.qual; short int chr_int = chr_to_chr_int(chr); if (chr_int == -1) {continue;} @@ -2546,18 +2563,22 @@ void collect_reads_for_call_evidence_reverse(std::string & bam_file, if (map_reads.find(str) != map_reads.end()) { // found - int j = it->second & 0xffffffff; - int i = it->second >> 32; + int id = it->second; - if ((flag & 0x20) == 0) { - if (seqs.get<0>(i).get<0>(j).size() < len) { - int old_len = seqs.get<0>(i).get<0>(j).size(); - seqs.get<0>(i).get<0>(j).resize(len); - seqs.get<0>(i).get<0>(j).get<0>(old_len) = -1; - } + for (int v = 0 ; v < read_to_calls.get(id).size() ; v++) { + int j = read_to_calls.get(id).get(v) & 0xffffffff; + int i = read_to_calls.get(id).get(v) >> 32; - for (int k = 0 ; k < len ; k++) { - seqs.get<0>(i).get<0>(j).get<1>(k) = seq_nt16_str[bam_seqi(q,k)]; + if ((flag & 0x20) == 0) { + if (seqs.get<0>(i).get<0>(j).size() < len) { + int old_len = seqs.get<0>(i).get<0>(j).size(); + seqs.get<0>(i).get<0>(j).resize(len); + seqs.get<0>(i).get<0>(j).get<0>(old_len) = -1; + } + + for (int k = 0 ; k < len ; k++) { + seqs.get<0>(i).get<0>(j).get<1>(k) = seq_nt16_str[bam_seqi(q,k)]; + } } } } @@ -2585,7 +2606,7 @@ void collect_reads_for_call_evidence(std::string & bam_file, seqs.resize(calls.size()); collect_reads_for_call_evidence_forward(bam_file,map,calls,seqs,reference,padding,sub_len); - collect_reads_for_call_evidence_reverse(bam_file,seqs); + collect_reads_for_call_evidence_reverse(bam_file,seqs,calls.size()); // std::cout << "COMPLETED " << bam_file << std::endl; @@ -2757,7 +2778,8 @@ size_t create_seed_variation_from_variant(int chrom, int n_alt = std::strlen(alt); size_t bases_pattern_hole = bases_pattern; - for (int k = 0 ; k < n_ref ; k++) { + int k = (ref[0] == alt[0])?1:0; + for ( ; k < n_ref && (pos_inv-k) >= 0 ; k++) { bases_pattern_hole &= ~((size_t)0x3 << ((pos_inv-k)*2)); } @@ -2768,7 +2790,7 @@ size_t create_seed_variation_from_variant(int chrom, bases_pattern_2 = bases_pattern_2 >> (pos+n_alt)*2; // nullify the bases - for (int k = pos ; k < 32 ; k++) { + for (int k = pos+(ref[0] == alt[0]) ; k < 32 ; k++) { bases_pattern_hole &= ~((size_t)0x3 << ((31-k)*2)); } @@ -3042,6 +3064,9 @@ double calculate_event_probability_between_to_sequences(char * seq1, double reading_error) { double align_probability = 0.0; int len = strlen(seq1); + int len2 = strlen(seq2); + + len = (len2 < len)?len2:len; for(size_t j=0;j<len;j++) { align_probability += (seq1[j] == seq2[j])?(log(1.0-reading_error)):log(reading_error); @@ -3057,7 +3082,13 @@ double insert_probability(int ins, openfpm::vector<double> & bp_insert_prob) { return 1.7e-8; } -double calculate_event_probability(char * seq1, +struct ProbsRead { + double log_align_probability; + double log_pair_probability; + double log_insert_probability; +}; + +void calculate_event_probability(char * seq1, char * seq2, char * seq3, int expected_insert_size, @@ -3066,7 +3097,9 @@ double calculate_event_probability(char * seq1, double reading_error, openfpm::vector<int> & chr_ins, openfpm::vector<openfpm::vector<char>> & reference, - openfpm::vector<double> & bp_insert_prob) { + int & best_insert, + openfpm::vector<double> & bp_insert_prob, + ProbsRead & probs) { char seq4[16384]; double align_probability = calculate_event_probability_between_to_sequences(seq1,seq2,reading_error); @@ -3097,7 +3130,11 @@ double calculate_event_probability(char * seq1, } } - return align_probability + max; + best_insert = max_i; + + probs.log_align_probability = align_probability; + probs.log_pair_probability = max; + probs.log_insert_probability = log(insert_probability(abs(max_i),bp_insert_prob)); } struct seed_pos{ @@ -3171,6 +3208,37 @@ void get_reference_point(openfpm::vector<seed_id> & seed, } } +void create_event_cigar(openfpm::vector<uint32_t> & cigar, + int i_shift, + int j_shift, + int pos_event, + int end, + call_vcf & call) { + cigar.resize(3); + + cigar.get(0) = (call.start + i_shift - pos_event) << BAM_CIGAR_SHIFT | BAM_CMATCH; + cigar.get(1) = (call.stop+j_shift-call.start-i_shift) << BAM_CIGAR_SHIFT | BAM_CDEL; + cigar.get(2) = end << BAM_CIGAR_SHIFT | BAM_CMATCH; +} + +void create_ref_cigar(openfpm::vector<uint32_t> & cigar, + int len) { + cigar.clear(); + cigar.resize(1); + + cigar.get(0) = len << BAM_CIGAR_SHIFT | BAM_CMATCH; +} + +void adjust_pos_by_event(int & pos, int & pos_event, call_vcf & call, int i ,int j) { + if (pos < call.stop+j && + pos > call.start+i ) { + pos_event = call.start+i -(call.stop+j - pos); + } + else { + pos_event = pos; + } +} + Eigen::ArrayXd Evidence_matrix_log(openfpm::vector<openfpm::vector<char>> & reference, const Pair_read_type & seq, openfpm::vector<aggregate<short int, int, int>> & un_seed, @@ -3180,9 +3248,11 @@ Eigen::ArrayXd Evidence_matrix_log(openfpm::vector<openfpm::vector<char>> & refe int pos, int distance_from_seed, openfpm::vector<int> & chr_ins, + Eigen::ArrayXd & best_insert, openfpm::vector<double> & bp_insert_prob) { Eigen::ArrayXd events; events.resize((2*padding+1)*(2*padding+1) + un_seed.size()); + best_insert.resize(events.size()); // Event 1 reference @@ -3201,42 +3271,49 @@ Eigen::ArrayXd Evidence_matrix_log(openfpm::vector<openfpm::vector<char>> & refe //events(0) = calculate_event_probability(seq2,seq1,seq3,-1,chr,3e-3,chr_ins); std::cout << "Read name: "; - for (int i = 0 ; i < seq.size() ; i++) { + for (int i = 0; i < seq.size() && seq.get<2>(i) != 0 ; i++) { std::cout << seq.get<2>(i); } - std::cout << std::endl; - std::cout << "Distance from seed: " << distance_from_seed << std::endl; + //std::cout << std::endl; - // Event 2 break + //std::cout << "Distance from seed: " << distance_from_seed << std::endl; - int i = 0; - int j = 0; + // Event 2 break for (int i = -padding ; i <= padding ; i++) { for (int j = -padding ; j <= padding ; j++) { int id = (i+padding)*(2*padding+1) + (j+padding); int pos_event; - // check if the start is inside the call - if (pos < call.stop && - pos > call.start ) { - pos_event = call.start -(call.stop - pos); - } - else { - pos_event = pos; - } - get_sequence(reference,chr,pos_event,pos_event+seq.size()+(call.stop+j-call.start-i),call.start+i,call.stop+j,seq1); - events(id) = calculate_event_probability(seq2,seq1,seq3,call.stop-call.start,chr,pos,3e-3,chr_ins,reference,bp_insert_prob); + adjust_pos_by_event(pos,pos_event,call,i,j); + + ProbsRead probs; + int best_insert_; + + // it overlaps + int overlap_1 = std::max(pos,call.start+i); + int overlap_2 = std::min(pos+(int)seq.size(),call.stop+j); + if (overlap_2-overlap_1 > 0) { + get_sequence(reference,chr,pos_event,pos_event+seq.size()+(call.stop+j-call.start-i),call.start+i,call.stop+j,seq1); + calculate_event_probability(seq2,seq1,seq3,call.stop-call.start,chr,pos,3e-3,chr_ins,reference,best_insert_,bp_insert_prob,probs); + events(id) = probs.log_align_probability + probs.log_pair_probability + probs.log_insert_probability; + best_insert(id) = best_insert_; + } else { + events(id) = -5000.0; + best_insert(id) = -1; + } + // Print - if (events(id) > -60.0) { + if (events(id) > -1.0) { std::cout << "--------------- " << i << "," << j << " --------------------------------- event: " << id << std::endl; std::cout << "EVENT ID: " << id << std::endl; - for (int j = 0 ; j < seq.size() ; j++) { + int len = strlen(seq2); + for (int j = 0 ; j < len ; j++) { if (j == distance_from_seed) { std::cout << "*" << seq2[j]; } else { @@ -3245,7 +3322,8 @@ Eigen::ArrayXd Evidence_matrix_log(openfpm::vector<openfpm::vector<char>> & refe } std::cout << std::endl; - for (int j = 0 ; j < seq.size() ; j++) { + len - strlen(seq1); + for (int j = 0 ; j < len ; j++) { if (j == distance_from_seed) { std::cout << "*" << seq1[j]; } else { @@ -3254,7 +3332,7 @@ Eigen::ArrayXd Evidence_matrix_log(openfpm::vector<openfpm::vector<char>> & refe } std::cout << std::endl; - std::cout << " Probability: " << events(id) << std::endl; + std::cout << " Probability: " << events(id) << " align: " << probs.log_align_probability << " pair align: " << probs.log_pair_probability << " insert: " << probs.log_insert_probability << std::endl; } // Create a file @@ -3265,13 +3343,17 @@ Eigen::ArrayXd Evidence_matrix_log(openfpm::vector<openfpm::vector<char>> & refe // for all unaligned points (max 100) for (int i = 0 ; i < un_seed.size() ; i++) { - get_sequence(reference,un_seed.get<0>(i),un_seed.get<1>(i),un_seed.get<1>(i)+32,-1,-1,seq1); - + //get_sequence(reference,un_seed.get<0>(i),un_seed.get<1>(i),un_seed.get<1>(i)+32,-1,-1,seq1); get_sequence(reference,un_seed.get<0>(i),un_seed.get<1>(i)-distance_from_seed,un_seed.get<1>(i)-distance_from_seed+seq.size(),-1,-1,seq1); int id = (2*padding+1)*(2*padding+1) + i; - events(id) = calculate_event_probability(seq2,seq1,seq3,-1,chr,un_seed.get<1>(i)-distance_from_seed+1,3e-3,chr_ins,reference,bp_insert_prob); - if (events(id) > -60.0) { + ProbsRead probs; + int best_insert_; + calculate_event_probability(seq2,seq1,seq3,-1,chr,un_seed.get<1>(i)-distance_from_seed+1,3e-3,chr_ins,reference,best_insert_,bp_insert_prob,probs); + events(id) = probs.log_align_probability + probs.log_pair_probability + probs.log_insert_probability; + best_insert(id) = best_insert_; + + if (events(id) > -1.0) { std::cout << "------------------- un_seed: " << i << " ----------------------------- event: " << id << std::endl; std::cout << id << " SEED position: " << un_seed.get<1>(i) << " SEED ID: " << un_seed.get<2>(i) << std::endl; @@ -3297,7 +3379,7 @@ Eigen::ArrayXd Evidence_matrix_log(openfpm::vector<openfpm::vector<char>> & refe } std::cout << std::endl; - std::cout << " Probability: " << events(id) << std::endl; + std::cout << " Probability: " << events(id) << " align: " << probs.log_align_probability << " pair align: " << probs.log_pair_probability << " insert: " << probs.log_insert_probability << std::endl; } } @@ -3402,8 +3484,10 @@ void CalculateViterbi(openfpm::vector<openfpm::vector<char>> & reference, randomPermutation(seqs.size(),permutation); openfpm::vector<Eigen::ArrayXd> delta_log; + openfpm::vector<Eigen::ArrayXd> insert; openfpm::vector<Eigen::ArrayXd> Pre; delta_log.resize(seqs.size()); + insert.resize(seqs.size()); Pre.resize(seqs.size()); int s = permutation.get(0)/*673*/; @@ -3416,7 +3500,9 @@ void CalculateViterbi(openfpm::vector<openfpm::vector<char>> & reference, // so the formula delta = Initial*Evidence_matrix in logaritm space become // delta_log = log(Initial*Evidemce_matrix) // delta_log = Initial_log + Evidence_matrix_log - delta_log.get(0) = Initial_log.array() + Evidence_matrix_log(reference,seqs.get<0>(s),un_seed,call,padding,seqs.get<1>(s),seqs.get<2>(s),seed_pos,chr_ins,bp_insert_prob); + Eigen::ArrayXd best_insert; + + delta_log.get(0) = Initial_log.array() + Evidence_matrix_log(reference,seqs.get<0>(s),un_seed,call,padding,seqs.get<1>(s),seqs.get<2>(s),seed_pos,chr_ins,best_insert,bp_insert_prob); int x; auto mC = delta_log.get(0).maxCoeff(&x); std::cout << "BEST event: " << x << " " << mC << std::endl; @@ -3474,7 +3560,8 @@ void CalculateViterbi(openfpm::vector<openfpm::vector<char>> & reference, int x; double coeff = next_transition.maxCoeff(&x); - Eigen::ArrayXd eml = Evidence_matrix_log(reference,seqs.get<0>(s),un_seed,call,padding,seqs.get<1>(s),seqs.get<2>(s),seed_pos,chr_ins,bp_insert_prob); + best_insert.resize(Initial_log.size()); + Eigen::ArrayXd eml = Evidence_matrix_log(reference,seqs.get<0>(s),un_seed,call,padding,seqs.get<1>(s),seqs.get<2>(s),seed_pos,chr_ins,best_insert,bp_insert_prob); auto mC = eml.maxCoeff(&x); std::cout << "Best event: " << x << " " << mC << std::endl; std::cout << "Next transition: " << eml << std::endl; diff --git a/example/Sequencing/CNVcaller/testing.cpp b/example/Sequencing/CNVcaller/testing.cpp index e9e196d80ec0a012e116263765579682f1b1ad74..a0664974ffeaeb0f2cbaad51db6fda555465b9c2 100644 --- a/example/Sequencing/CNVcaller/testing.cpp +++ b/example/Sequencing/CNVcaller/testing.cpp @@ -304,15 +304,50 @@ BOOST_AUTO_TEST_CASE( CNV_caller_test_add_32bases_pattern_with_variability_range BOOST_REQUIRE(map.find(s4_s)->second.get(0).padding_left == 32); } +} - // print all seeds in map and check that align with the expected +BOOST_AUTO_TEST_CASE( CNV_caller_test_add_32bases_pattern_with_variability_range_shift_big_del ) +{ + openfpm::vector<openfpm::vector<char>> reference; + tsl::hopscotch_map<size_t, openfpm::vector<CNVcaller::CNV_event_link>> map; -/* auto i = map.begin(); + reference.load("reference_vector"); - while (i != map.end()) { - std::cout << CNVcaller::string_from_32bases_pattern(i->first) << std::endl; - i++; - }*/ + // chromosome 3 // 195452062 + add_32bases_pattern_with_variability_range_shift(2,195452504,195452504+2,8,reference,map,"test/gnomad_variants_variability_test.vcf.gz",77,195452504-195451992+32,NO_PADDING); + + BOOST_REQUIRE_EQUAL(map.size(),5); + + // From 30 + + { + // check that these seed are presents + std::string s1("CCTCAGAGAGCAGCGCCTCTTCCGACGGCCTC"); // reference + std::string s2("CCTCTCAGAGAGCAGCGCCTCTTCCGACGGCC"); // variants + std::string s3("CCTCAGAGAGCAGCGCCTCTTCCGACATCCAG"); + std::string s4("CCTCAGAGAGCAGCGCCTCTTCCGACAGCCTC"); + std::string s5("CCTCAGAGAGCAGCGCCTCTTCCGACGGCCCC"); + + + size_t s1_s = CNVcaller::convert_to_bit_pattern(s1.c_str()); + size_t s2_s = CNVcaller::convert_to_bit_pattern(s2.c_str()); + size_t s3_s = CNVcaller::convert_to_bit_pattern(s3.c_str()); + size_t s4_s = CNVcaller::convert_to_bit_pattern(s4.c_str()); + size_t s5_s = CNVcaller::convert_to_bit_pattern(s5.c_str()); + + BOOST_REQUIRE(map.find(s1_s) != map.end()); + BOOST_REQUIRE(map.find(s2_s) != map.end()); + BOOST_REQUIRE(map.find(s3_s) != map.end()); + BOOST_REQUIRE(map.find(s4_s) != map.end()); + BOOST_REQUIRE(map.find(s5_s) != map.end()); + + + BOOST_REQUIRE_EQUAL(map.find(s1_s)->second.get(0).padding_left,544); + BOOST_REQUIRE_EQUAL(map.find(s2_s)->second.get(0).padding_left,544); + BOOST_REQUIRE_EQUAL(map.find(s3_s)->second.get(0).padding_left,544); + BOOST_REQUIRE_EQUAL(map.find(s4_s)->second.get(0).padding_left,544); + BOOST_REQUIRE_EQUAL(map.find(s5_s)->second.get(0).padding_left,544); + } } BOOST_AUTO_TEST_CASE( CNV_caller_test_32_bases_to_string ) @@ -754,7 +789,10 @@ BOOST_AUTO_TEST_CASE( CNV_caller_tests_calculate_event_probability ) chr_ins.get(i) = 210; } - double probability = CNVcaller::calculate_event_probability(seq2,seq1,seq3,-1,chr,pos,3e-3,chr_ins,reference,bp_insert_prob); + CNVcaller::ProbsRead probs; + int best_insert; + CNVcaller::calculate_event_probability(seq2,seq1,seq3,-1,chr,pos,3e-3,chr_ins,reference,best_insert,bp_insert_prob,probs); + double probability = probs.log_align_probability + probs.log_pair_probability + probs.log_insert_probability; BOOST_REQUIRE_CLOSE(probability,-13.923319055604425,0.001); } @@ -911,6 +949,7 @@ BOOST_AUTO_TEST_CASE( CNV_caller_evidence_probability_for_each_event ) seed_pos = seqs.get<0>(0).get<0>(j).get<4>(i); + Eigen::ArrayXd insert; Eigen::ArrayXd eml3 = Evidence_matrix_log(reference, seqs.get<0>(0).get<0>(j).get<0>(i), un_seed.get(j), @@ -920,6 +959,7 @@ BOOST_AUTO_TEST_CASE( CNV_caller_evidence_probability_for_each_event ) seqs.get<0>(0).get<0>(j).get<2>(i), seed_pos, chr_ins, + insert, bp_insert_prob); std::cout << "Probability: " << eml3 << std::endl; @@ -1003,8 +1043,11 @@ BOOST_AUTO_TEST_CASE( CNV_caller_evidence_probability_for_each_event_with_variab for (int j = 0 ; j < seqs.get<0>(0).size() ; j++) { for (int i = 0 ; i < seqs.get<0>(0).get<0>(j).size() ; i++) { + std::cout << "j " << j << "/" << seqs.get<0>(0).size() << " i " << "/" << seqs.get<0>(0).get<0>(j).size() << std::endl; + size_t seed_pos = seqs.get<0>(0).get<0>(j).get<4>(i); + Eigen::ArrayXd insert; Eigen::ArrayXd eml3 = Evidence_matrix_log(reference, seqs.get<0>(0).get<0>(j).get<0>(i), un_seed.get(j), @@ -1014,10 +1057,9 @@ BOOST_AUTO_TEST_CASE( CNV_caller_evidence_probability_for_each_event_with_variab seqs.get<0>(0).get<0>(j).get<2>(i), seed_pos, chr_ins, + insert, bp_insert_prob); - std::cout << "Probability: " << eml3 << std::endl; - BOOST_REQUIRE(eml3(59) > -20.0 ); BOOST_REQUIRE(eml3(121) < -20.0); } @@ -1042,7 +1084,9 @@ BOOST_AUTO_TEST_CASE( CNV_caller_evidence_probability_for_each_event_with_variab int offset = 0; // call id 1 - add_32bases_pattern_with_variability_range_shift(2,195452561,195452561+32,8,reference,map,"test/gnomad_variants_variability_test.vcf.gz",76,195452561-195451992+32,NO_PADDING); + add_32bases_pattern_with_variability_range_shift(2,195452504,195452504+32,8,reference,map,"test/gnomad_variants_variability_test.vcf.gz",76,195452504-195451992+32,NO_PADDING); + + std::cout << "CALL " << calls_to_test.get(76).chromosome << ":" << calls_to_test.get(76).start << "-" << calls_to_test.get(76).stop << std::endl; auto i = map.begin(); @@ -1102,30 +1146,427 @@ BOOST_AUTO_TEST_CASE( CNV_caller_evidence_probability_for_each_event_with_variab // Get the seed number associated to the call int seed_num = 0; + int padding = 6; + // For rach read + int n_read_check = 0; for (int j = 0 ; j < seqs.get<0>(0).size() ; j++) { for (int i = 0 ; i < seqs.get<0>(0).get<0>(j).size() ; i++) { size_t seed_pos = seqs.get<0>(0).get<0>(j).get<4>(i); + Eigen::ArrayXd insert; Eigen::ArrayXd eml3 = Evidence_matrix_log(reference, seqs.get<0>(0).get<0>(j).get<0>(i), un_seed.get(j), calls_to_test.get(j), - 6, + padding, seqs.get<0>(0).get<0>(j).get<1>(i), seqs.get<0>(0).get<0>(j).get<2>(i), seed_pos, chr_ins, + insert, bp_insert_prob); - std::cout << "Probability: " << eml3 << std::endl; + double best_event = -1000.0; + int k = 0; + double best_ref = -1000.0; + int best_ref_seed_pos = -1; + + for (int i = 0 ; i < (2*padding+1)*(2*padding+1) ; i++) { + if (eml3(i) > best_event) { + best_event = eml3(i); + } + } + + for (int i = (2*padding+1)*(2*padding+1) ; i < eml3.size() ; i++) { + if (eml3(i) > best_ref) { + best_ref = eml3(i); + k = i; + best_ref_seed_pos = un_seed.get(j).get<1>(i-(2*padding+1)*(2*padding+1)); + } + } + + char seq[64]; + memset(seq,0,64); + + for (int k = 0 ; seqs.get<0>(0).get<0>(j).get<0>(i).get<2>(k) != 0 ; k++) { + seq[k] = seqs.get<0>(0).get<0>(j).get<0>(i).get<2>(k); + } + + std::cout << " j=" << j << "/" << seqs.get<0>(0).size() << " i=" << i << "/" << seqs.get<0>(0).get<0>(j).size() << + " Event vs Reference " << best_event << "/" << best_ref << " " << " best ref seed pos: " << best_ref_seed_pos << " " + << ((best_event > best_ref)?"#":"") << std::endl; + + if (strcmp("A00684:261:HNJK2DRX2:1:2264:6659:10332",&seq[0]) == 0) { + BOOST_REQUIRE(best_event > best_ref); + n_read_check++; + } else if (strcmp("A00684:261:HNJK2DRX2:1:2150:3233:9878",&seq[0]) == 0) { + BOOST_REQUIRE(best_event > best_ref); + n_read_check++; + } else if (strcmp("A00684:261:HNJK2DRX2:2:2205:4806:33144",&seq[0]) == 0) { + BOOST_REQUIRE(best_event > best_ref); + n_read_check++; + } else if (strcmp("A00684:261:HNJK2DRX2:2:2245:6406:36198",&seq[0]) == 0) { + BOOST_REQUIRE(best_event > best_ref); + n_read_check++; + } else if (strcmp("A00684:261:HNJK2DRX2:2:2250:27896:15702",&seq[0]) == 0) { + BOOST_REQUIRE(best_event > best_ref); + n_read_check++; + } + + } } + BOOST_REQUIRE_EQUAL(n_read_check,5); } +BOOST_AUTO_TEST_CASE(write_a_bam_file) +{ + std::string bam_file("test/HG002_reads_cnv_MUC20.bam"); + + samFile *fp_in = hts_open(bam_file.c_str(),"r"); + samFile *fp_out = hts_open("test.bam","wb"); //open bam file + bam_hdr_t *bamHdr = sam_hdr_read(fp_in); //read header + int r; + r = sam_hdr_write(fp_out, bamHdr); + bam1_t *aln = bam_init1(); //initialize an alignment + + const char *qname = "q1"; + const uint32_t cigar[] = { 6 << BAM_CIGAR_SHIFT | BAM_CMATCH, 2 << BAM_CIGAR_SHIFT | BAM_CINS, 2 << BAM_CIGAR_SHIFT | BAM_CMATCH }; + const char *seq = "TGGACTACGA"; + const char *qual = "FFFFFFFFFF"; + + bam_set1(aln, strlen(qname), qname, + BAM_FPAIRED | BAM_FPROPER_PAIR | BAM_FMREVERSE | BAM_FREAD2, 2, 195452541, 42, + sizeof(cigar) / 4, cigar, 2, 195452561, 42, + strlen(seq), seq, qual, 64); + + r = sam_write1(fp_out, bamHdr, aln); + + bam_set1(aln, strlen(qname), qname, + BAM_FPAIRED | BAM_FPROPER_PAIR | BAM_FREVERSE | BAM_FREAD1, 2, 195452561, 42, + sizeof(cigar) / 4, cigar, 2, 195452541, 42, + strlen(seq), seq, qual, 64); + + r = sam_write1(fp_out, bamHdr, aln); + + bam_destroy1(aln); + + // close file + hts_close(fp_out); +} + +// Many read events +BOOST_AUTO_TEST_CASE( CNV_caller_evidence_probability_for_each_event_with_variability_many_reads_2calls ) +{ + // + std::string bam_file("test/HG002_reads_cnv_MUC20.bam"); + + samFile *fp_in = hts_open(bam_file.c_str(),"r"); + + samFile *fp_out77_best = hts_open("call77_best.bam","wb"); //open bam file + samFile *fp_out78_best = hts_open("call78_best.bam","wb"); //open bam file + samFile *fp_out79_best = hts_open("call79_best.bam","wb"); //open bam file + + samFile *fp_out77_event = hts_open("call77_event.bam","wb"); //open bam file + samFile *fp_out78_event = hts_open("call78_event.bam","wb"); //open bam file + samFile *fp_out79_event = hts_open("call79_event.bam","wb"); //open bam file + + samFile *fp_out77_ref = hts_open("call77_ref.bam","wb"); //open bam file + samFile *fp_out78_ref = hts_open("call78_ref.bam","wb"); //open bam file + samFile *fp_out79_ref = hts_open("call79_ref.bam","wb"); //open bam file + + bam_hdr_t *bamHdr = sam_hdr_read(fp_in); //read header + int r; + + r = sam_hdr_write(fp_out77_best, bamHdr); + r = sam_hdr_write(fp_out78_best, bamHdr); + r = sam_hdr_write(fp_out79_best, bamHdr); + + r = sam_hdr_write(fp_out77_event, bamHdr); + r = sam_hdr_write(fp_out78_event, bamHdr); + r = sam_hdr_write(fp_out79_event, bamHdr); + + r = sam_hdr_write(fp_out77_ref, bamHdr); + r = sam_hdr_write(fp_out78_ref, bamHdr); + r = sam_hdr_write(fp_out79_ref, bamHdr); + + bam1_t *aln = bam_init1(); //initialize an alignment + + openfpm::vector<openfpm::vector<char>> reference; + reference.load("reference_vector"); + + openfpm::vector<CNVcaller::call_vcf> calls_to_test; + CNVcaller::read_CNV_hypothesys("test/DGV_variants_MUC20_2calls.vcf.gz",calls_to_test); + tsl::hopscotch_map<size_t, openfpm::vector<CNVcaller::CNV_event_link>> map; + + All_samples_call_related_relevant_read_type seqs; + + // remeber is 0-based index, call id is 0, padding between call-start and seed is -1 + int offset = 0; + + // call id 1 + add_32bases_pattern_with_variability_range_shift(2,195452504,195452504+32,8,reference,map,"test/gnomad_variants_variability_test.vcf.gz",77,195452504-195451992+32,NO_PADDING); + add_32bases_pattern_with_variability_range_shift(2,195452561,195452561+32,8,reference,map,"test/gnomad_variants_variability_test.vcf.gz",78,195452561-195451992+32,NO_PADDING); + add_32bases_pattern_with_variability_range_shift(2,195452447,195452447+32,8,reference,map,"test/gnomad_variants_variability_test.vcf.gz",79,195452447-195451992+32,NO_PADDING); + + std::cout << "CALL " << calls_to_test.get(77).chromosome << ":" << calls_to_test.get(77).start << "-" << calls_to_test.get(77).stop << std::endl; + std::cout << "CALL " << calls_to_test.get(78).chromosome << ":" << calls_to_test.get(78).start << "-" << calls_to_test.get(78).stop << std::endl; + std::cout << "CALL " << calls_to_test.get(79).chromosome << ":" << calls_to_test.get(79).start << "-" << calls_to_test.get(79).stop << std::endl; + + auto i = map.begin(); + + while (i != map.end()) { + std::cout << CNVcaller::string_from_32bases_pattern(i->first) << std::endl; + i++; + } + + seqs.resize(3); + + std::string test_bam("test/HG002_reads_cnv_MUC20.bam"); + collect_reads_for_call_evidence(test_bam,map,calls_to_test,seqs.get<0>(0),reference,6,3); + + // We check that all read are coupled, for all elements in the map + + for (int i = 0 ; i < seqs.size() ; i++) { + for (int j = 0 ; j < seqs.get<0>(i).size() ; j++) { + for (int k = 0 ; k < seqs.get<0>(i).get<0>(j).size() ; k++) { + auto & pair = seqs.get<0>(i).get<0>(j).get<0>(k); + + if (pair.get<1>(3) != 'A' && pair.get<1>(3) != 'C' && pair.get<1>(3) != 'T' && pair.get<1>(3) != 'G') + { + seqs.get<0>(i).get<0>(j).remove(k); + k--; + std::cout << "Error: " << i << " " << j << " " << k << " " << pair.get<1>(3) << std::endl; + } + + BOOST_REQUIRE((pair.get<1>(3) == 'A' || pair.get<1>(3) == 'C' || pair.get<1>(3) == 'T' || pair.get<1>(3) == 'G')); + } + } + } + + openfpm::vector<CNVcaller::seed_id> seed; + convert_seed_map_to_array(seed,map,calls_to_test); + + // Find all reference points + openfpm::vector<openfpm::vector<aggregate<short int, int, int>>> un_seed; + un_seed.resize(calls_to_test.size()); +// get_reference_point(seed,reference,un_seed); +// un_seed.save("test/MUC20_cnv_variability_2_calls"); + un_seed.load("test/MUC20_cnv_variability_2_calls"); + + // Calculate evidence for each of the read pair + + openfpm::vector<int> chr_ins; + chr_ins.resize(23); + + for (int i = 0 ; i < chr_ins.size(); i++) { + chr_ins.get(i) = 210; + } + + // Load insert size probability + + openfpm::vector<double> bp_insert_prob; + bp_insert_prob.load("insert_probability_distribution"); + + // Get the seed number associated to the call + int seed_num = 0; + + int padding = 6; + + int n_read_check = 0; + + // For rach read + for (int j = 0 ; j < seqs.get<0>(0).size() ; j++) { + + samFile *fp_out_best = NULL; + if (j == 77) {fp_out_best = fp_out77_best;} + if (j == 78) {fp_out_best = fp_out78_best;} + if (j == 79) {fp_out_best = fp_out79_best;} + + auto call = calls_to_test.get(j); + + for (int i = 0 ; i < seqs.get<0>(0).get<0>(j).size() ; i++) { + + size_t seed_pos = seqs.get<0>(0).get<0>(j).get<4>(i); + + Eigen::ArrayXd insert_; + Eigen::ArrayXd eml3 = Evidence_matrix_log(reference, + seqs.get<0>(0).get<0>(j).get<0>(i), + un_seed.get(j), + calls_to_test.get(j), + padding, + seqs.get<0>(0).get<0>(j).get<1>(i), + seqs.get<0>(0).get<0>(j).get<2>(i), + seed_pos, + chr_ins, + insert_, + bp_insert_prob); + + const Pair_read_type & seq = seqs.get<0>(0).get<0>(j).get<0>(i); + + double best_event = -1000.0; + double best_ref = -1000.0; + int best_ref_seed_pos = -1; + int best_ei = -1; + + for (int i = 0 ; i < (2*padding+1)*(2*padding+1) ; i++) { + if (eml3(i) > best_event) { + best_event = eml3(i); + best_ei = i; + } + } + + int best_ri = -1; + + for (int i = (2*padding+1)*(2*padding+1) ; i < eml3.size() ; i++) { + if (eml3(i) > best_ref) { + best_ref = eml3(i); + best_ref_seed_pos = un_seed.get(j).get<1>(i-(2*padding+1)*(2*padding+1)); + best_ri = i; + } + } + + char seq1[16384]; + char seq2[16384]; + char name[16384]; + memset(seq1,0,16384); + int kf = 0; + int kr = 0; + int kn = 0; + + { + auto & seq = seqs.get<0>(0).get<0>(j).get<0>(i); + for ( ; kf < seq.size() && seq.get<0>(kf) != 0 ; kf++) { + seq1[kf] = seq.get<0>(kf); + } + + for ( ; kr < seq.size() && seq.get<1>(kr) != 0; kr++) { + seq2[kr] = seq.get<1>(kr); + } + + for ( ; kn < seq.size() && seq.get<2>(kn) != 0 ; kn++) { + name[kn] = seq.get<2>(kn); + } + } + + int chr = seqs.get<0>(0).get<0>(j).get<1>(i); + int pos = seqs.get<0>(0).get<0>(j).get<2>(i); + + char qual[16384]; + memset(seq1,'F',16384); + + int pos_event; + int i_shift = best_ei/(2*padding+1) - padding; + int j_shift = best_ei%(2*padding+1) - padding; + + if (best_ref_seed_pos == 195452504 || best_ref_seed_pos == 195452561 || best_ref_seed_pos == 195452447) { + int debug = 0; + debug++; + } + + CNVcaller::adjust_pos_by_event(pos,pos_event,call,i_shift,j_shift); + + int overlap_1 = std::max(pos,call.start+i_shift); + int overlap_2 = std::min(pos+(int)seq.size(),call.stop+j_shift); + if (overlap_2-overlap_1 > 0) { + + int end = kf - (call.start+i_shift - pos_event); + + CNVcaller::get_sequence(reference,chr,pos_event,call.stop+j_shift+end,call.start+i_shift,call.stop+j_shift,seq1); + + // create event CIGAR + + openfpm::vector<uint32_t> cigar; + + int diff = abs((best_event - best_ref)*10.0); + + if (diff > 255) { + diff = 255; + } + + if (best_event > best_ref) { + CNVcaller::create_event_cigar(cigar,i_shift,j_shift,pos_event,end,call); + + int insert = insert_(best_ei); + + // write forward read + bam_set1(aln, kn, name, + BAM_FPAIRED | BAM_FPROPER_PAIR | BAM_FMREVERSE | BAM_FREAD2, chr, pos_event, diff, + cigar.size(), &cigar.get(0), chr, pos + insert, insert, + kf, seq1, qual, 64); + + + r = sam_write1(fp_out_best, bamHdr, aln); + + // write reverse read + + CNVcaller::create_ref_cigar(cigar,kr); + + bam_set1(aln, kn, name, + BAM_FPAIRED | BAM_FPROPER_PAIR | BAM_FREVERSE | BAM_FREAD1, chr, pos + insert, 42, + cigar.size(), &cigar.get(0), chr, pos, insert, + kr, seq2, qual, 64); + + r = sam_write1(fp_out_best, bamHdr, aln); + + } else { + CNVcaller::create_ref_cigar(cigar,kf); + + int insert = insert_(best_ri); + int pos = un_seed.get<1>(best_ri - (2*padding+1)*(2*padding+1))-distance_from_seed + + // write forward read + bam_set1(aln, kn, name, + BAM_FPAIRED | BAM_FPROPER_PAIR | BAM_FMREVERSE | BAM_FREAD2, chr, pos_event, diff, + cigar.size(), &cigar.get(0), chr, pos + insert, insert, + kf, seq1, qual, 64); + + + r = sam_write1(fp_out_best, bamHdr, aln); + + // write reverse read + + CNVcaller::create_ref_cigar(cigar,kr); + + bam_set1(aln, kn, name, + BAM_FPAIRED | BAM_FPROPER_PAIR | BAM_FREVERSE | BAM_FREAD1, chr, pos + insert, 42, + cigar.size(), &cigar.get(0), chr, pos, insert, + kr, seq2, qual, 64); + + r = sam_write1(fp_out_best, bamHdr, aln); + } + } + + if (strcmp("A00684:261:HNJK2DRX2:1:2264:6659:10332",&name[0]) == 0) { + BOOST_REQUIRE(best_event > best_ref); + n_read_check++; + } else if (strcmp("A00684:261:HNJK2DRX2:1:2150:3233:9878",&name[0]) == 0) { + BOOST_REQUIRE(best_event > best_ref); + n_read_check++; + } else if (strcmp("A00684:261:HNJK2DRX2:2:2205:4806:33144",&name[0]) == 0) { + BOOST_REQUIRE(best_event > best_ref); + n_read_check++; + } else if (strcmp("A00684:261:HNJK2DRX2:2:2245:6406:36198",&name[0]) == 0) { + BOOST_REQUIRE(best_event > best_ref); + n_read_check++; + } else if (strcmp("A00684:261:HNJK2DRX2:2:2250:27896:15702",&name[0]) == 0) { + BOOST_REQUIRE(best_event > best_ref); + n_read_check++; + } + + std::cout << " j=" << j << "/" << seqs.get<0>(0).size() << " i=" << i << "/" << seqs.get<0>(0).get<0>(j).size() << + " Event vs Reference " << best_event << "/" << best_ref << " " << " best ref seed pos: " << best_ref_seed_pos << " " + << ((best_ref_seed_pos == 195452504 || best_ref_seed_pos == 195452561 || best_ref_seed_pos == 195452447)?"*":"") + << ((best_event > best_ref)?"#":"") << std::endl; + } + } + + +} BOOST_AUTO_TEST_SUITE_END()