diff --git a/example/Sequencing/CNVcaller/Makefile b/example/Sequencing/CNVcaller/Makefile index 69d8526cdc6515e1a363be64a4f0ddddf9596cda..a9a0676234083f0181db46a6b3a29ef0bdfc62ef 100644 --- a/example/Sequencing/CNVcaller/Makefile +++ b/example/Sequencing/CNVcaller/Makefile @@ -50,5 +50,5 @@ run: CNV_caller .PHONY: clean all run clean: - rm -f *.o *~ sm_classes/*.o core CNV_caller + rm -f *.o *~ sm_classes/*.o core CNV_caller testing diff --git a/example/Sequencing/CNVcaller/functions.hpp b/example/Sequencing/CNVcaller/functions.hpp index 0de46a936d11b6843533d07117a03d9efd3a3267..2b2de26d71cbc5f3dd3a582d0f77f26323f7a4dd 100644 --- a/example/Sequencing/CNVcaller/functions.hpp +++ b/example/Sequencing/CNVcaller/functions.hpp @@ -13,6 +13,11 @@ #include "hash_map/hopscotch_map.h" #include "hash_map/hopscotch_set.h" #include "sm_classes/SequentialImplementation.h" +#include "htslib/hts.h" +extern "C" { +#include "htslib/synced_bcf_reader.h" +} +#include <string> #define NO_PRINT @@ -33,6 +38,19 @@ typedef openfpm::vector<aggregate<Sample_call_related_relevant_read_type>> All_s namespace CNVcaller { +std::string string_from_32bases_pattern(size_t seq) { + std::string str; + str.resize(32); + for (int i = 31 ; i >= 0 ; i--) { + if ((seq & 0x03) == 0) {str[i] = 'A';} + if ((seq & 0x03) == 1) {str[i] = 'C';} + if ((seq & 0x03) == 2) {str[i] = 'T';} + if ((seq & 0x03) == 3) {str[i] = 'G';} + seq = seq >> 2; + } + return str; +} + short int chr_to_chr_int(const char * chr) { short int chr_int; char chr_base[3]; @@ -84,6 +102,7 @@ template<typename T> T CN_offset(const T & CN) { return (CN - 2.0f)*(CN - 2.0f); } + // template<bool print_terms,int dim, int N_int_opt, int N_sample_opt> // auto model_function(int selected_interval, // Eigen::VectorXd & vars, @@ -1835,7 +1854,7 @@ void read_break_points(std::string & bam_file, int32_t pos = aln->core.pos +1; //left most position of alignment in zero based coordianate (+1) if (aln->core.tid == -1) {continue;} - char *chr = bamHdr->target_name[aln->core.tid] ; //contig name (chromosome) + char *chr = bamHdr->target_name[aln->core.tid]; //contig name (chromosome) uint32_t len = aln->core.l_qseq; //length of the read. int n_cigar = aln->core.n_cigar; // number of cigar operations uint32_t * cigar = (uint32_t *)(aln->data + aln->core.l_qname); @@ -2278,6 +2297,11 @@ void collect_reads_for_call_evidence_forward(std::string & bam_file, uint32_t q2 = aln->core.qual ; auto flag = aln->core.flag; + if ((flag & 0x20) == 0) + { + continue; + } + short int chr_int = chr_to_chr_int(chr); if (chr_int == -1) {continue;} @@ -2312,12 +2336,23 @@ void collect_reads_for_call_evidence_forward(std::string & bam_file, if (fnd->second.get(v).path == 0) { // check the reference std::cout << calls.get(cid).start << " " << fnd->second.get(v).padding_left << " " << i << std::endl; + // This is the start of the read given by the seed position int start = calls.get(cid).start + fnd->second.get(v).padding_left-i-1; + int start_call; + + // check if the start is inside the call + if (start < calls.get(cid).stop && + start > calls.get(cid).start ) { + start_call = calls.get(cid).start -(calls.get(cid).stop - start); + } + else { + start_call = start; + } + + int seed_pos = i-31; int call_distance_from_start = calls.get(cid).start - start; - if (call_distance_from_start > len) {continue;} - n_reads++; if (n_reads % 1000 == 0) {std::cout << "READS to SAVE " << n_reads << " " << chr_int << ":" << pos << std::endl;} @@ -2336,18 +2371,17 @@ void collect_reads_for_call_evidence_forward(std::string & bam_file, SequentialImplementation sw1(seq2,seq1,1,0,-1); sw1.runAlgorithm(); - for (int k = -padding ; k <= padding ; k++) { - for (int s = -padding ; s <= padding ; s++) { - if (k == -1 && s == 0) { - int debug = 0; - debug++; - } + int k = 0; + int s = 0; +// for (int k = -padding ; k <= padding ; k++) { +// for (int s = -padding ; s <= padding ; s++) { + - int base_jump = calls.get(cid).start+k - start; + int base_jump = calls.get(cid).start+k - start_call; for (int j = 0 ; j < len ; j++) { - if (start + j < calls.get(cid).start+k) { - seq3[j] = get_base_from_reference(reference,calls.get(cid).chromosome,start+j); + if (start_call + j < calls.get(cid).start+k) { + seq3[j] = get_base_from_reference(reference,calls.get(cid).chromosome,start_call+j); } else { seq3[j] = get_base_from_reference(reference,calls.get(cid).chromosome,calls.get(cid).stop+s+j-base_jump); } @@ -2388,6 +2422,11 @@ void collect_reads_for_call_evidence_forward(std::string & bam_file, } } + if (best_sw2 > best_sw1) { + int debug = 0; + debug++; + } + if (best_sw1 >= len-sub_len || best_sw2 >= len-sub_len) { seqs.get<0>(cid).add(); int last = seqs.get<0>(cid).size() - 1; @@ -2411,12 +2450,12 @@ void collect_reads_for_call_evidence_forward(std::string & bam_file, seqs.get<0>(cid).get<3>(last) = flag; seqs.get<0>(cid).get<4>(last) = seed_pos; - std::cout << "READ: " << read_name << " " << seq2 << " seed " << bases_seq << " pos " << seed_pos << std::endl; + std::cout << "READ: " << read_name << " " << seq2 << " seed " << bases_seq << " SEQBASES: " << string_from_32bases_pattern(bases_seq) << " pos " << seed_pos << std::endl; goto found_read; } - } - } + // } + //} found_read:; } @@ -2515,7 +2554,7 @@ void collect_reads_for_call_evidence_reverse(std::string & bam_file, int j = it->second & 0xffffffff; int i = it->second >> 32; - if (seqs.get<0>(i).get<3>(j) != flag) { + if ((flag & 0x20) == 0) { if (seqs.get<0>(i).get<0>(j).size() < len) { int old_len = seqs.get<0>(i).get<0>(j).size(); seqs.get<0>(i).get<0>(j).resize(len); @@ -2664,7 +2703,7 @@ int get_32bases_pattern_with_gap(int chrom, return 0; } -int add_32bases_pattern(int chrom, +size_t add_32bases_pattern(int chrom, int start, openfpm::vector<openfpm::vector<char>> & chr_sequences, tsl::hopscotch_map<size_t, openfpm::vector<CNV_event_link>> & map, @@ -2686,10 +2725,46 @@ int add_32bases_pattern(int chrom, cel.pos_seed = start; calls.add(cel); - return 0; + return bases_pattern; } -int add_32bases_pattern_with_gap(int chrom, +size_t add_32bases_pattern_with_variability(int chrom, + int start, + openfpm::vector<openfpm::vector<char>> & chr_sequences, + tsl::hopscotch_map<size_t, openfpm::vector<CNV_event_link>> & map, + char * database, + int call_id, + int padding_left, + int padding_right) { + + size_t bases_pattern; + int ret = get_32bases_pattern(chrom,start,chr_sequences,bases_pattern); + if (ret != 0) return -1; + + std::string str; + str += std::to_string(chrom); + ":" + std::to_string(start+1) + "-" + std::to_string(start+32+1); + + bcf_srs_t *sr = bcf_sr_init(); + + bcf_sr_set_regions(sr,str.c_str(),true); + bcf_sr_add_reader(sr,database); + + openfpm::vector<CNV_event_link> & calls = map[bases_pattern]; + + struct CNV_event_link cel; + cel.call_id = call_id; + cel.padding_left = padding_left; + cel.padding_right = padding_right; + cel.path = 0x0; + cel.pos_seed = start; + calls.add(cel); + + bcf_sr_destroy(sr); + + return bases_pattern; +} + +size_t add_32bases_pattern_with_gap(int chrom, int start, openfpm::vector<openfpm::vector<char>> & chr_sequences, tsl::hopscotch_map<size_t, openfpm::vector<CNV_event_link>> & map, @@ -2713,7 +2788,7 @@ int add_32bases_pattern_with_gap(int chrom, cel.pos_seed = start; calls.add(cel); - return 0; + return bases_pattern; } void get_sequence(openfpm::vector<openfpm::vector<char>> & reference, short int chr , int start, int stop, int p2, int p3, char * seq) { @@ -2774,9 +2849,9 @@ void get_sequence(openfpm::vector<openfpm::vector<char>> & reference, short int double calculate_event_probability_between_to_sequences(char * seq1, char * seq2, - int len, double reading_error) { - double align_probability = 1.0; + double align_probability = 0.0; + int len = strlen(seq1); for(size_t j=0;j<len;j++) { align_probability += (seq1[j] == seq2[j])?(log(1.0-reading_error)):log(reading_error); @@ -2785,34 +2860,46 @@ double calculate_event_probability_between_to_sequences(char * seq1, return align_probability; } +double insert_probability(int ins, openfpm::vector<double> & bp_insert_prob) { + if (abs(ins) < bp_insert_prob.size()) { + return bp_insert_prob.get(abs(ins)); + } + 1.7e-8; +} + double calculate_event_probability(char * seq1, char * seq2, char * seq3, - int len, int expected_insert_size, short int chr_int, int pos, double reading_error, openfpm::vector<int> & chr_ins, - openfpm::vector<openfpm::vector<char>> & reference) { + openfpm::vector<openfpm::vector<char>> & reference, + openfpm::vector<double> & bp_insert_prob) { char seq4[16384]; - double align_probability = calculate_event_probability_between_to_sequences(seq1,seq2,len,reading_error); + double align_probability = calculate_event_probability_between_to_sequences(seq1,seq2,reading_error); + + // get the sequence + int len = strlen(seq1); int max_i; double max = -1.0e300; for (int i = -4*chr_ins.get(chr_int) ; i < 4*chr_ins.get(chr_int) ; i++) { - // get the sequence + get_sequence(reference,chr_int,pos+i,pos+i+len,-1,-1,seq4); if (expected_insert_size != -1) { - double event_p = log(poisson_distribution(expected_insert_size,abs(i))) + calculate_event_probability_between_to_sequences(seq4,seq3,len,reading_error); + // poisson_distribution(expected_insert_size,abs(i)) + double event_p = log(insert_probability(abs(i),bp_insert_prob)) + calculate_event_probability_between_to_sequences(seq4,seq3,reading_error); if (event_p > max) { max = event_p; max_i = i; } } else { - double event_p = log(poisson_distribution(chr_ins.get(chr_int),abs(i))) + calculate_event_probability_between_to_sequences(seq4,seq3,len,reading_error); + // poisson_distribution(chr_ins.get(chr_int),abs(i)) + double event_p = log(insert_probability(abs(i),bp_insert_prob)) + calculate_event_probability_between_to_sequences(seq4,seq3,reading_error); if (event_p > max) { max = event_p; max_i = i; @@ -2820,8 +2907,7 @@ double calculate_event_probability(char * seq1, } } - std::cout << "MAX: " << max << " POS: " << pos << " i: " << max_i << std::endl; - return align_probability + max / 10.0; + return align_probability + max; } struct seed_pos{ @@ -2839,6 +2925,7 @@ struct seed_id{ } }; + void get_reference_point(openfpm::vector<seed_id> & seed, openfpm::vector<openfpm::vector<char>> & reference, openfpm::vector<openfpm::vector<aggregate<short int, int, int>>> & un_seed) { @@ -2847,7 +2934,7 @@ void get_reference_point(openfpm::vector<seed_id> & seed, int latest_n_base = 0; size_t seq = 0; for (int i = 0 ; i < reference.size() ; i++) { - for (int j = 0 ; j < reference.get(i).size() ; j++) { + for (int j = 0 ; j < 2*reference.get(i).size() ; j++) { char base = get_base_from_reference(reference,i,j); if (base == 'N'){ @@ -2879,6 +2966,7 @@ void get_reference_point(openfpm::vector<seed_id> & seed, if (seed.get(k).seed == seq) { for (int m = 0 ; m < seed.get(k).calls.size() ; m++) { int cid = seed.get(k).calls.get(m).cid; + std::cout << "SEED C: " << i << " POS: " << j << " " << k << " CALL ID: " << cid << " " << string_from_32bases_pattern(seq) << std::endl; un_seed.get(cid).add(); int last = un_seed.get(cid).size()-1; un_seed.get(cid).get<0>(last) = i; @@ -2893,7 +2981,6 @@ void get_reference_point(openfpm::vector<seed_id> & seed, } } - Eigen::ArrayXd Evidence_matrix_log(openfpm::vector<openfpm::vector<char>> & reference, const Pair_read_type & seq, openfpm::vector<aggregate<short int, int, int>> & un_seed, @@ -2902,15 +2989,16 @@ Eigen::ArrayXd Evidence_matrix_log(openfpm::vector<openfpm::vector<char>> & refe short int chr, int pos, int distance_from_seed, - openfpm::vector<int> & chr_ins) { + openfpm::vector<int> & chr_ins, + openfpm::vector<double> & bp_insert_prob) { Eigen::ArrayXd events; events.resize((2*padding+1)*(2*padding+1) + un_seed.size()); // Event 1 reference char seq1[16384]; - char seq2[16384]; - char seq3[16384]; + char seq2[16384]; // foward read + char seq3[16384]; // reverse read for (int i = 0 ; i < seq.size() ; i++) { seq2[i] = seq.get<0>(i); @@ -2932,12 +3020,16 @@ Eigen::ArrayXd Evidence_matrix_log(openfpm::vector<openfpm::vector<char>> & refe // Event 2 break - for (int i = -padding ; i <= padding ; i++) { - for (int j = -padding ; j <= padding ; j++) { + int i = 0; + int j = 0; + +// for (int i = -padding ; i <= padding ; i++) { +// for (int j = -padding ; j <= padding ; j++) { int id = (i+padding)*(2*padding+1) + (j+padding); get_sequence(reference,chr,pos,pos+seq.size()+(call.stop+j-call.start-i),call.start+i,call.stop+j,seq1); std::cout << "--------------- " << i << "," << j << " --------------------------------- event: " << id << std::endl; - events(id) = calculate_event_probability(seq2,seq1,seq3,seq.size(),call.stop-call.start,chr,pos,3e-3,chr_ins,reference); + events(id) = calculate_event_probability(seq2,seq1,seq3,call.stop-call.start,chr,pos,3e-3,chr_ins,reference,bp_insert_prob); + std::cout << "EVENT ID: " << id << std::endl; // Print @@ -2960,19 +3052,25 @@ Eigen::ArrayXd Evidence_matrix_log(openfpm::vector<openfpm::vector<char>> & refe std::cout << std::endl; std::cout << " Probability: " << events(id) << std::endl; - } - } + + // Create a file + +// } +// } // Event 3 unaligned // for all unaligned points (max 100) for (int i = 0 ; i < un_seed.size() ; i++) { - get_sequence(reference,un_seed.get<0>(i),un_seed.get<1>(i)-distance_from_seed+1,un_seed.get<1>(i)-distance_from_seed+1+seq.size(),-1,-1,seq1); + get_sequence(reference,un_seed.get<0>(i),un_seed.get<1>(i),un_seed.get<1>(i)+32,-1,-1,seq1); + std::cout << "UNALIGNED SEED: " << std::string(seq1) << std::endl; + + get_sequence(reference,un_seed.get<0>(i),un_seed.get<1>(i)-distance_from_seed,un_seed.get<1>(i)-distance_from_seed+seq.size(),-1,-1,seq1); int id = (2*padding+1)*(2*padding+1) + i; std::cout << "------------------- un_seed: " << i << " ----------------------------- event: " << id << std::endl; - events(id) = calculate_event_probability(seq2,seq1,seq3,seq.size(),-1,chr,un_seed.get<1>(i)-distance_from_seed+1,3e-3,chr_ins,reference); + events(id) = calculate_event_probability(seq2,seq1,seq3,-1,chr,un_seed.get<1>(i)-distance_from_seed+1,3e-3,chr_ins,reference,bp_insert_prob); - std::cout << "SEED position: " << un_seed.get<1>(i) << std::endl; + std::cout << id << " SEED position: " << un_seed.get<1>(i) << " SEED ID: " << un_seed.get<2>(i) << std::endl; // Print @@ -3041,7 +3139,8 @@ void CalculateViterbi(openfpm::vector<openfpm::vector<char>> & reference, openfpm::vector<aggregate<short int, int, int>> & un_seed, call_vcf & call, int padding, - openfpm::vector<int> & chr_ins) { + openfpm::vector<int> & chr_ins, + openfpm::vector<double> & bp_insert_prob) { // User viterbi algorithm to get the sequence of events // There are 6 events @@ -3111,7 +3210,7 @@ void CalculateViterbi(openfpm::vector<openfpm::vector<char>> & reference, // so the formula delta = Initial*Evidence_matrix in logaritm space become // delta_log = log(Initial*Evidemce_matrix) // delta_log = Initial_log + Evidence_matrix_log - delta_log.get(0) = Initial_log.array() + Evidence_matrix_log(reference,seqs.get<0>(s),un_seed,call,padding,seqs.get<1>(s),seqs.get<2>(s),seed_pos,chr_ins); + delta_log.get(0) = Initial_log.array() + Evidence_matrix_log(reference,seqs.get<0>(s),un_seed,call,padding,seqs.get<1>(s),seqs.get<2>(s),seed_pos,chr_ins,bp_insert_prob); int x; auto mC = delta_log.get(0).maxCoeff(&x); std::cout << "BEST event: " << x << " " << mC << std::endl; @@ -3169,7 +3268,7 @@ void CalculateViterbi(openfpm::vector<openfpm::vector<char>> & reference, int x; double coeff = next_transition.maxCoeff(&x); - Eigen::ArrayXd eml = Evidence_matrix_log(reference,seqs.get<0>(s),un_seed,call,padding,seqs.get<1>(s),seqs.get<2>(s),seed_pos,chr_ins); + Eigen::ArrayXd eml = Evidence_matrix_log(reference,seqs.get<0>(s),un_seed,call,padding,seqs.get<1>(s),seqs.get<2>(s),seed_pos,chr_ins,bp_insert_prob); auto mC = eml.maxCoeff(&x); std::cout << "Best event: " << x << " " << mC << std::endl; std::cout << "Next transition: " << eml << std::endl; @@ -3198,6 +3297,26 @@ void CalculateViterbi(openfpm::vector<openfpm::vector<char>> & reference, } } +void convert_seed_map_to_array(openfpm::vector<seed_id> & seed, + tsl::hopscotch_map<size_t, openfpm::vector<CNV_event_link>> & map, + openfpm::vector<call_vcf> & calls) { + for (auto i = map.begin() ; i != map.end() ; i++) { + seed.add(); + int last = seed.size() - 1; + seed.get(last).seed = i->first; + for (int j = 0 ; j < i->second.size() ; j++) { + int cid = i->second.get(j).call_id; + + struct seed_pos tmp; + tmp.cid = cid; + tmp.chr = calls.get(i->second.get(j).call_id).chromosome;; + tmp.pos = i->second.get(j).pos_seed;; + + seed.get(last).calls.add(tmp); + } + } +} + void CNV_event_test(openfpm::vector<call_vcf> & calls, openfpm::vector<openfpm::vector<char>> & reference, openfpm::vector<std::string> & files_bams, @@ -3207,6 +3326,8 @@ void CNV_event_test(openfpm::vector<call_vcf> & calls, char seq2[16384]; char seq3[16384]; + openfpm::vector<double> bp_insert_prob; + bp_insert_prob.load("insert_probability_distribution"); tsl::hopscotch_map<size_t, openfpm::vector<CNV_event_link>> map; map.reserve(calls.size()*2); @@ -3271,21 +3392,7 @@ void CNV_event_test(openfpm::vector<call_vcf> & calls, openfpm::vector<seed_id> seed; openfpm::vector<openfpm::vector<aggregate<short int, int, int>>> un_seed; - for (auto i = map.begin() ; i != map.end() ; i++) { - seed.add(); - int last = seed.size() - 1; - seed.get(last).seed = i->first; - for (int j = 0 ; j < i->second.size() ; j++) { - int cid = i->second.get(j).call_id; - - struct seed_pos tmp; - tmp.cid = cid; - tmp.chr = calls.get(i->second.get(j).call_id).chromosome;; - tmp.pos = i->second.get(j).pos_seed;; - - seed.get(last).calls.add(tmp); - } - } + convert_seed_map_to_array(seed,map,calls); un_seed.resize(calls.size()); get_reference_point(seed,reference,un_seed); @@ -3315,7 +3422,7 @@ void CNV_event_test(openfpm::vector<call_vcf> & calls, for (int i = 0 ; i < seqs.size(); i++) { for (int j = 0 ; j < seqs.get<0>(i).size() ; j++) { - CalculateViterbi(reference,seqs.get<0>(i).get<0>(j),un_seed.get(j),calls.get(i),padding,chr_ins.get<0>(i)); + CalculateViterbi(reference,seqs.get<0>(i).get<0>(j),un_seed.get(j),calls.get(i),padding,chr_ins.get<0>(i),bp_insert_prob); } } } diff --git a/example/Sequencing/CNVcaller/testing.cpp b/example/Sequencing/CNVcaller/testing.cpp index ffbe4a0ace52a21afcfa3a6975c44cb9a22d597d..1a1c67dbc9aa7b32ce8e82fb6b058774ef9495fb 100644 --- a/example/Sequencing/CNVcaller/testing.cpp +++ b/example/Sequencing/CNVcaller/testing.cpp @@ -49,6 +49,72 @@ BOOST_AUTO_TEST_CASE( CNV_caller_test_add_32bases_pattern ) BOOST_REQUIRE_EQUAL(i->second.get(0).padding_right, NO_PADDING); } +BOOST_AUTO_TEST_CASE( CNV_caller_test_add_32bases_pattern_with_variability ) +{ + openfpm::vector<openfpm::vector<char>> reference; + tsl::hopscotch_map<size_t, openfpm::vector<CNVcaller::CNV_event_link>> map; + + reference.load("reference_vector"); + + // chromosome 0 + add_32bases_pattern_with_variability(0,195452030,reference,map,"test/gnomad_variants.vcf.gz",0,-1,NO_PADDING); + + BOOST_REQUIRE_EQUAL(map.size(),1); + auto i = map.begin(); + BOOST_REQUIRE_EQUAL(i->first >> 62,0); + BOOST_REQUIRE_EQUAL(i->first & 0x3,3); + + BOOST_REQUIRE_EQUAL(i->second.size(),1); + + BOOST_REQUIRE_EQUAL(i->second.get(0).padding_left, -1); + BOOST_REQUIRE_EQUAL(i->second.get(0).padding_right, NO_PADDING); + + map.clear(); + + add_32bases_pattern_with_variability(0,195452030,reference,map,"test/gnomad_variants.vcf.gz",1,-2,NO_PADDING); + + BOOST_REQUIRE_EQUAL(map.size(),1); + i = map.begin(); + BOOST_REQUIRE_EQUAL(i->first >> 62,3); + BOOST_REQUIRE_EQUAL(i->first & 0x03,2); + + + + BOOST_REQUIRE_EQUAL(i->second.get(0).padding_left, -2); + BOOST_REQUIRE_EQUAL(i->second.get(0).padding_right, NO_PADDING); +} + +BOOST_AUTO_TEST_CASE( CNV_caller_test_32_bases_to_string ) +{ + openfpm::vector<openfpm::vector<char>> reference; + tsl::hopscotch_map<size_t, openfpm::vector<CNVcaller::CNV_event_link>> map; + + reference.load("reference_vector"); + + // chromosome 0 + add_32bases_pattern(0,44605910,reference,map,0,-1,NO_PADDING); + + auto i = map.begin(); + + std::string bases = CNVcaller::string_from_32bases_pattern(i->first); + char seq[256]; + CNVcaller::get_sequence(reference,0,44605910,44605942,-1,-1,seq); + std::string seq_str(seq); + + BOOST_REQUIRE_EQUAL(bases,seq_str); + + map.clear(); + add_32bases_pattern(0,44605911,reference,map,1,-2,NO_PADDING); + + i = map.begin(); + + bases = CNVcaller::string_from_32bases_pattern(i->first); + CNVcaller::get_sequence(reference,0,44605911,44605943,-1,-1,seq); + seq_str = std::string(seq); + + BOOST_REQUIRE_EQUAL(bases,seq_str); + +} BOOST_AUTO_TEST_CASE( CNV_caller_test_get_Nbases_pattern ) { @@ -313,30 +379,350 @@ BOOST_AUTO_TEST_CASE( CNV_caller_test_collect_and_test_read_multiple_read_multip reference.load("reference_vector"); openfpm::vector<CNVcaller::call_vcf> calls_to_test; - CNVcaller::read_CNV_hypothesys("test/test_multiple_CNV.vcf.gz",calls_to_test); + CNVcaller::read_CNV_hypothesys("test/test_multiple_CNV_8shift.vcf.gz",calls_to_test); tsl::hopscotch_map<size_t, openfpm::vector<CNVcaller::CNV_event_link>> map; All_samples_call_related_relevant_read_type seqs; // remeber is 0-based index, call id is 0, padding between call-start and seed is -1 int offset = 0; - add_32bases_pattern(2,112743114-32,reference,map,0,0,NO_PADDING); - add_32bases_pattern(2,112751315,reference,map,0,0,NO_PADDING); - add_32bases_pattern(2,195451990-32,reference,map,1,0,NO_PADDING); - add_32bases_pattern(3,64513-32,reference,map,2,0,NO_PADDING); + add_32bases_pattern(2,112743123-32,reference,map,0,0,NO_PADDING); + // padding from the start of the call to the end of the seed is 112751315-112743114+32 + add_32bases_pattern(2,112751322,reference,map,0,112751322-112743123+32,NO_PADDING); + CNVcaller::add_32bases_pattern_with_gap(2, // Chromosome 3 + calls_to_test.get(0).start-16, // position where to take the seed + reference, // reference + map, // hashmap where to add the seed + calls_to_test.get(0).start, // starting point of the gap + calls_to_test.get(0).stop, // end point of the gap + 0, // call id + 16, // padding of the end of the seed from the start of the call (without considering the gap) + NO_PADDING); + add_32bases_pattern(2,195451991-32,reference,map,1,0,NO_PADDING); + // call start at 645514 + add_32bases_pattern(3,64508-32,reference,map,2,-5,NO_PADDING); seqs.resize(3); - std::string test_bam("/nvme/bams/batch_1/DE78NGSUKBD134462_76322.bam"); + std::string test_bam("test/HG002_3calls_reads_cnv.bam"); collect_reads_for_call_evidence(test_bam,map,calls_to_test,seqs.get<0>(0),reference,6,0); - // Here we should have one read collected for call zero + // Call 0 has 3 reads paired + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).size(),3); + + // 4 reads for call zero + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(0).size(),4); + + // 4 read for call 0 + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(0).get<0>(0).size(),101); + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(0).get<0>(1).size(),99); + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(0).get<0>(2).size(),100); + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(0).get<0>(3).size(),101); + + // 1 reads for call 1 + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(1).size(),1); + + // 1 read for call 1 + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(1).get<0>(0).size(),101); + + // 0 reads for call 2 + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(2).size(),0); + + // we relax to 1 base missmatch + + seqs.resize(0); + seqs.resize(3); + + collect_reads_for_call_evidence(test_bam,map,calls_to_test,seqs.get<0>(0),reference,6,1); + + // Call 0 has 3 reads paired + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).size(),3); + + // 4 reads for call zero + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(0).size(),4); + + // 4 read for call 0 + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(0).get<0>(0).size(),101); + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(0).get<0>(1).size(),99); + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(0).get<0>(2).size(),100); + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(0).get<0>(3).size(),101); + + // 1 reads for call 1 + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(1).size(),3); + + // 1 read for call 1 + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(1).get<0>(0).size(),101); + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(1).get<0>(1).size(),101); + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(1).get<0>(2).size(),101); + + // 0 reads for call 2 + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(2).size(),1); + + // 0 read for call 2 + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(2).get<0>(0).size(),101); +} + +BOOST_AUTO_TEST_CASE( CNV_caller_tests_calculate_event_probability_between_to_sequences ) +{ + openfpm::vector<openfpm::vector<char>> reference; + reference.load("reference_vector"); + + openfpm::vector<CNVcaller::call_vcf> calls_to_test; + CNVcaller::read_CNV_hypothesys("test/test_multiple_CNV_8shift.vcf.gz",calls_to_test); + + // chromosome 3 position 112743032 + int chr = 2; + int pos = 112743032; + + auto call = calls_to_test.get(0); + + char seq1[16384]; + char seq2[] = "ATTGGTGACAGAGAGATGCAGGATCAAGAAGAGTATTGTTTTGTTTTGTTCTGATTTTAAGGATAGTTTACTAGTCAGGATTCCACCAGCCTGTAGGGGTG"; + + CNVcaller::get_sequence(reference,chr,pos,pos+sizeof(seq2)+(call.stop-call.start),call.start,call.stop,seq1); + + double probability = CNVcaller::calculate_event_probability_between_to_sequences(seq2,seq1,3e-3); + BOOST_REQUIRE_CLOSE(-0.30345541,probability,0.01); + + char seq4[] = "ATTGGTGACAGAGAGATGCAGGATCAAGAAGAGTATTGTTTTGTTTTGTTCTTATTTTAAGGATAGTTTACTAGTCAGGATTCCACCAGCCTGTAGGGGTG"; + probability = CNVcaller::calculate_event_probability_between_to_sequences(seq4,seq1,3e-3); + BOOST_REQUIRE_CLOSE(-6.10959389234,probability,0.01); + + std::cout << "Probability: " << probability << std::endl; +} + +BOOST_AUTO_TEST_CASE( CNV_caller_tests_calculate_event_probability ) +{ + openfpm::vector<openfpm::vector<char>> reference; + reference.load("reference_vector"); + + openfpm::vector<double> bp_insert_prob; + bp_insert_prob.load("insert_probability_distribution"); + + openfpm::vector<CNVcaller::call_vcf> calls_to_test; + CNVcaller::read_CNV_hypothesys("test/test_multiple_CNV_8shift.vcf.gz",calls_to_test); + + // chromosome 3 position 112743032 + int chr = 2; + int pos = 112743032; + + auto call = calls_to_test.get(0); + + char seq1[16384]; + char seq2[] = "ATTGGTGACAGAGAGATGCAGGATCAAGAAGAGTATTGTTTTGTTTTGTTCTGATTTTAAGGATAGTTTACTAGTCAGGATTCCACCAGCCTGTAGGGGTG"; + char seq3[] = "ATGCAGGATCAAGAAGAGTATTGTTTTGTTTTGTTCTGATTTTAAGGATAGTTTACTAGTCAGGATTCCACCAGCCTGTAGGGGTGGGTTGCCCCTACACA"; + + CNVcaller::get_sequence(reference,chr,pos,pos+sizeof(seq2)+(call.stop-call.start),call.start,call.stop,seq1); + + + openfpm::vector<int> chr_ins; + chr_ins.resize(23); + + for (int i = 0 ; i < chr_ins.size(); i++) { + chr_ins.get(i) = 210; + } + + double probability = CNVcaller::calculate_event_probability(seq2,seq1,seq3,-1,chr,pos,3e-3,chr_ins,reference,bp_insert_prob); + + BOOST_REQUIRE_CLOSE(probability,-13.923319055604425,0.001); +} + +BOOST_AUTO_TEST_CASE( CNV_caller_evidence_insert_size_for_sample ) +{ + openfpm::vector<int> bp_insert; + openfpm::vector<double> bp_insert_prob; + + char bam_file[] = "/nvme/bams/batch_1/DE78NGSUKBD134462_76322.bam"; + + samFile *fp_in = hts_open(bam_file,"r"); //open bam file + + if (fp_in == 0x0) { + std::cerr << __FILE__ << ":" << __LINE__ << " Error opening the file " << bam_file << std::endl; + return; + } + + size_t tot_pair = 0; + + bam_hdr_t *bamHdr = sam_hdr_read(fp_in); //read header + bam1_t *aln = bam_init1(); //initialize an alignment + + while(sam_read1(fp_in,bamHdr,aln) > 0){ + + if (aln->core.tid == -1) + {continue;} + + char *chr = bamHdr->target_name[aln->core.tid]; + + short int chr_int = CNVcaller::chr_to_chr_int(chr); + if (chr_int == -1) {continue;} + + if (aln->core.isize < 0) {continue;} + + if (abs(aln->core.isize) >= bp_insert.size() ) { + int prev_size = bp_insert.size(); + bp_insert.resize(abs(aln->core.isize)+1); + for (int i = prev_size ; i < bp_insert.size() ; i++) { + bp_insert.get(i) = 0; + } + } + bp_insert.get(abs(aln->core.isize)) += 1; + tot_pair++; + } + + bam_destroy1(aln); + sam_close(fp_in); + + bp_insert_prob.resize(bp_insert.size()); + + for (int i = 0 ; i < bp_insert.size() ; i++) { + if (bp_insert.get(i) == 0) { + bp_insert_prob.get(i) = 1.0 / tot_pair; + } else { + bp_insert_prob.get(i) = (double)bp_insert.get(i) / tot_pair; + } + } + + bp_insert_prob.save("insert_probability_distribution"); +} + +BOOST_AUTO_TEST_CASE( CNV_caller_evidence_probability_for_each_event ) +{ + openfpm::vector<openfpm::vector<char>> reference; + reference.load("reference_vector"); + + openfpm::vector<CNVcaller::call_vcf> calls_to_test; + CNVcaller::read_CNV_hypothesys("test/test_multiple_CNV_8shift.vcf.gz",calls_to_test); + tsl::hopscotch_map<size_t, openfpm::vector<CNVcaller::CNV_event_link>> map; + + All_samples_call_related_relevant_read_type seqs; + + // remeber is 0-based index, call id is 0, padding between call-start and seed is -1 + int offset = 0; + // seed 46... + size_t a = add_32bases_pattern(2,112743123-32,reference,map,0,0,NO_PADDING); + std::cout << "Seed call 0: " << CNVcaller::string_from_32bases_pattern(a) << std::endl; + + // seed 133... + // padding from the start of the call to the end of the seed is 112751315-112743114+32 + a = add_32bases_pattern(2,112751322,reference,map,0,112751322-112743123+32,NO_PADDING); + std::cout << "Seed call 0: " << CNVcaller::string_from_32bases_pattern(a) << std::endl; + + // seed + a = CNVcaller::add_32bases_pattern_with_gap(2, // Chromosome 3 + calls_to_test.get(0).start-16, // position where to take the seed + reference, // reference + map, // hashmap where to add the seed + calls_to_test.get(0).start, // starting point of the gap + calls_to_test.get(0).stop, // end point of the gap + 0, // call id + 16, // padding of the end of the seed from the start of the call (without considering the gap) + NO_PADDING); + std::cout << "Seed call 0: " << CNVcaller::string_from_32bases_pattern(a) << std::endl; + + // call id 1 + a = add_32bases_pattern(2,195451991-32,reference,map,1,0,NO_PADDING); + std::cout << "Seed call 1: " << CNVcaller::string_from_32bases_pattern(a) << std::endl; + a = add_32bases_pattern(2,195452447,reference,map,1,195452447-195451991+32,NO_PADDING); + std::cout << "Seed call 1: " << CNVcaller::string_from_32bases_pattern(a) << std::endl; + // call id 2 call start at 645514 + a = add_32bases_pattern(3,64508-32,reference,map,2,-5,NO_PADDING); + std::cout << "Seed call 2: " << CNVcaller::string_from_32bases_pattern(a) << std::endl; + + + seqs.resize(3); + + std::string test_bam("test/HG002_3calls_reads_cnv_sort.bam"); + collect_reads_for_call_evidence(test_bam,map,calls_to_test,seqs.get<0>(0),reference,6,3); + + // We check that all read are coupled, for all elements in the map + + for (int i = 0 ; i < seqs.size() ; i++) { + for (int j = 0 ; j < seqs.get<0>(i).size() ; j++) { + for (int k = 0 ; k < seqs.get<0>(i).get<0>(j).size() ; k++) { + auto & pair = seqs.get<0>(i).get<0>(j).get<0>(k); + BOOST_REQUIRE((pair.get<1>(3) == 'A' || pair.get<1>(3) == 'C' || pair.get<1>(3) == 'T' || pair.get<1>(3) == 'G')); + } + } + } + + openfpm::vector<CNVcaller::seed_id> seed; + convert_seed_map_to_array(seed,map,calls_to_test); + + // Find all reference points + openfpm::vector<openfpm::vector<aggregate<short int, int, int>>> un_seed; +// un_seed.resize(calls_to_test.size()); +// get_reference_point(seed,reference,un_seed); + un_seed.load("test/unseed_3calls_cnv"); + + + // Calculate evidence for each of the read pair + + int seed_pos = seqs.get<0>(0).get<0>(0).get<4>(0); + openfpm::vector<int> chr_ins; + chr_ins.resize(23); + + for (int i = 0 ; i < chr_ins.size(); i++) { + chr_ins.get(i) = 210; + } + + // Load insert size probability + + openfpm::vector<double> bp_insert_prob; + bp_insert_prob.load("insert_probability_distribution"); + + // Get the seed number associated to the call + int seed_num = 0; + + for (int i = 0 ; i < seqs.get<0>(0).get<0>(0).size() ; i++) { + + seed_pos = seqs.get<0>(0).get<0>(0).get<4>(i); + + Eigen::ArrayXd eml3 = Evidence_matrix_log(reference, + seqs.get<0>(0).get<0>(0).get<0>(i), + un_seed.get(0), + calls_to_test.get(0), + -6, + seqs.get<0>(0).get<0>(0).get<1>(i), + seqs.get<0>(0).get<0>(0).get<2>(i), + seed_pos, + chr_ins, + bp_insert_prob); + + BOOST_REQUIRE(eml3(59) > -20.0 ); + BOOST_REQUIRE(eml3(126) > -20.0); + } + + char seq[16384]; + CNVcaller::get_sequence(reference,2,195451991,195451991+100,-1,-1,seq); + + std::cout << seq << std::endl; + + // For rach read + for (int j = 0 ; j < seqs.get<0>(0).size() ; j++) { + for (int i = 0 ; i < seqs.get<0>(0).get<0>(0).size() ; i++) { + + seed_pos = seqs.get<0>(0).get<0>(j).get<4>(i); + + Eigen::ArrayXd eml3 = Evidence_matrix_log(reference, + seqs.get<0>(0).get<0>(j).get<0>(i), + un_seed.get(0), + calls_to_test.get(j), + -6, + seqs.get<0>(0).get<0>(j).get<1>(i), + seqs.get<0>(0).get<0>(j).get<2>(i), + seed_pos, + chr_ins, + bp_insert_prob); + + std::cout << "Probability: " << eml3 << std::endl; + + BOOST_REQUIRE(eml3(59) > -20.0 ); + BOOST_REQUIRE(eml3(126) > -20.0); + } + } - BOOST_REQUIRE_EQUAL(seqs.get<0>(0).size(),1); - // The read is not relevant for the reference - BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(0).size(),1); } BOOST_AUTO_TEST_SUITE_END()