diff --git a/example/Sequencing/CNVcaller/functions.hpp b/example/Sequencing/CNVcaller/functions.hpp index 59210a3c30bf0997830ec5055b7ba7060641c100..0de46a936d11b6843533d07117a03d9efd3a3267 100644 --- a/example/Sequencing/CNVcaller/functions.hpp +++ b/example/Sequencing/CNVcaller/functions.hpp @@ -21,6 +21,15 @@ constexpr int start_interval = 1; constexpr int stop_interval = 2; constexpr int Num_bases = 3; +// NO PADDING integer +constexpr size_t NO_PADDING=0x7FFFFFFF; + +typedef openfpm::vector<aggregate<char,char,char>> Pair_read_type; +typedef aggregate<Pair_read_type,short int, int, int, int> Relevant_read_type; +typedef openfpm::vector<Relevant_read_type> Call_related_relevant_read_type; +typedef openfpm::vector<aggregate<Call_related_relevant_read_type>> Sample_call_related_relevant_read_type; +typedef openfpm::vector<aggregate<Sample_call_related_relevant_read_type>> All_samples_call_related_relevant_read_type; + namespace CNVcaller { @@ -2145,8 +2154,8 @@ void read_CNV_hypothesys(const std::string & filename, openfpm::vector<call_vcf> struct call_vcf vcfc; vcfc.chromosome = chr_to_chr_int(bcf_hdr_id2name(header, record->rid)); - vcfc.start = record->pos-1; // Convert to 0 based index - vcfc.stop = record->pos + record->rlen-1; + vcfc.start = record->pos; // Convert to 0 based index + vcfc.stop = record->pos + record->rlen; vcfc.CN = 1; cnvs_to_test.add(vcfc); @@ -2156,8 +2165,8 @@ void read_CNV_hypothesys(const std::string & filename, openfpm::vector<call_vcf> struct call_vcf vcfc; vcfc.chromosome = chr_to_chr_int(bcf_hdr_id2name(header, record->rid)); - vcfc.start = record->pos - 1; // Convert to 0 based index - vcfc.stop = record->pos + record->rlen-1; + vcfc.start = record->pos; // Convert to 0 based index + vcfc.stop = record->pos + record->rlen; vcfc.CN = 3; cnvs_to_test.add(vcfc); @@ -2231,12 +2240,10 @@ char get_base_from_reference(openfpm::vector<openfpm::vector<char>> & reference, void collect_reads_for_call_evidence_forward(std::string & bam_file, tsl::hopscotch_map<size_t, openfpm::vector<CNV_event_link>> & map, openfpm::vector<call_vcf> & calls, - openfpm::vector<aggregate< // Calls - openfpm::vector<aggregate< // Sequences - openfpm::vector<aggregate<char,char>>,short int,int,int,int> - > - >> & seqs, - openfpm::vector<openfpm::vector<char>> & reference) { + Sample_call_related_relevant_read_type & seqs, + openfpm::vector<openfpm::vector<char>> & reference, + int padding, + int sub_len) { int n_reads = 0; @@ -2247,6 +2254,12 @@ void collect_reads_for_call_evidence_forward(std::string & bam_file, int actual_chr = -1; samFile *fp_in = hts_open(bam_file.c_str(),"r"); //open bam file + + if (fp_in == 0x0) { + std::cerr << __FILE__ << ":" << __LINE__ << " Error opening the file " << bam_file << std::endl; + return; + } + bam_hdr_t *bamHdr = sam_hdr_read(fp_in); //read header bam1_t *aln = bam_init1(); //initialize an alignment @@ -2282,28 +2295,7 @@ void collect_reads_for_call_evidence_forward(std::string & bam_file, bases_seq |= (base == 'T')?0x02:0x00; bases_seq |= (base == 'G')?0x03:0x00; -/* if (bases_seq == 0x3e79e6d9944cfe6f) { - int debug = 0; - debug++; - } - } - - bases_seq = 0; - int i = 0; - for(i=len-1; i>0 ; i--){ - bases_seq = bases_seq << 2; - char base = seq_nt16_str[bam_seqi(q,i)]; - bases_seq |= (base == 'A')?0x00:0x00; - bases_seq |= (base == 'C')?0x01:0x00; - bases_seq |= (base == 'T')?0x02:0x00; - bases_seq |= (base == 'G')?0x03:0x00; - - if (bases_seq == 0x3e79e6d9944cfe6f) { - int debug = 0; - debug++; - }*/ - - if (i >= 32) { + if (i >= 31) { auto fnd = map.find(bases_seq); if (fnd != map.end()) { @@ -2319,8 +2311,9 @@ void collect_reads_for_call_evidence_forward(std::string & bam_file, // Check the pattern if (fnd->second.get(v).path == 0) { // check the reference - int start = calls.get(cid).start + fnd->second.get(v).padding_left-i; - int seed_pos = i-32; + std::cout << calls.get(cid).start << " " << fnd->second.get(v).padding_left << " " << i << std::endl; + int start = calls.get(cid).start + fnd->second.get(v).padding_left-i-1; + int seed_pos = i-31; int call_distance_from_start = calls.get(cid).start - start; if (call_distance_from_start > len) {continue;} @@ -2335,75 +2328,97 @@ void collect_reads_for_call_evidence_forward(std::string & bam_file, for (int j = 0 ; j < len ; j++) { seq2[j] = seq_nt16_str[bam_seqi(q,j)]; seq1[j] = get_base_from_reference(reference,calls.get(cid).chromosome,start+j); - if (start + j < calls.get(cid).start) { - seq3[j] = get_base_from_reference(reference,calls.get(cid).chromosome,start+j); - } else { - seq3[j] = get_base_from_reference(reference,calls.get(cid).chromosome,calls.get(cid).stop+j-call_distance_from_start); - } } + seq1[len] = 0; seq2[len] = 0; - seq3[len] = 0; - - SequentialImplementation sw1(seq1,seq2,1,0,-1); - SequentialImplementation sw2(seq1,seq3,1,0,-1); + SequentialImplementation sw1(seq2,seq1,1,0,-1); sw1.runAlgorithm(); - sw2.runAlgorithm(); - - int best_sw1 = 0; - { - auto & results = sw1.getResults(); - - for(size_t j=0;j<results[0].size();j++){ - // std::cout << "Match " << j+1 << " [Score: " << results[0][j].score << ", Start: " << results[0][j].start << ", Stop: " << results[0][j].stop << "]" << endl; - // std::cout << " D: " << results[0][j].result_pair.d << endl; - // std::cout << " Q: " << results[0][j].result_pair.q << endl; - // std::cout << "SCORE: " << results[0][j].score << std::endl; - if (results[0][j].score > best_sw1) { - best_sw1 = results[0][j].score; + + for (int k = -padding ; k <= padding ; k++) { + for (int s = -padding ; s <= padding ; s++) { + + if (k == -1 && s == 0) { + int debug = 0; + debug++; } - } - } - int best_sw2 = 0; - { - auto & results = sw2.getResults(); - - for(size_t j=0;j<results[0].size();j++){ - // std::cout << "Match " << j+1 << " [Score: " << results[0][j].score << ", Start: " << results[0][j].start << ", Stop: " << results[0][j].stop << "]" << endl; - // std::cout << " D: " << results[0][j].result_pair.d << endl; - // std::cout << " Q: " << results[0][j].result_pair.q << endl; - // std::cout << "SCORE: " << results[0][j].score << std::endl; - if (results[0][j].score > best_sw2) { - best_sw2 = results[0][j].score; + int base_jump = calls.get(cid).start+k - start; + for (int j = 0 ; j < len ; j++) { + if (start + j < calls.get(cid).start+k) { + seq3[j] = get_base_from_reference(reference,calls.get(cid).chromosome,start+j); + } else { + seq3[j] = get_base_from_reference(reference,calls.get(cid).chromosome,calls.get(cid).stop+s+j-base_jump); + } } - } - } - if (best_sw1 > len-3 || best_sw2 > len-3) { - seqs.get<0>(cid).add(); - int last = seqs.get<0>(cid).size() - 1; - seqs.get<0>(cid).get<0>(last).resize(len); - for (int s = 0 ; s < len ; s++) { - seqs.get<0>(cid).get<0>(last).template get<0>(s) = seq2[s]; - } + seq3[len] = 0; - if (seqs.get<0>(cid).get<0>(last).size() < aln->core.l_qname) { - seqs.get<0>(cid).get<0>(last).resize(aln->core.l_qname); - } - for (int s = 0 ; s < aln->core.l_qname ; s++) { - seqs.get<0>(cid).get<0>(last).template get<1>(s) = read_name[s]; - } - seqs.get<0>(cid).get<0>(last).template get<1>(aln->core.l_qname) = 0; + SequentialImplementation sw2(seq2,seq3,1,0,-1); + sw2.runAlgorithm(); - seqs.get<0>(cid).get<1>(last) = chr_int; - seqs.get<0>(cid).get<2>(last) = start; - seqs.get<0>(cid).get<3>(last) = flag; - seqs.get<0>(cid).get<4>(last) = seed_pos; + int best_sw1 = 0; + { + auto & results = sw1.getResults(); + + for(size_t j=0;j<results[0].size();j++){ + // std::cout << "Match " << j+1 << " [Score: " << results[0][j].score << ", Start: " << results[0][j].start << ", Stop: " << results[0][j].stop << "]" << endl; + // std::cout << " D: " << results[0][j].result_pair.d << endl; + // std::cout << " Q: " << results[0][j].result_pair.q << endl; + // std::cout << "SCORE: " << results[0][j].score << std::endl; + if (results[0][j].score > best_sw1) { + best_sw1 = results[0][j].score; + } + } + } + + int best_sw2 = 0; + { + auto & results = sw2.getResults(); + + for(size_t j=0;j<results[0].size();j++){ + // std::cout << "Match " << j+1 << " [Score: " << results[0][j].score << ", Start: " << results[0][j].start << ", Stop: " << results[0][j].stop << "]" << endl; + // std::cout << " D: " << results[0][j].result_pair.d << endl; + // std::cout << " Q: " << results[0][j].result_pair.q << endl; + // std::cout << "SCORE: " << results[0][j].score << std::endl; + if (results[0][j].score > best_sw2) { + best_sw2 = results[0][j].score; + } + } + } - std::cout << "READ: " << read_name << " " << seq2 << " seed " << bases_seq << " pos " << seed_pos << std::endl; + if (best_sw1 >= len-sub_len || best_sw2 >= len-sub_len) { + seqs.get<0>(cid).add(); + int last = seqs.get<0>(cid).size() - 1; + seqs.get<0>(cid).get<0>(last).resize(len); + for (int s = 0 ; s < len ; s++) { + seqs.get<0>(cid).get<0>(last).template get<0>(s) = seq2[s]; + } + + if (seqs.get<0>(cid).get<0>(last).size() < aln->core.l_qname) { + seqs.get<0>(cid).get<0>(last).resize(aln->core.l_qname); + } + for (int s = 0 ; s < aln->core.l_qname ; s++) { + seqs.get<0>(cid).get<0>(last).template get<1>(s) = read_name[s]; + seqs.get<0>(cid).get<0>(last).template get<2>(s) = read_name[s]; + } + seqs.get<0>(cid).get<0>(last).template get<1>(aln->core.l_qname) = 0; + seqs.get<0>(cid).get<0>(last).template get<2>(aln->core.l_qname) = 0; + + seqs.get<0>(cid).get<1>(last) = chr_int; + seqs.get<0>(cid).get<2>(last) = start; + seqs.get<0>(cid).get<3>(last) = flag; + seqs.get<0>(cid).get<4>(last) = seed_pos; + + std::cout << "READ: " << read_name << " " << seq2 << " seed " << bases_seq << " pos " << seed_pos << std::endl; + + goto found_read; + } + } } + + found_read:; } } @@ -2420,15 +2435,17 @@ void collect_reads_for_call_evidence_forward(std::string & bam_file, } void collect_reads_for_call_evidence_reverse(std::string & bam_file, - openfpm::vector<aggregate< // Calls - openfpm::vector<aggregate< // Sequences - openfpm::vector<aggregate<char,char>>,short int,int,int,int> - > - >> & seqs) { + Sample_call_related_relevant_read_type & seqs) { int actual_chr = -1; samFile *fp_in = hts_open(bam_file.c_str(),"r"); //open bam file + + if (fp_in == 0x0) { + std::cerr << __FILE__ << ":" << __LINE__ << " Error opening the file " << bam_file << std::endl; + return; + } + bam_hdr_t *bamHdr = sam_hdr_read(fp_in); //read header bam1_t *aln = bam_init1(); //initialize an alignment char * read_name = (char *)&aln->data[0]; @@ -2498,11 +2515,6 @@ void collect_reads_for_call_evidence_reverse(std::string & bam_file, int j = it->second & 0xffffffff; int i = it->second >> 32; - // DEBUG - if (str == "A00684:261:HNJK2DRX2:1:2162:2880:9768") { - std::cout << "FOUND REVERSE: " << str << " i: " << i << " j:" << j << std::endl; - } - if (seqs.get<0>(i).get<3>(j) != flag) { if (seqs.get<0>(i).get<0>(j).size() < len) { int old_len = seqs.get<0>(i).get<0>(j).size(); @@ -2530,41 +2542,25 @@ void collect_reads_for_call_evidence_reverse(std::string & bam_file, void collect_reads_for_call_evidence(std::string & bam_file, tsl::hopscotch_map<size_t, openfpm::vector<CNV_event_link>> & map, openfpm::vector<call_vcf> & calls/**/, - openfpm::vector< - aggregate< // Calls - openfpm::vector< - aggregate< // Sequences - openfpm::vector< - aggregate<char,char> - >, - short int, - int, - int, - int - > - > - > - > & seqs, - openfpm::vector<openfpm::vector<char>> & reference) { + Sample_call_related_relevant_read_type & seqs, + openfpm::vector<openfpm::vector<char>> & reference, + int padding, + int sub_len = 3) { // now we check all the read if they have a matching patterns seqs.resize(calls.size()); - collect_reads_for_call_evidence_forward(bam_file,map,calls,seqs,reference); + collect_reads_for_call_evidence_forward(bam_file,map,calls,seqs,reference,padding,sub_len); collect_reads_for_call_evidence_reverse(bam_file,seqs); // std::cout << "COMPLETED " << bam_file << std::endl; } - - -void add_32bases_pattern(int chrom, - int start, - openfpm::vector<openfpm::vector<char>> & chr_sequences, - tsl::hopscotch_map<size_t, openfpm::vector<CNV_event_link>> & map, - int call_id, - int padding) { +int get_32bases_pattern(int chrom, + int start, + openfpm::vector<openfpm::vector<char>> & chr_sequences, + size_t & seed) { // Store the patterns related to the reference int chr = chrom; @@ -2577,7 +2573,7 @@ void add_32bases_pattern(int chrom, auto & seq = chr_sequences.get(chr); if (pos_rest != 0) { char base_bit = seq.get(pos) >> 4; - if (base_bit == 4) {return;} + if (base_bit == 4) {return -1;} bases_pattern |= base_bit; loop_size = 15; } @@ -2586,30 +2582,138 @@ void add_32bases_pattern(int chrom, for ( ; k < loop_size ; k++) { bases_pattern = bases_pattern << 2; char base_bit = seq.get(pos + k + (pos_rest != 0)) & 0x7; - if (base_bit == 4) {return;} + if (base_bit == 4) {return -1;} bases_pattern |= ((size_t)base_bit); base_bit = seq.get(pos + k + (pos_rest != 0)) >> 4; bases_pattern = bases_pattern << 2; - if (base_bit == 4) {return;} + if (base_bit == 4) {return -1;} bases_pattern |= ((size_t)base_bit); } if (pos_rest != 0) { bases_pattern = bases_pattern << 2; char base_bit = seq.get(pos + k + 1) & 0x7; - if (base_bit == 4) {return;} + if (base_bit == 4) {return -1;} bases_pattern |= ((size_t)base_bit); } + seed = bases_pattern; + return 0; +} + +int get_Nbases_pattern(int chrom, + int start, + openfpm::vector<openfpm::vector<char>> & chr_sequences, + int N, + size_t & seed) { + + // Store the patterns related to the reference + int chr = chrom; + int pos = start / 2; + int pos_rest = (start % 2)*2; + + size_t bases_pattern = seed; + + int n = 0; + int loop_size = std::ceil((float)N / 2); + auto & seq = chr_sequences.get(chr); + if (pos_rest != 0) { + bases_pattern = bases_pattern << 2; + char base_bit = seq.get(pos) >> 4; + if (base_bit == 4) {return -1;} + bases_pattern |= base_bit; + if (N % 2 == 1) {loop_size--;} + n++; + } + + int k = 0; + for ( ; k < loop_size && n < N ; k++ ) { + bases_pattern = bases_pattern << 2; + char base_bit = seq.get(pos + k + (pos_rest != 0)) & 0x7; + if (base_bit == 4) {return -1;} + bases_pattern |= ((size_t)base_bit); + n++; + if (n >= N) {break;} + base_bit = seq.get(pos + k + (pos_rest != 0)) >> 4; + bases_pattern = bases_pattern << 2; + if (base_bit == 4) {return -1;} + bases_pattern |= ((size_t)base_bit); + n++; + } + + seed = bases_pattern; + return 0; +} + +int get_32bases_pattern_with_gap(int chrom, + int start, + openfpm::vector<openfpm::vector<char>> & chr_sequences, + int start_gap, + int end_gap, + size_t & seed) { + + seed = 0; + + if (start_gap - start >= 32) { + get_32bases_pattern(chrom,start,chr_sequences,seed); + } + + get_Nbases_pattern(chrom,start,chr_sequences,start_gap-start,seed); + get_Nbases_pattern(chrom,end_gap,chr_sequences,32-(start_gap-start),seed); + + return 0; +} + +int add_32bases_pattern(int chrom, + int start, + openfpm::vector<openfpm::vector<char>> & chr_sequences, + tsl::hopscotch_map<size_t, openfpm::vector<CNV_event_link>> & map, + int call_id, + int padding_left, + int padding_right) { + + size_t bases_pattern; + int ret = get_32bases_pattern(chrom,start,chr_sequences,bases_pattern); + if (ret != 0) return -1; + openfpm::vector<CNV_event_link> & calls = map[bases_pattern]; struct CNV_event_link cel; cel.call_id = call_id; - cel.padding_left = padding; - cel.padding_right = padding; + cel.padding_left = padding_left; + cel.padding_right = padding_right; cel.path = 0x0; cel.pos_seed = start; calls.add(cel); + + return 0; +} + +int add_32bases_pattern_with_gap(int chrom, + int start, + openfpm::vector<openfpm::vector<char>> & chr_sequences, + tsl::hopscotch_map<size_t, openfpm::vector<CNV_event_link>> & map, + int start_gap, + int end_gap, + int call_id, + int padding_left, + int padding_right) { + + size_t bases_pattern; + int ret = get_32bases_pattern_with_gap(chrom,start,chr_sequences,start_gap,end_gap,bases_pattern); + if (ret != 0) return -1; + + openfpm::vector<CNV_event_link> & calls = map[bases_pattern]; + + struct CNV_event_link cel; + cel.call_id = call_id; + cel.padding_left = padding_left; + cel.padding_right = padding_right; + cel.path = 0x0; + cel.pos_seed = start; + calls.add(cel); + + return 0; } void get_sequence(openfpm::vector<openfpm::vector<char>> & reference, short int chr , int start, int stop, int p2, int p3, char * seq) { @@ -2789,11 +2893,6 @@ void get_reference_point(openfpm::vector<seed_id> & seed, } } -typedef openfpm::vector<aggregate<char,char,char>> Pair_read_type; -typedef aggregate<Pair_read_type,short int, int, int, int> Relevant_read_type; -typedef openfpm::vector<Relevant_read_type> Call_related_relevant_read_type; -typedef openfpm::vector<aggregate<Call_related_relevant_read_type>> Sample_call_related_relevant_read_type; -typedef openfpm::vector<aggregate<Sample_call_related_relevant_read_type>> All_samples_call_related_relevant_read_type; Eigen::ArrayXd Evidence_matrix_log(openfpm::vector<openfpm::vector<char>> & reference, const Pair_read_type & seq, @@ -2823,6 +2922,12 @@ Eigen::ArrayXd Evidence_matrix_log(openfpm::vector<openfpm::vector<char>> & refe //get_sequence(reference,chr,pos,pos+seq.size(),-1,-1,seq1); //events(0) = calculate_event_probability(seq2,seq1,seq3,-1,chr,3e-3,chr_ins); + std::cout << "Read name: "; + for (int i = 0 ; i < seq.size() ; i++) { + std::cout << seq.get<2>(i); + } + std::cout << std::endl; + std::cout << "Distance from seed: " << distance_from_seed << std::endl; // Event 2 break @@ -2831,7 +2936,7 @@ Eigen::ArrayXd Evidence_matrix_log(openfpm::vector<openfpm::vector<char>> & refe for (int j = -padding ; j <= padding ; j++) { int id = (i+padding)*(2*padding+1) + (j+padding); get_sequence(reference,chr,pos,pos+seq.size()+(call.stop+j-call.start-i),call.start+i,call.stop+j,seq1); - std::cout << "--------------- " << i << "," << j << " ---------------------------------" << std::endl; + std::cout << "--------------- " << i << "," << j << " --------------------------------- event: " << id << std::endl; events(id) = calculate_event_probability(seq2,seq1,seq3,seq.size(),call.stop-call.start,chr,pos,3e-3,chr_ins,reference); // Print @@ -2864,9 +2969,11 @@ Eigen::ArrayXd Evidence_matrix_log(openfpm::vector<openfpm::vector<char>> & refe for (int i = 0 ; i < un_seed.size() ; i++) { get_sequence(reference,un_seed.get<0>(i),un_seed.get<1>(i)-distance_from_seed+1,un_seed.get<1>(i)-distance_from_seed+1+seq.size(),-1,-1,seq1); int id = (2*padding+1)*(2*padding+1) + i; - std::cout << "------------------- un_seed: " << i << " -----------------------------" << std::endl; + std::cout << "------------------- un_seed: " << i << " ----------------------------- event: " << id << std::endl; events(id) = calculate_event_probability(seq2,seq1,seq3,seq.size(),-1,chr,un_seed.get<1>(i)-distance_from_seed+1,3e-3,chr_ins,reference); + std::cout << "SEED position: " << un_seed.get<1>(i) << std::endl; + // Print for (int j = 0 ; j < seq.size() ; j++) { @@ -2994,7 +3101,7 @@ void CalculateViterbi(openfpm::vector<openfpm::vector<char>> & reference, delta_log.resize(seqs.size()); Pre.resize(seqs.size()); - int s = permutation.get(0); + int s = permutation.get(0)/*673*/; int seed_pos = seqs.get<4>(s); delta_log.get(0).resize(Initial_log.size()); @@ -3014,6 +3121,7 @@ void CalculateViterbi(openfpm::vector<openfpm::vector<char>> & reference, for (int i = 1 ; i < seqs.size() ; i++) { int s = permutation.get(i); + seed_pos = seqs.get<4>(s); Eigen::VectorXd next_transition(Initial_log.size()); std::cout << "Initial Array: " << Initial_log.array() << std::endl; std::cout << "Event probs: " << delta_log.get(i-1) << std::endl; @@ -3123,7 +3231,7 @@ void CNV_event_test(openfpm::vector<call_vcf> & calls, add_32bases_pattern(calls.get(i).chromosome,calls.get(i).start-32+j-offset, - reference,map,i,-32+j-offset); + reference,map,i,-32+j-offset,-1); } // Store the patterns related to the stop @@ -3137,7 +3245,7 @@ void CNV_event_test(openfpm::vector<call_vcf> & calls, std::cout << "ADDING sequence at " << calls.get(i).chromosome << ":" << calls.get(i).stop+j << " SEQUENCE: " << seq << std::endl; add_32bases_pattern(calls.get(i).chromosome,calls.get(i).stop+j, - reference,map,i,j); + reference,map,i,j,-1); } } @@ -3154,7 +3262,9 @@ void CNV_event_test(openfpm::vector<call_vcf> & calls, std::cout << "Saving sequences" << std::endl; seqs.save("seqs_call_vector"); - std::cout << "Saved sequences" << std::endl;*/ + std::cout << "Saved sequences" << std::endl; + + return;*/ seqs.load("seqs_call_vector"); diff --git a/example/Sequencing/CNVcaller/testing.cpp b/example/Sequencing/CNVcaller/testing.cpp index 2382594dc006d4b70f363a5908f670d36bec70be..ffbe4a0ace52a21afcfa3a6975c44cb9a22d597d 100644 --- a/example/Sequencing/CNVcaller/testing.cpp +++ b/example/Sequencing/CNVcaller/testing.cpp @@ -4,6 +4,16 @@ BOOST_AUTO_TEST_SUITE( CNV_caller_test_suite ) +BOOST_AUTO_TEST_CASE( CNV_caller_test_read_CNV_hypothesys ) +{ + openfpm::vector<CNVcaller::call_vcf> calls_to_test; + CNVcaller::read_CNV_hypothesys("test/test_one_CNV.vcf.gz",calls_to_test); + + BOOST_REQUIRE_EQUAL(calls_to_test.get(0).chromosome,0); + BOOST_REQUIRE_EQUAL(calls_to_test.get(0).start,44605948); + BOOST_REQUIRE_EQUAL(calls_to_test.get(0).stop,44606030); +} + BOOST_AUTO_TEST_CASE( CNV_caller_test_add_32bases_pattern ) { openfpm::vector<openfpm::vector<char>> reference; @@ -12,23 +22,321 @@ BOOST_AUTO_TEST_CASE( CNV_caller_test_add_32bases_pattern ) reference.load("reference_vector"); // chromosome 0 - add_32bases_pattern(0,44605910,reference,map,-1,-2); + add_32bases_pattern(0,44605910,reference,map,0,-1,NO_PADDING); BOOST_REQUIRE_EQUAL(map.size(),1); auto i = map.begin(); BOOST_REQUIRE_EQUAL(i->first >> 62,0); BOOST_REQUIRE_EQUAL(i->first & 0x3,3); + BOOST_REQUIRE_EQUAL(i->second.size(),1); + BOOST_REQUIRE_EQUAL(i->second.get(0).padding_left, -1); + BOOST_REQUIRE_EQUAL(i->second.get(0).padding_right, NO_PADDING); map.clear(); - add_32bases_pattern(0,44605911,reference,map,-1,-2); + add_32bases_pattern(0,44605911,reference,map,1,-2,NO_PADDING); BOOST_REQUIRE_EQUAL(map.size(),1); i = map.begin(); BOOST_REQUIRE_EQUAL(i->first >> 62,3); BOOST_REQUIRE_EQUAL(i->first & 0x03,2); + + + + BOOST_REQUIRE_EQUAL(i->second.get(0).padding_left, -2); + BOOST_REQUIRE_EQUAL(i->second.get(0).padding_right, NO_PADDING); +} + + +BOOST_AUTO_TEST_CASE( CNV_caller_test_get_Nbases_pattern ) +{ + openfpm::vector<openfpm::vector<char>> reference; + tsl::hopscotch_map<size_t, openfpm::vector<CNVcaller::CNV_event_link>> map; + + reference.load("reference_vector"); + + size_t seed = 0; + + // 1 base + CNVcaller::get_Nbases_pattern(0,44605910,reference,1,seed); + BOOST_REQUIRE_EQUAL(seed,0); + + // 1 base + CNVcaller::get_Nbases_pattern(0,44605911,reference,1,seed); + BOOST_REQUIRE_EQUAL(seed,3); + + // 1 base + CNVcaller::get_Nbases_pattern(0,44605912,reference,1,seed); + BOOST_REQUIRE_EQUAL(seed,15); + + seed = 0; + // 1 base + CNVcaller::get_Nbases_pattern(0,44605911,reference,1,seed); + BOOST_REQUIRE_EQUAL(seed,3); + + // 1 base + CNVcaller::get_Nbases_pattern(0,44605911,reference,1,seed); + BOOST_REQUIRE_EQUAL(seed,15); + + seed = 0; + // 2 base + CNVcaller::get_Nbases_pattern(0,44605911,reference,2,seed); + BOOST_REQUIRE_EQUAL(seed,15); + + seed = 0; + // 3 base + CNVcaller::get_Nbases_pattern(0,44605910,reference,3,seed); + BOOST_REQUIRE_EQUAL(seed,15); + + seed = 0; + // 9 bases + CNVcaller::get_Nbases_pattern(0,44605911,reference,9,seed); + BOOST_REQUIRE_EQUAL(seed & 0x03,2); + + seed = 0; + // 8 bases + CNVcaller::get_Nbases_pattern(0,44605911,reference,8,seed); + BOOST_REQUIRE_EQUAL(seed & 0x03,3); +} + +BOOST_AUTO_TEST_CASE( CNV_caller_test_add_32bases_pattern_with_gap ) +{ + openfpm::vector<openfpm::vector<char>> reference; + tsl::hopscotch_map<size_t, openfpm::vector<CNVcaller::CNV_event_link>> map; + + reference.load("reference_vector"); + + // chromosome 0 + CNVcaller::add_32bases_pattern_with_gap(0,44605910,reference,map,44605920,44605920,0,-1,NO_PADDING); + + BOOST_REQUIRE_EQUAL(map.size(),1); + auto i = map.begin(); + BOOST_REQUIRE_EQUAL(i->first >> 62,0); + BOOST_REQUIRE_EQUAL(i->first & 0x3,3); + + BOOST_REQUIRE_EQUAL(i->second.size(),1); + + BOOST_REQUIRE_EQUAL(i->second.get(0).padding_left, -1); + BOOST_REQUIRE_EQUAL(i->second.get(0).padding_right, NO_PADDING); + + map.clear(); + + CNVcaller::add_32bases_pattern_with_gap(0,44605911,reference,map,44605920,44605930,1,-2,NO_PADDING); + + BOOST_REQUIRE_EQUAL(map.size(),1); + i = map.begin(); + BOOST_REQUIRE_EQUAL(i->first >> 62,3); + BOOST_REQUIRE_EQUAL(i->first & 0x03,0); + + BOOST_REQUIRE_EQUAL(i->second.get(0).padding_left, -2); + BOOST_REQUIRE_EQUAL(i->second.get(0).padding_right, NO_PADDING); +} + +BOOST_AUTO_TEST_CASE( CNV_caller_test_collect_reads_for_call_evidence ) +{ + openfpm::vector<openfpm::vector<char>> reference; + reference.load("reference_vector"); + + openfpm::vector<CNVcaller::call_vcf> calls_to_test; + CNVcaller::read_CNV_hypothesys("test/test_one_CNV.vcf.gz",calls_to_test); + tsl::hopscotch_map<size_t, openfpm::vector<CNVcaller::CNV_event_link>> map; + + All_samples_call_related_relevant_read_type seqs; + + // remeber is 0-based index, call id is 0 + add_32bases_pattern(0,44605798,reference,map,0,-2,NO_PADDING); + + seqs.resize(1); + + std::string test_bam("test/HG002_one_read.bam"); + collect_reads_for_call_evidence(test_bam,map,calls_to_test,seqs.get<0>(0),reference,6); + + // Despite the read follow the seed, it is irrelevant for the call + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(0).size(),0); + + // Reset + map.clear(); + seqs.clear(); + seqs.resize(1); + + // remeber is 0-based index, call id is 0 + add_32bases_pattern(0,44605948-32,reference,map,0,0,NO_PADDING); + + std::string test_bam2("test/HG002_one_read_cnv.bam"); + collect_reads_for_call_evidence(test_bam2,map,calls_to_test,seqs.get<0>(0),reference,6); + + // Despite the read follow the seed, it is irrelevant for the call + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(0).size(),1); + + char sequence[] = "CGTCGTCTGCTCTCCACAGAGGGTCTGGTTACTAGTGGAGGTCATCACTTGCTCTCCACAGAGGTTCTGGTTACCAGTGGAGGTCATCATCTGCCCCCAGT"; + + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(0).get<0>(0).size(),101); + + for (int i = 0 ; i < 100 ; i++) { + BOOST_REQUIRE_EQUAL(sequence[i],seqs.get<0>(0).get<0>(0).get<0>(0).get<0>(i)); + } + + // Check the reverse read is detected + + char sequence_rev[] = "GAGGTTGTCATCTGCTCTCCACAGAGGTTCTGGTTACCAGTGGAGGTCATCATCTGCCCCCAGTAGAGGGTCTGGTTACCAGTGGAGGTCATCATCTGCCC"; + + for (int i = 0 ; i < 100 ; i++) { + BOOST_REQUIRE_EQUAL(sequence_rev[i],seqs.get<0>(0).get<0>(0).get<0>(0).get<1>(i)); + } + + // Read name + char read_name[] = "A00684:261:HNJK2DRX2:2:2228:29044:4194"; + + for (int i = 0 ; i < sizeof(read_name) ; i++) { + BOOST_REQUIRE_EQUAL(read_name[i],seqs.get<0>(0).get<0>(0).get<0>(0).get<2>(i)); + } + + // Check chromosome 0 mean chr1 + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(0).get<1>(0),0); + // Check seed position + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(0).get<4>(0),2); + + // Property 2 contain the position of the read in case the event is matched + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(0).get<2>(0),44605914); + + // Reset + map.clear(); + seqs.clear(); + seqs.resize(1); + + // -1 is padding left (or start of the event) + add_32bases_pattern(0,44605947-32,reference,map,0,-1,NO_PADDING); + collect_reads_for_call_evidence(test_bam2,map,calls_to_test,seqs.get<0>(0),reference,6); + + // Check seed position + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(0).get<4>(0),1); + + // Property 2 contain the position of the read in case the event is matched + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(0).get<2>(0),44605914); +} + +BOOST_AUTO_TEST_CASE( CNV_caller_test_get_reference_points ) +{ + openfpm::vector<openfpm::vector<char>> reference; + reference.load("reference_vector"); + + openfpm::vector<CNVcaller::call_vcf> calls_to_test; + CNVcaller::read_CNV_hypothesys("test/test_one_CNV.vcf.gz",calls_to_test); + + openfpm::vector<openfpm::vector<aggregate<short int, int, int>>> un_seed; + un_seed.resize(calls_to_test.size()+1); + + tsl::hopscotch_map<size_t, openfpm::vector<CNVcaller::CNV_event_link>> map; + size_t bases_pattern; + int ret = CNVcaller::get_32bases_pattern(0,44605915,reference,bases_pattern); // create the seed number + + BOOST_REQUIRE_EQUAL(ret,0); + + CNVcaller::seed_pos sp; + sp.chr = 0; // chromosome 1 + sp.pos = 44605915; + sp.cid = 1; + + openfpm::vector<CNVcaller::seed_id> seeds; + CNVcaller::seed_id seed; + seed.seed = bases_pattern; + seed.calls.add(sp); + seeds.add(seed); + + // Add one seed for testing + CNVcaller::seed_id tmp; + + get_reference_point(seeds,reference,un_seed); + + // Check that across the suggested positions there is + + bool found = false; + + BOOST_REQUIRE_EQUAL(un_seed.size(),2); + + for (int i = 0 ; i < un_seed.get(1).size() ; i++) { + if (un_seed.get(1).get<0>(i) == 0 && + un_seed.get(1).get<1>(i) == 44605915 && + un_seed.get(1).get<2>(i) == 0) { + + // Found the original seed point + found = true; + + } else { + // Check that the seed point generate the same sequence + + for (int j = 0 ; j < 32 ; j++) { + char base = CNVcaller::get_base_from_reference(reference,0,44605915+j); + char base2 = CNVcaller::get_base_from_reference(reference,0,un_seed.get(1).get<1>(i)+j); + + BOOST_REQUIRE_EQUAL(base,base2); + } + } + } + + BOOST_REQUIRE_EQUAL(found,true); +} + +BOOST_AUTO_TEST_CASE( CNV_caller_test_collect_and_test_read ) +{ + openfpm::vector<openfpm::vector<char>> reference; + reference.load("reference_vector"); + + openfpm::vector<CNVcaller::call_vcf> calls_to_test; + CNVcaller::read_CNV_hypothesys("test/test_one_CNV.vcf.gz",calls_to_test); + tsl::hopscotch_map<size_t, openfpm::vector<CNVcaller::CNV_event_link>> map; + + All_samples_call_related_relevant_read_type seqs; + + // remeber is 0-based index, call id is 0, padding between call-start and seed is -1 + int offset = -1; + add_32bases_pattern(0,calls_to_test.get(0).start-32+offset,reference,map,0,offset,NO_PADDING); + + seqs.resize(1); + + std::string test_bam("test/HG002_one_read_cnv.bam"); + collect_reads_for_call_evidence(test_bam,map,calls_to_test,seqs.get<0>(0),reference,6,0); + + // Here we should have one read collected for call zero + + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).size(),1); + + // The read is not relevant for the reference + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(0).size(),1); +} + +BOOST_AUTO_TEST_CASE( CNV_caller_test_collect_and_test_read_multiple_read_multiple_calls ) +{ + openfpm::vector<openfpm::vector<char>> reference; + reference.load("reference_vector"); + + openfpm::vector<CNVcaller::call_vcf> calls_to_test; + CNVcaller::read_CNV_hypothesys("test/test_multiple_CNV.vcf.gz",calls_to_test); + tsl::hopscotch_map<size_t, openfpm::vector<CNVcaller::CNV_event_link>> map; + + All_samples_call_related_relevant_read_type seqs; + + // remeber is 0-based index, call id is 0, padding between call-start and seed is -1 + int offset = 0; + add_32bases_pattern(2,112743114-32,reference,map,0,0,NO_PADDING); + add_32bases_pattern(2,112751315,reference,map,0,0,NO_PADDING); + add_32bases_pattern(2,195451990-32,reference,map,1,0,NO_PADDING); + add_32bases_pattern(3,64513-32,reference,map,2,0,NO_PADDING); + + + seqs.resize(3); + + std::string test_bam("/nvme/bams/batch_1/DE78NGSUKBD134462_76322.bam"); + collect_reads_for_call_evidence(test_bam,map,calls_to_test,seqs.get<0>(0),reference,6,0); + + // Here we should have one read collected for call zero + + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).size(),1); + + // The read is not relevant for the reference + BOOST_REQUIRE_EQUAL(seqs.get<0>(0).get<0>(0).size(),1); } BOOST_AUTO_TEST_SUITE_END()