From 8de43785da1a8d909445a4cdd37f92f59fdd64d3 Mon Sep 17 00:00:00 2001 From: Incardona Pietro <incardon@mpi-cbg.de> Date: Sat, 18 Nov 2023 22:51:49 +0100 Subject: [PATCH] Fixing confidence on calling --- example/Sequencing/CNVcaller/Makefile | 6 +- example/Sequencing/CNVcaller/main.cu | 510 +-------------------- example/Sequencing/CNVcaller/stat_calls.py | 6 +- 3 files changed, 15 insertions(+), 507 deletions(-) diff --git a/example/Sequencing/CNVcaller/Makefile b/example/Sequencing/CNVcaller/Makefile index b772f48b8..41c562513 100644 --- a/example/Sequencing/CNVcaller/Makefile +++ b/example/Sequencing/CNVcaller/Makefile @@ -12,7 +12,7 @@ else CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) INCLUDE_PATH_NVCC= CUDA_OPTIONS=-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000 - LIBS_SELECT=$(LIBS) -lhts + LIBS_SELECT=$(LIBS) -lhts -lgsl else ifeq (, $(shell which nvcc)) CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) @@ -20,7 +20,7 @@ else else CUDA_CC=nvcc -ccbin=mpic++ endif - LIBS_SELECT=$(LIBS) -lhts + LIBS_SELECT=$(LIBS) -lhts -lgsl endif endif @@ -32,7 +32,7 @@ OBJ_DEPTH = main_depth.o all: %.o: %.cu - $(CUDA_CC) -O3 -g $(CUDA_OPTIONS) -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC) + $(CUDA_CC) -O0 -g $(CUDA_OPTIONS) -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC) %.o: %.cpp $(CC) -O0 $(OPT) -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH) diff --git a/example/Sequencing/CNVcaller/main.cu b/example/Sequencing/CNVcaller/main.cu index 083d4cfb0..c9b1404b7 100644 --- a/example/Sequencing/CNVcaller/main.cu +++ b/example/Sequencing/CNVcaller/main.cu @@ -20,326 +20,6 @@ int n_count_all = 0; #include <fstream> #include <random> -struct BreakPoint { - short int chr; - int pos; - int count; - int depth = 0; - - BreakPoint() {} - - BreakPoint(const short int & chr, const int & pos, const int & count) - :chr(chr),pos(pos),count(count) - {} - - bool operator<(const BreakPoint & b) const { - if (chr < b.chr) return true; - if (pos < b.pos) return true; - - return false; - } -}; - - -void break_intervals(openfpm::vector<aggregate<short int,int,int>> & intervals, openfpm::vector<aggregate<short int, int, int, int>> & breaks) { - int j = 0; - - if (breaks.size() == 0) {return;} - - openfpm::vector<openfpm::vector<aggregate<short int,int,int>>> intervals_tmp; - intervals_tmp.resize(intervals.size()); - - for (int i = 0 ; i < intervals.size() ; i++) { - intervals_tmp.get(i).add(intervals.get(i)); - } - - for (int i = 0 ; i < intervals.size() ; ) { - - short int chrom_b = breaks.template get<0>(j); - int start_b = breaks.template get<1>(j); - - short int chrom_i = intervals.template get<0>(i); - int start_i = intervals.template get<1>(i); - int stop_i = intervals.template get<2>(i); - - if (chrom_b > chrom_i) { - i++; - continue; - } - - if (start_b >= stop_i) { - i++; - continue; - } - - if (start_b <= start_i) { - j++; - continue; - } - - // if the breakpoint is 50 bases near to the start or stop of the interval skip it - if (fabs(start_i - start_b) <= 50) {j++ ; continue;} - if (fabs(start_b - stop_i) <= 50) {j++; continue;} - - int last = intervals_tmp.get(i).size() - 1; - intervals_tmp.get(i).remove(last); - intervals_tmp.get(i).add(); - intervals_tmp.get(i).get<0>(last) = chrom_b; - intervals_tmp.get(i).get<1>(last) = start_i; - intervals_tmp.get(i).get<2>(last) = start_b; - intervals_tmp.get(i).add(); - intervals_tmp.get(i).get<0>(last+1) = chrom_b; - intervals_tmp.get(i).get<1>(last+1) = start_b; - intervals_tmp.get(i).get<2>(last+1) = stop_i; - j++; - } - - - intervals.clear(); - - for (int i = 0 ; i < intervals_tmp.size() ; i++) { - for (int j = 0 ; j < intervals_tmp.get(i).size() ; j++ ) { - intervals.add(intervals_tmp.get(i).get(j)); - } - } -} - - -void find_breakpoint(openfpm::vector<BreakPoint> & brk, short int chrom_t, int start_t , int stop_t, int & b1, int & b2) { - int i = 0; - int end = brk.size() - 1; - - while (end != i) { - int mid = (end - i) / 2 + i; - short int mid_chromosome = brk.get(mid).chr; - int mid_pos = brk.get(mid).pos; - - if (chrom_t < mid_chromosome) {end = mid; continue;} - if (chrom_t > mid_chromosome) {i = mid; continue;} - - if (start_t > mid_pos) {i = mid+1; continue;} - if (start_t < mid_pos) {end = mid; continue;} - - break; - } - - b1 = i; - - i = 0; - end = brk.size() - 1; - - while (end != i) { - int mid = (end - i) / 2 + i; - short int mid_chromosome = brk.get(mid).chr; - int mid_pos = brk.get(mid).pos; - - if (chrom_t < mid_chromosome) {end = mid; continue;} - if (chrom_t > mid_chromosome) {i = mid; continue;} - - if (stop_t > mid_pos) {i = mid + 1; continue;} - if (stop_t < mid_pos) {end = mid; continue;} - - break; - } - - b2 = i; -} - -template <typename T, typename U> -struct PairHash { - std::size_t operator () (const std::pair<T, U> &p) const { - // Combine the two integers into a single hash value - std::size_t h1 = std::hash<T>{}(p.first); - std::size_t h2 = std::hash<U>{}(p.second); - return h1 ^ h2; // You can choose a different combining function if needed - } -}; - -void read_break_points(std::string & bam_file, openfpm::vector<aggregate<short int,int,int, int>> & break_points_out) { - - std::unordered_map<std::pair<short int, int>, int, PairHash<short,int>> break_points; - openfpm::vector<BreakPoint> brk; - - // Read the presence of break points - - { - samFile *fp_in = hts_open(bam_file.c_str(),"r"); //open bam file - bam_hdr_t *bamHdr = sam_hdr_read(fp_in); //read header - bam1_t *aln = bam_init1(); //initialize an alignment - - while(sam_read1(fp_in,bamHdr,aln) > 0){ - - int32_t pos = aln->core.pos +1; //left most position of alignment in zero based coordianate (+1) - if (aln->core.tid == -1) - {continue;} - char *chr = bamHdr->target_name[aln->core.tid] ; //contig name (chromosome) - uint32_t len = aln->core.l_qseq; //length of the read. - int n_cigar = aln->core.n_cigar; // number of cigar operations - uint32_t * cigar = (uint32_t *)(aln->data + aln->core.l_qname); - - short int chr_int = chr_to_chr_int(chr); - if (chr_int == -1) {continue;} - - for (int k = 0 ; k < n_cigar ; k++) { - int op = bam_cigar_opchr(bam_cigar_op(cigar[k])); - int len = bam_cigar_oplen(cigar[k]); - if (op == 'S') { - break_points[std::make_pair(chr_int, pos)] += 1; - pos += len; - } - else { - pos += len; - } - } - } - - bam_destroy1(aln); - sam_close(fp_in); - } - - for (auto it = break_points.begin() ; it != break_points.end() ; ++it) { - if ((*it).second < 3) {continue;} - brk.add(BreakPoint((*it).first.first,(*it).first.second,(*it).second)); - } - - brk.sort(); - - { - // Calculate the depth on the break point - - samFile *fp_in = hts_open(bam_file.c_str(),"r"); //open bam file - bam_hdr_t *bamHdr = sam_hdr_read(fp_in); //read header - bam1_t *aln = bam_init1(); //initialize an alignment - - while(sam_read1(fp_in,bamHdr,aln) > 0){ - - int32_t pos = aln->core.pos +1; //left most position of alignment in zero based coordianate (+1) - if (aln->core.tid == -1) - {continue;} - char *chr = bamHdr->target_name[aln->core.tid] ; //contig name (chromosome) - uint32_t len = aln->core.l_qseq; //length of the read. - int n_cigar = aln->core.n_cigar; // number of cigar operations - uint32_t * cigar = (uint32_t *)(aln->data + aln->core.l_qname); - - short int chr_int = chr_to_chr_int(chr); - if (chr_int == -1) {continue;} - - // Fill depth - for (int k = 0 ; k < n_cigar ; k++) { - int op = bam_cigar_opchr(bam_cigar_op(cigar[k])); - int len = bam_cigar_oplen(cigar[k]); - if (op == 'M') { - int b1 = 0; - int b2 = 0; - find_breakpoint(brk,chr_int,pos,pos+len,b1,b2); - for (int i = b1 ; i < b2 ; i++) { - brk.get(i).depth++; - } - } - else { - pos += len; - } - } - } - - bam_destroy1(aln); - sam_close(fp_in); - } - - - // valid break are all points - - // Copy to the output - - break_points_out.clear(); - break_points_out.add(); - break_points_out.last().get<0>() = brk.get(0).chr; - break_points_out.last().get<1>() = brk.get(0).pos; - break_points_out.last().get<2>() = brk.get(0).count; - - for (int i = 0 ; i < brk.size() ; ) { - short int chr; - int pos_sum = 0; - int count_sum = 0; - int depth_sum = 0; - int k; - for (k = i ; k < brk.size() && fabs(break_points_out.last().get<1>() - brk.get(k).pos) < 100 ; k++) { - chr = brk.get(k).chr; - pos_sum += brk.get(k).pos; - count_sum += brk.get(k).count; - depth_sum += brk.get(k).depth; - } - - int pos = std::round(pos_sum / (k - i)); - int depth = std::round(depth_sum / (k - i)); - break_points_out.last().get<0>() = chr; - break_points_out.last().get<1>() = pos; - break_points_out.last().get<2>() = count_sum; - break_points_out.last().get<3>() = depth_sum; - - if (k < brk.size()) { - break_points_out.add(); - break_points_out.last().get<0>() = brk.get(k).chr; - break_points_out.last().get<1>() = brk.get(k).pos; - break_points_out.last().get<2>() = brk.get(k).count; - break_points_out.last().get<3>() = brk.get(k).depth; - } - i = k; - } - - openfpm::vector<int> id_to_remove; - // Filter breakpoints by depth - for (int i = 0 ; i < break_points_out.size() ; ++i) { - if (break_points_out.get<3>(i)*0.1 >= 3 && break_points_out.get<3>(i)*0.1 > break_points_out.get<2>(i)) { - id_to_remove.add(i); - } - } - break_points_out.remove(id_to_remove); -} - - -void read_bed_file(std::string bed_f, openfpm::vector<aggregate<short int,int,int>> & intervals,int pad) { - std::ifstream bedFile(bed_f.c_str()); - - if (!bedFile.is_open()) { - std::cerr << "Error: Unable to open the BED file." << std::endl; - return; - } - - std::string line; - - char str[256]; - - while (std::getline(bedFile, line)) { - std::istringstream iss(line); - - iss >> str; - int chr_int = chr_to_chr_int(str); - if (chr_int == -1) {continue;} - - intervals.add(); - intervals.last().get<0>() = chr_int; - iss >> intervals.last().get<1>(); - iss >> intervals.last().get<2>(); - } - - bedFile.close(); -} - - -template<typename T> T penalization_fishing(T & a, T left, T right) { - T min_distance = std::min(fabs(a-left)*1000,fabs(a-right)*1000); - - return min_distance; -} - -std::string basename(const std::string &path) { - size_t found = path.find_last_of("/"); - if (found != std::string::npos) { - return path.substr(found + 1); - } - return path; -} int main(int argc, char* argv[]) { @@ -352,23 +32,13 @@ int main(int argc, char* argv[]) openfpm::vector<aggregate<short int,int, int>> break_point; openfpm::vector<aggregate<short int,int, int>> intervals; - openfpm::vector<aggregate<short int,int, int, int>> read_splits; + openfpm::vector<openfpm::vector<aggregate<short int,int, int, int>>> read_splits; // Read bed file for interval + std::cout << "READING: Bed file" << std::endl; read_bed_file("/nvme/bams/Twist_Exome_Core_RefSeq_Targets_b37_chr.bed",intervals,0); - // Calculate the number of bases in the interval - - int n_bases = 0; - for (int i = 0 ; i < intervals.size() ; i++) { - n_bases += intervals.get<2>(i) - intervals.get<1>(i); - } - - openfpm::vector<int> chr_counts; - openfpm::vector<int> counts; - int N_chr = 24; - openfpm::vector<std::string> files_bams; files_bams.add("/nvme/bams/DE14NGSUKBD138718_97851_cut.bam"); @@ -384,184 +54,22 @@ int main(int argc, char* argv[]) files_bams.add("/nvme/bams/batch_1/DE12NGSUKBD138172_95188_cut.bam"); int N_samples = files_bams.size(); - chr_counts.resize(N_chr*N_samples); // Read break points + read_splits.resize(N_samples); for (int i = 0 ; i < files_bams.size() ; i++) { - read_break_points(files_bams.get(i),read_splits); - break_intervals(intervals,read_splits); - read_splits.clear(); - } - - counts.resize(N_samples*intervals.size()); - - for (int i = 0 ; i < files_bams.size() ; i++) { - read_sample(files_bams.get(i),counts,chr_counts,intervals,i,N_samples); - } - - // count by depth - int count = 0; - for (int j = 0 ; j < intervals.size() ; j++) { - for (int i = 0 ; i < N_samples ; i++) { - counts.get(N_samples*j + i) /= intervals.get<2>(j) - intervals.get<1>(j); - } + read_break_points(files_bams.get(i),read_splits.get(i)); + break_intervals(intervals,read_splits.get(i)); } constexpr int N_int_opt = 1; constexpr int N_sample_opt = 10; - constexpr int dim = N_int_opt + N_sample_opt; - Box<dim,double> domain; - - for (size_t i = 0 ; i < N_int_opt ; i++) - { - domain.setLow(i,0.0); - domain.setHigh(i,1.0); - } - - for (size_t i = N_int_opt ; i < N_int_opt + N_sample_opt ; i++) - { - domain.setLow(i,0.0); - domain.setHigh(i,6.0); - } - - // Divide chromosome to get max_count - double factor = (double)1.0 / (double)n_bases; - - for (int j = 0 ; j < N_sample_opt ; j++) { - chr_counts.get(chromosome*N_samples+j)*=factor; - } - - auto & v_cl = create_vcluster(); - if (v_cl.rank() == 0) { - - for (int j = 0 ; j < N_chr ; j++) { - std::cout << "chr " << j << " "; - for (int i = 0 ; i < N_samples ; i++) { - std::cout << " " << chr_counts.get(N_samples*j + i); - } - std::cout << std::endl; - } - } - - openfpm::vector<aggregate<float[16]>> solutions; - openfpm::vector<aggregate<float[16]>> confidence; - solutions.resize(intervals.size()); - confidence.resize(intervals.size()); - - int n_interval_to_process = 0; - for (int i = 0 ; i < intervals.size() ; i++) { - bool is_zero = true; - for (int j = 0 ; j < N_samples; j++) { - if (counts.get(N_samples*i + j) != 0) { - is_zero = false; - } - } - if (is_zero == true) {continue;} - n_interval_to_process++; - } - - std::ofstream fout; - - if (v_cl.rank() == 0) { - // Open a TSV file for writing - fout.open("calls.tsv"); - if (!fout.is_open()) { - std::cerr << "Failed to open the file for writing.\n"; - return 1; - } - fout << "chromosome" << '\t' << "start" << '\t' << "stop"; - for (int j = 0 ; j < N_sample_opt ; j++) { - fout << '\t' << basename(files_bams.get(j)); - } - fout << '\t' << "value" << '\t' << "CALL" << std::endl; - } - - int interval_processed = 0; - for (int i = 0 ; i < intervals.size() ; i++) { - float reg = 2000; - int selected_interval = i; - double fishing_left = 0.5; - double fishing_right = 0.5; - auto lamb_eval = [&](Eigen::VectorXd & vars){ - return model_function<true,dim,N_int_opt,N_sample_opt>(selected_interval,vars,domain,chr_counts,counts,chromosome,reg); - }; - - auto lamb = [&](Eigen::VectorXd & vars){ - return model_function<false,dim,N_int_opt,N_sample_opt>(selected_interval,vars,domain,chr_counts,counts,chromosome,reg); - }; - - // Check there is data - - bool is_zero = true; - for (int j = 0 ; j < N_samples; j++) { - if (counts.get(N_samples*i + j) != 0) { - is_zero = false; - } - } - if (is_zero == true) {continue;} - - openfpm::vector<double> sol; - if (v_cl.rank() == 0) { - std::cout << "Processing interval: " << interval_processed << "/" << n_interval_to_process << std::endl; - std::cout << "Chromosome: " << intervals.get<0>(i)+1 << " start: " << intervals.get<1>(i) << " stop: " << intervals.get<2>(i) << std::endl; - } - optimize(domain,0.0,sol,lamb); - if (v_cl.rank() == 0) {std::cout << std::endl;} - interval_processed++; - bool is_call = false; - fout << intervals.get<0>(i)+1 << '\t' << intervals.get<1>(i) << '\t' << intervals.get<2>(i); - for (int k = 0 ; k < sol.size() ; k++) { - solutions.get<0>(i)[k] = sol.get(k); - if (k != 0 && std::round(solutions.get<0>(i)[k]) != 2) { - is_call = true; - } - if (v_cl.rank() == 0) - {fout << '\t' << solutions.get<0>(i)[k];} - } - Eigen::VectorXd vars; - vars.resize(sol.size()); - for (int s = 0 ; s < sol.size() ; s++) { - vars(s) = sol.get(s); - } - - // calculate confidence - if (v_cl.rank() == 0) { - std::cout << "CONFIDENCE: "; - for (int i = N_int_opt ; i < N_int_opt + N_sample_opt ; i++) { - confidence.get<0>(selected_interval)[i] = model_function_confidence<N_int_opt, N_sample_opt>(selected_interval,i-N_int_opt,vars,chr_counts,counts,chromosome); - std::cout << confidence.get<0>(selected_interval)[i] << " "; - } - std::cout << std::endl; - } - - if (v_cl.rank() == 0) - {fout << '\t' << lamb(vars);} - if (v_cl.rank() == 0) { - if (is_call == true) { - std::cout << "----------------------------------------------------------------------------------------" << std::endl; - std::cout << "-----------------------------------CALL-------------------------------------------------" << std::endl; - std::cout << "----------------------------------------------------------------------------------------" << std::endl; - fout << '\t' << "CALL" << std::endl; - } - else { - fout << '\t' << "." << std::endl; - } - for (int j = 0 ; j < N_sample_opt ; j++) { - std::cout << basename(files_bams.get(j)) << " " << solutions.get<0>(i)[1+j] << " "; - } - } - fout << std::flush; - } + openfpm::vector<aggregate<float[N_sample_opt+2]>> solutions; + openfpm::vector<aggregate<float[N_sample_opt]>> confidence; - // For each solutions -/* for (int i = 0 ; i < intervals.size() ; i++) { - std::cout << "Chromosome: " << intervals.get<0>(i) << " start: " << intervals.get<1>(i) << " stop: " << intervals.get<2>(i) << " bait efficency: " << solutions.get<0>(i)[0] << " CNs "; - for (int j = 0 ; j < N_sample_opt ; j++) { - std::cout << basename(files_bams.get(j)) << " " << solutions.get<0>(i)[1+j] << " "; - } - std::cout << std::endl; - }*/ + call_single_interval_mean_depth<N_int_opt,N_sample_opt>(intervals,files_bams,solutions,confidence); + //call_single_interval_mean_depth_local<N_int_opt,N_sample_opt>(intervals,files_bams,solutions,confidence) openfpm_finalize(); } diff --git a/example/Sequencing/CNVcaller/stat_calls.py b/example/Sequencing/CNVcaller/stat_calls.py index 5ea9a4737..7eaf31390 100644 --- a/example/Sequencing/CNVcaller/stat_calls.py +++ b/example/Sequencing/CNVcaller/stat_calls.py @@ -1,7 +1,7 @@ -import np as np +import numpy as np import matplotlib.pyplot as plt -f = open("calls_material_batch2.tsv","r") +f = open("calls.tsv","r") # read the first line line = f.readline() @@ -41,4 +41,4 @@ plt.ylabel('Frequency') # Show the plot plt.show() -print(samples) \ No newline at end of file +print(samples) -- GitLab