diff --git a/example/Sequencing/CNVcaller/Makefile b/example/Sequencing/CNVcaller/Makefile index 4a8f70b699eb3860dce2731ef9dff0fb47e7ed3e..b772f48b8ed1ed42b183244c6a736a9cb128a44d 100644 --- a/example/Sequencing/CNVcaller/Makefile +++ b/example/Sequencing/CNVcaller/Makefile @@ -1,4 +1,4 @@ -include ../../example.mk +include example.mk CUDA_CC= CC=mpic++ @@ -26,8 +26,10 @@ endif OBJ = main.o +OBJ_CN2 = main_CN2.o +OBJ_DEPTH = main_depth.o -CNV_caller: +all: %.o: %.cu $(CUDA_CC) -O3 -g $(CUDA_OPTIONS) -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC) @@ -38,7 +40,13 @@ CNV_caller: CNV_caller: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT) -all: CNV_caller +CNV_caller_CN2: $(OBJ_CN2) + $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT) + +CNV_caller_depth: $(OBJ_DEPTH) + $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT) + +all: CNV_caller CNV_caller_CN2 CNV_caller_depth run: CNV_caller mpirun --oversubscribe -np 2 ./CNV_caller diff --git a/example/Sequencing/CNVcaller/main.cu b/example/Sequencing/CNVcaller/main.cu index df2e1d2f20251970f3f60151139237caf807ef13..083d4cfb0e2171cdce4e53e7ce019d862d256c76 100644 --- a/example/Sequencing/CNVcaller/main.cu +++ b/example/Sequencing/CNVcaller/main.cu @@ -16,11 +16,9 @@ int n_count_all = 0; #include "VCluster/VCluster.hpp" #include "optimizer.hpp" #include <unordered_map> - -constexpr int chromosome = 0; -constexpr int start_interval = 1; -constexpr int stop_interval = 2; -constexpr int Num_bases = 3; +#include "functions.hpp" +#include <fstream> +#include <random> struct BreakPoint { short int chr; @@ -28,6 +26,8 @@ struct BreakPoint { int count; int depth = 0; + BreakPoint() {} + BreakPoint(const short int & chr, const int & pos, const int & count) :chr(chr),pos(pos),count(count) {} @@ -40,7 +40,8 @@ struct BreakPoint { } }; -void break_intervals(openfpm::vector<aggregate<short int,int,int>> & intervals, openfpm::vector<aggregate<short int, int, int>> & breaks) { + +void break_intervals(openfpm::vector<aggregate<short int,int,int>> & intervals, openfpm::vector<aggregate<short int, int, int, int>> & breaks) { int j = 0; if (breaks.size() == 0) {return;} @@ -103,81 +104,45 @@ void break_intervals(openfpm::vector<aggregate<short int,int,int>> & intervals, } } -void find_interval(openfpm::vector<aggregate<short int,int,int>> & intervals, short int chrom_t, int start_t , int & interval) { - if (interval != -1 && - intervals.get<chromosome>(interval) == chrom_t && - start_t < intervals.get<stop_interval>(interval) && - start_t >= intervals.get<start_interval>(interval) ) - {return;} - - interval = 0; - int end = intervals.size() - 1; - - while (end != interval) { - int mid; - if (end - interval == 1) {mid = end; interval = mid;} - else - {mid = (end - interval) / 2 + interval;} - short int mid_chromosome = intervals.get<chromosome>(mid); - int mid_start = intervals.get<start_interval>(mid); - int mid_stop = intervals.get<stop_interval>(mid); - - short int i_chromosome = intervals.get<chromosome>(interval); - int i_start = intervals.get<start_interval>(interval); - int i_stop = intervals.get<stop_interval>(interval); - - if (chrom_t < mid_chromosome) {end = mid; continue;} - if (chrom_t > mid_chromosome) {interval = mid; continue;} - - if (start_t < mid_start) {end = mid; continue;} - if (start_t >= mid_stop) {interval = mid; continue;} - - interval = mid; - return; - } - - interval = -1; - return; -} -int find_breakpoint(openfpm::vector<BreakPoint> & brk, short int chrom_t, int start_t , int stop_t) { +void find_breakpoint(openfpm::vector<BreakPoint> & brk, short int chrom_t, int start_t , int stop_t, int & b1, int & b2) { int i = 0; int end = brk.size() - 1; while (end != i) { - int mid; - if (end - i == 1) {mid = end; i = mid;} - else - {mid = (end - i) / 2 + i;} + int mid = (end - i) / 2 + i; short int mid_chromosome = brk.get(mid).chr; int mid_pos = brk.get(mid).pos; if (chrom_t < mid_chromosome) {end = mid; continue;} if (chrom_t > mid_chromosome) {i = mid; continue;} + if (start_t > mid_pos) {i = mid+1; continue;} if (start_t < mid_pos) {end = mid; continue;} - if (stop_t >= mid_pos) {i = mid; continue;} - i = mid; - return i; + break; } - i = -1; - return i; -} + b1 = i; + + i = 0; + end = brk.size() - 1; + + while (end != i) { + int mid = (end - i) / 2 + i; + short int mid_chromosome = brk.get(mid).chr; + int mid_pos = brk.get(mid).pos; + + if (chrom_t < mid_chromosome) {end = mid; continue;} + if (chrom_t > mid_chromosome) {i = mid; continue;} -void add_to_interval(openfpm::vector<int> & base_count, - openfpm::vector<aggregate<short int,int,int>> & intervals, - int interval, - int sample, - int N_samples, - int pos, - int num) { + if (stop_t > mid_pos) {i = mid + 1; continue;} + if (stop_t < mid_pos) {end = mid; continue;} - int overlap_start = std::max(pos,intervals.template get<start_interval>(interval)); - int overlap_stop = std::min(pos+num,intervals.template get<stop_interval>(interval)); + break; + } - base_count.get<0>(N_samples*interval + sample) += std::max(overlap_stop - overlap_start,0); + b2 = i; } template <typename T, typename U> @@ -190,7 +155,7 @@ struct PairHash { } }; -void read_break_points(std::string & bam_file, openfpm::vector<aggregate<short int,int,int>> & break_points_out) { +void read_break_points(std::string & bam_file, openfpm::vector<aggregate<short int,int,int, int>> & break_points_out) { std::unordered_map<std::pair<short int, int>, int, PairHash<short,int>> break_points; openfpm::vector<BreakPoint> brk; @@ -212,16 +177,8 @@ void read_break_points(std::string & bam_file, openfpm::vector<aggregate<short i int n_cigar = aln->core.n_cigar; // number of cigar operations uint32_t * cigar = (uint32_t *)(aln->data + aln->core.l_qname); - short int chr_int; - if (chr[0] == 'X') { - chr_int = 22; - } else if (chr[0] == 'Y') { - chr_int = 23; - } else { - chr_int = (short int)atoi(chr); - if (chr_int == 0) {continue;} - chr_int -= 1; - } + short int chr_int = chr_to_chr_int(chr); + if (chr_int == -1) {continue;} for (int k = 0 ; k < n_cigar ; k++) { int op = bam_cigar_opchr(bam_cigar_op(cigar[k])); @@ -264,24 +221,20 @@ void read_break_points(std::string & bam_file, openfpm::vector<aggregate<short i int n_cigar = aln->core.n_cigar; // number of cigar operations uint32_t * cigar = (uint32_t *)(aln->data + aln->core.l_qname); - short int chr_int; - if (chr[0] == 'X') { - chr_int = 22; - } else if (chr[0] == 'Y') { - chr_int = 23; - } else { - chr_int = (short int)atoi(chr); - if (chr_int == 0) {continue;} - chr_int -= 1; - } + short int chr_int = chr_to_chr_int(chr); + if (chr_int == -1) {continue;} // Fill depth for (int k = 0 ; k < n_cigar ; k++) { int op = bam_cigar_opchr(bam_cigar_op(cigar[k])); int len = bam_cigar_oplen(cigar[k]); if (op == 'M') { - int bid = find_breakpoint(brk,chr_int,pos,pos+len); - brk.get(bid).depth++; + int b1 = 0; + int b2 = 0; + find_breakpoint(brk,chr_int,pos,pos+len,b1,b2); + for (int i = b1 ; i < b2 ; i++) { + brk.get(i).depth++; + } } else { pos += len; @@ -293,15 +246,6 @@ void read_break_points(std::string & bam_file, openfpm::vector<aggregate<short i sam_close(fp_in); } - openfpm::vector<int> id_to_remove; - // Filter breakpoints by depth - for (int i = 0 ; i < brk.size() ; ++i) { - if (brk.get(i).depth*0.1 >= 3 && brk.get(i).depth*0.1 > brk.get(i).count) { - id_to_remove.add(i); - } - } - brk.remove(id_to_remove); - // valid break are all points @@ -317,111 +261,71 @@ void read_break_points(std::string & bam_file, openfpm::vector<aggregate<short i short int chr; int pos_sum = 0; int count_sum = 0; + int depth_sum = 0; int k; for (k = i ; k < brk.size() && fabs(break_points_out.last().get<1>() - brk.get(k).pos) < 100 ; k++) { chr = brk.get(k).chr; pos_sum += brk.get(k).pos; count_sum += brk.get(k).count; + depth_sum += brk.get(k).depth; } int pos = std::round(pos_sum / (k - i)); + int depth = std::round(depth_sum / (k - i)); break_points_out.last().get<0>() = chr; break_points_out.last().get<1>() = pos; break_points_out.last().get<2>() = count_sum; + break_points_out.last().get<3>() = depth_sum; if (k < brk.size()) { break_points_out.add(); break_points_out.last().get<0>() = brk.get(k).chr; break_points_out.last().get<1>() = brk.get(k).pos; break_points_out.last().get<2>() = brk.get(k).count; + break_points_out.last().get<3>() = brk.get(k).depth; } i = k; } -} -void read_sample(std::string bam_file, - openfpm::vector<int> & base_count, - openfpm::vector<int> & chr_count, - openfpm::vector<aggregate<short int,int,int>> & intervals, - int sample, - int N_samples) { + openfpm::vector<int> id_to_remove; + // Filter breakpoints by depth + for (int i = 0 ; i < break_points_out.size() ; ++i) { + if (break_points_out.get<3>(i)*0.1 >= 3 && break_points_out.get<3>(i)*0.1 > break_points_out.get<2>(i)) { + id_to_remove.add(i); + } + } + break_points_out.remove(id_to_remove); +} - samFile *fp_in = hts_open(bam_file.c_str(),"r"); //open bam file - bam_hdr_t *bamHdr = sam_hdr_read(fp_in); //read header - bam1_t *aln = bam_init1(); //initialize an alignment +void read_bed_file(std::string bed_f, openfpm::vector<aggregate<short int,int,int>> & intervals,int pad) { + std::ifstream bedFile(bed_f.c_str()); - while(sam_read1(fp_in,bamHdr,aln) > 0){ - - int32_t pos = aln->core.pos +1; //left most position of alignment in zero based coordianate (+1) - if (aln->core.tid == -1) - {continue;} - char *chr = bamHdr->target_name[aln->core.tid] ; //contig name (chromosome) - uint32_t len = aln->core.l_qseq; //length of the read. - int n_cigar = aln->core.n_cigar; // number of cigar operations - uint32_t * cigar = (uint32_t *)(aln->data + aln->core.l_qname); + if (!bedFile.is_open()) { + std::cerr << "Error: Unable to open the BED file." << std::endl; + return; + } - short int chr_int; - if (chr[0] == 'X') { - chr_int = 22; - } else if (chr[0] == 'Y') { - chr_int = 23; - } else { - chr_int = (short int)atoi(chr); - if (chr_int == 0) {continue;} - chr_int -= 1; - } + std::string line; - chr_count.get(N_samples*chr_int + sample) += len; + char str[256]; - int j = 0; - int interval = 0; - int interval_end = 0; - - + while (std::getline(bedFile, line)) { + std::istringstream iss(line); - for (int k = 0 ; k < n_cigar ; k++) { - int op = bam_cigar_opchr(bam_cigar_op(cigar[k])); - int len = bam_cigar_oplen(cigar[k]); - find_interval(intervals,chr_int,pos,interval); - find_interval(intervals,chr_int,pos+len,interval_end); - if (op == 'M') { - if (interval != -1) { - add_to_interval(base_count,intervals,interval,sample,N_samples,pos,len); - } - if (interval_end != -1) { - add_to_interval(base_count,intervals,interval_end,sample,N_samples,pos,len); - } - } - else if (op == 'D' || op == 'N') {pos += len;} - else { - pos += len; - } - } - } - - bam_destroy1(aln); - sam_close(fp_in); -} + iss >> str; + int chr_int = chr_to_chr_int(str); + if (chr_int == -1) {continue;} -template<typename T> T CN_factor(T & CN) { - if (CN > 1.9) { - if (CN <= 2.0) - {return 1.0 - 5.0*(CN-1.9);} - else if (CN > 2.0) { - if (CN < 2.1) { - return 0.5 + 5.0*(CN - 2.0); - } - } + intervals.add(); + intervals.last().get<0>() = chr_int; + iss >> intervals.last().get<1>(); + iss >> intervals.last().get<2>(); } - - return 1.0; + bedFile.close(); } -template<typename T> T CN_offset(T & CN) { - return (CN - 2.0f)*(CN - 2.0f); -} template<typename T> T penalization_fishing(T & a, T left, T right) { T min_distance = std::min(fabs(a-left)*1000,fabs(a-right)*1000); @@ -444,43 +348,22 @@ int main(int argc, char* argv[]) openfpm::vector<long int> number_or_reads; short int chromosome=4; - int start = 140213968; - int end = 140252774; openfpm::vector<aggregate<short int,int, int>> break_point; openfpm::vector<aggregate<short int,int, int>> intervals; - openfpm::vector<aggregate<short int,int, int>> read_splits; - - intervals.add(); - intervals.last().get<0>() = chromosome; - intervals.last().get<1>() = 140213968; - intervals.last().get<2>() = 140216338; - - intervals.add(); - intervals.last().get<0>() = chromosome; - intervals.last().get<1>() = 140220906; - intervals.last().get<2>() = 140223351; + openfpm::vector<aggregate<short int,int, int, int>> read_splits; - intervals.add(); - intervals.last().get<0>() = chromosome; - intervals.last().get<1>() = 140228080; - intervals.last().get<2>() = 140230609; + // Read bed file for interval - intervals.add(); - intervals.last().get<0>() = chromosome; - intervals.last().get<1>() = 140235633; - intervals.last().get<2>() = 140238168; + read_bed_file("/nvme/bams/Twist_Exome_Core_RefSeq_Targets_b37_chr.bed",intervals,0); - intervals.add(); - intervals.last().get<0>() = chromosome; - intervals.last().get<1>() = 140248688; - intervals.last().get<2>() = 140251121; + // Calculate the number of bases in the interval - intervals.add(); - intervals.last().get<0>() = chromosome; - intervals.last().get<1>() = 140255057; - intervals.last().get<2>() = 140257436; + int n_bases = 0; + for (int i = 0 ; i < intervals.size() ; i++) { + n_bases += intervals.get<2>(i) - intervals.get<1>(i); + } openfpm::vector<int> chr_counts; openfpm::vector<int> counts; @@ -488,11 +371,17 @@ int main(int argc, char* argv[]) openfpm::vector<std::string> files_bams; - files_bams.add("/nvme/bams/DE78NGSUKBD134462_76322_cut.bam"); - files_bams.add("/nvme/bams/DE47NGSUKBD116105_76321_cut.bam"); - files_bams.add("/nvme/bams/DE06NGSUKBD138183_97987_cut.bam"); - files_bams.add("/nvme/bams/DE07NGSUKBD138165_96729_cut.bam"); - files_bams.add("/nvme/bams/DE12NGSUKBD138172_95188_cut.bam"); + files_bams.add("/nvme/bams/DE14NGSUKBD138718_97851_cut.bam"); + files_bams.add("/nvme/bams/DE12NGSUKBD138851_96029_cut.bam"); + files_bams.add("/nvme/bams/DE06NGSUKBD138862_98760_cut.bam"); + files_bams.add("/nvme/bams/DE08NGSUKBD138729_77978_cut.bam"); + files_bams.add("/nvme/bams/DE11NGSUKBD138869_97627_cut.bam"); + + files_bams.add("/nvme/bams/batch_1/DE78NGSUKBD134462_76322_cut.bam"); + files_bams.add("/nvme/bams/batch_1/DE47NGSUKBD116105_76321_cut.bam"); + files_bams.add("/nvme/bams/batch_1/DE06NGSUKBD138183_97987_cut.bam"); + files_bams.add("/nvme/bams/batch_1/DE07NGSUKBD138165_96729_cut.bam"); + files_bams.add("/nvme/bams/batch_1/DE12NGSUKBD138172_95188_cut.bam"); int N_samples = files_bams.size(); chr_counts.resize(N_chr*N_samples); @@ -510,34 +399,16 @@ int main(int argc, char* argv[]) read_sample(files_bams.get(i),counts,chr_counts,intervals,i,N_samples); } - // Save the status, the program is doomed because of htslib is bugged or I do not understand how to use the library avoiding corruption - // P.S. Good library in general try to prevent this - -/* intervals.load("intervals.dat"); - counts.load("count.dat"); - chr_counts.load("chr_count.dat");*/ - + // count by depth int count = 0; - for (int j = 0 ; j < intervals.size() ; j++) { - std::cout << "Interval number of bases: "; for (int i = 0 ; i < N_samples ; i++) { counts.get(N_samples*j + i) /= intervals.get<2>(j) - intervals.get<1>(j); - std::cout << counts.get(N_samples*j + i) << " "; - } - std::cout << std::endl; - } - - for (int j = 0 ; j < N_chr ; j++) { - std::cout << "chr " << j << " "; - for (int i = 0 ; i < N_samples ; i++) { - std::cout << " " << chr_counts.get(N_samples*j + i); } - std::cout << std::endl; } constexpr int N_int_opt = 1; - constexpr int N_sample_opt = 5; + constexpr int N_sample_opt = 10; constexpr int dim = N_int_opt + N_sample_opt; Box<dim,double> domain; @@ -553,190 +424,144 @@ int main(int argc, char* argv[]) domain.setHigh(i,6.0); } - // optimize the model function - - int max_count = 0; - for (int i = 0 ; i < counts.size() ; i++) { - if (counts.get(i) > max_count) { - max_count = counts.get(i); - } - } - - int max_chr = 0; - for (int i = 0 ; i < chr_counts.size() ; i++) { - if (chr_counts.get(i) > max_chr) { - max_chr = chr_counts.get(i); - } - } - // Divide chromosome to get max_count - double factor = (double)max_count / max_chr; - - std::cout << "FACTOR: " << factor << std::endl; + double factor = (double)1.0 / (double)n_bases; for (int j = 0 ; j < N_sample_opt ; j++) { chr_counts.get(chromosome*N_samples+j)*=factor; } - std::cout << "----------------------------" << std::endl; + auto & v_cl = create_vcluster(); + if (v_cl.rank() == 0) { - for (int i = 0 ; i < N_samples ; i++) { for (int j = 0 ; j < N_chr ; j++) { - std::cout << "sample " << i << " chr: " << j << std::endl; - std::cout << "Bases: " << chr_counts.get(N_samples*j + i) << std::endl; + std::cout << "chr " << j << " "; + for (int i = 0 ; i < N_samples ; i++) { + std::cout << " " << chr_counts.get(N_samples*j + i); + } + std::cout << std::endl; } } openfpm::vector<aggregate<float[16]>> solutions; + openfpm::vector<aggregate<float[16]>> confidence; solutions.resize(intervals.size()); + confidence.resize(intervals.size()); + int n_interval_to_process = 0; for (int i = 0 ; i < intervals.size() ; i++) { + bool is_zero = true; + for (int j = 0 ; j < N_samples; j++) { + if (counts.get(N_samples*i + j) != 0) { + is_zero = false; + } + } + if (is_zero == true) {continue;} + n_interval_to_process++; + } - std::cout << "----------------------------" << std::endl; - std::cout << "INTERVALS: " << i << std::endl; - std::cout << "----------------------------" << std::endl; + std::ofstream fout; + if (v_cl.rank() == 0) { + // Open a TSV file for writing + fout.open("calls.tsv"); + if (!fout.is_open()) { + std::cerr << "Failed to open the file for writing.\n"; + return 1; + } + fout << "chromosome" << '\t' << "start" << '\t' << "stop"; + for (int j = 0 ; j < N_sample_opt ; j++) { + fout << '\t' << basename(files_bams.get(j)); + } + fout << '\t' << "value" << '\t' << "CALL" << std::endl; + } - double reg = max_count / 0.1; + int interval_processed = 0; + for (int i = 0 ; i < intervals.size() ; i++) { + float reg = 2000; int selected_interval = i; double fishing_left = 0.5; double fishing_right = 0.5; auto lamb_eval = [&](Eigen::VectorXd & vars){ - - double val = 0.0; - - for (int j = 0 ; j < N_sample_opt ; j++) { - for (int i = selected_interval,k=0 ; i < selected_interval + N_int_opt ; i++, k++ ) { - if (vars(k) < 0.0 || vars(N_int_opt+j) < 0.0) { - val += domain.getHigh(0)*domain.getHigh(N_int_opt)*chr_counts.get(chromosome*N_samples+j); - std::cout << "OUT OF BOUND" << std::endl; - } else { - val += fabs(vars(k)*chr_counts.get(chromosome*N_samples+j)*vars(N_int_opt+j) - counts.get(i*N_samples + j))*CN_factor(vars(N_int_opt+j)) + 2.0f*CN_offset(vars(N_int_opt+j)); - std::cout << "External: " << vars(k)*chr_counts.get(chromosome*N_samples+j)*vars(N_int_opt+j) - counts.get(i*N_samples + j) << "*" << CN_factor(vars(N_int_opt+j)) << "+" << 2.0f*CN_offset(vars(N_int_opt+j)) << std::endl; - } - - - // near integer - int near_int = std::round(vars(N_int_opt+j)); - - // Penalization for CN not being an integer - val += fabs(vars(N_int_opt+j) - near_int) * reg; - - // Penalization for not being near the other regions for fishing - //val += penalization_fishing(vars(k),fishing_left,fishing_right); - } - } - - // Penalization for out of bound parameters - for (int i = 0 ; i < N_sample_opt + N_int_opt ; i++) { - if (vars(i) < domain.getLow(i)) { - val += (domain.getLow(i) - vars(i))*20000.0; - } - if (vars(i) > domain.getHigh(i)) { - val += (vars(i) - domain.getHigh(i))*20000.0; - } - } - - //std::cout << "VALUE " << val << " " << vars(0) << " " << vars(1) << " " << vars(2) << vars(3) << " " << vars(4) << " " << vars(5) << " || " << vars(6) << " " << vars(7) << std::endl; - - #ifdef DEBUG_PRINT - std::cout << "VALUE " << val; - for (int i = 0 ; i < N_sample_opt + N_int_opt; i++) { - std::cout << vars(i) - } - std::cout << std::endl; - #endif - - return val; + return model_function<true,dim,N_int_opt,N_sample_opt>(selected_interval,vars,domain,chr_counts,counts,chromosome,reg); }; auto lamb = [&](Eigen::VectorXd & vars){ + return model_function<false,dim,N_int_opt,N_sample_opt>(selected_interval,vars,domain,chr_counts,counts,chromosome,reg); + }; - double val = 0.0; - - for (int j = 0 ; j < N_sample_opt ; j++) { - for (int i = selected_interval,k=0 ; i < selected_interval + N_int_opt ; i++, k++ ) { - if (vars(k) < 0.0 || vars(N_int_opt+j) < 0.0) { - val += domain.getHigh(0)*domain.getHigh(N_int_opt)*chr_counts.get(chromosome*N_samples+j); - } else { - val += fabs(vars(k)*chr_counts.get(chromosome*N_samples+j)*vars(N_int_opt+j) - counts.get(i*N_samples + j))*CN_factor(vars(N_int_opt+j)) + 2.0f*CN_offset(vars(N_int_opt+j)); - } - - - // near integer - int near_int = std::round(vars(N_int_opt+j)); + // Check there is data - // Penalization for CN not being an integer - val += fabs(vars(N_int_opt+j) - near_int) * reg; + bool is_zero = true; + for (int j = 0 ; j < N_samples; j++) { + if (counts.get(N_samples*i + j) != 0) { + is_zero = false; + } + } + if (is_zero == true) {continue;} - // Penalization for not being near the other regions for fishing - //val += penalization_fishing(vars(k),fishing_left,fishing_right); - } + openfpm::vector<double> sol; + if (v_cl.rank() == 0) { + std::cout << "Processing interval: " << interval_processed << "/" << n_interval_to_process << std::endl; + std::cout << "Chromosome: " << intervals.get<0>(i)+1 << " start: " << intervals.get<1>(i) << " stop: " << intervals.get<2>(i) << std::endl; + } + optimize(domain,0.0,sol,lamb); + if (v_cl.rank() == 0) {std::cout << std::endl;} + interval_processed++; + bool is_call = false; + fout << intervals.get<0>(i)+1 << '\t' << intervals.get<1>(i) << '\t' << intervals.get<2>(i); + for (int k = 0 ; k < sol.size() ; k++) { + solutions.get<0>(i)[k] = sol.get(k); + if (k != 0 && std::round(solutions.get<0>(i)[k]) != 2) { + is_call = true; } + if (v_cl.rank() == 0) + {fout << '\t' << solutions.get<0>(i)[k];} + } + Eigen::VectorXd vars; + vars.resize(sol.size()); + for (int s = 0 ; s < sol.size() ; s++) { + vars(s) = sol.get(s); + } - // Penalization for out of bound parameters - for (int i = 0 ; i < N_sample_opt + N_int_opt ; i++) { - if (vars(i) < domain.getLow(i)) { - val += (domain.getLow(i) - vars(i))*20000.0; - } - if (vars(i) > domain.getHigh(i)) { - val += (vars(i) - domain.getHigh(i))*20000.0; - } + // calculate confidence + if (v_cl.rank() == 0) { + std::cout << "CONFIDENCE: "; + for (int i = N_int_opt ; i < N_int_opt + N_sample_opt ; i++) { + confidence.get<0>(selected_interval)[i] = model_function_confidence<N_int_opt, N_sample_opt>(selected_interval,i-N_int_opt,vars,chr_counts,counts,chromosome); + std::cout << confidence.get<0>(selected_interval)[i] << " "; } + std::cout << std::endl; + } - // Penalization for majority of samples CN bigger than two - auto majority = (float)N_sample_opt/2.0; - for (int i = 0 ; i < N_sample_opt ; i++) { - majority -= (vars(N_int_opt+i) - 2); + if (v_cl.rank() == 0) + {fout << '\t' << lamb(vars);} + if (v_cl.rank() == 0) { + if (is_call == true) { + std::cout << "----------------------------------------------------------------------------------------" << std::endl; + std::cout << "-----------------------------------CALL-------------------------------------------------" << std::endl; + std::cout << "----------------------------------------------------------------------------------------" << std::endl; + fout << '\t' << "CALL" << std::endl; } - if (majority < 0) { - majority = fabs(majority); - } else { - majority = 0.0; + else { + fout << '\t' << "." << std::endl; } - val += majority*1000.0; - - //std::cout << "VALUE " << val << " " << vars(0) << " " << vars(1) << " " << vars(2) << vars(3) << " " << vars(4) << " " << vars(5) << " || " << vars(6) << " " << vars(7) << std::endl; - - #ifdef DEBUG_PRINT - std::cout << "VALUE " << val; - for (int i = 0 ; i < N_sample_opt + N_int_opt; i++) { - std::cout << vars(i) + for (int j = 0 ; j < N_sample_opt ; j++) { + std::cout << basename(files_bams.get(j)) << " " << solutions.get<0>(i)[1+j] << " "; } - std::cout << std::endl; - #endif - - return val; - }; - - // // Check solution - // if (i == 1) { - // Eigen::VectorXd vars(6); - // vars(0) = 0.351632*3.0/2.0; - // vars(1) = 1; - // vars(2) = 2; - // vars(3) = 2; - // vars(4) = 2; - // vars(5) = 2; - // std::cout << "WANTED VALUE: " << lamb_eval(vars) << std::endl; - // } - - openfpm::vector<double> sol; - optimize(domain,0.0,sol,lamb); - for (int k = 0 ; k < sol.size() ; k++) { - solutions.get<0>(i)[k] = sol.get(k); } + fout << std::flush; } // For each solutions - for (int i = 0 ; i < intervals.size() ; i++) { +/* for (int i = 0 ; i < intervals.size() ; i++) { std::cout << "Chromosome: " << intervals.get<0>(i) << " start: " << intervals.get<1>(i) << " stop: " << intervals.get<2>(i) << " bait efficency: " << solutions.get<0>(i)[0] << " CNs "; for (int j = 0 ; j < N_sample_opt ; j++) { std::cout << basename(files_bams.get(j)) << " " << solutions.get<0>(i)[1+j] << " "; } std::cout << std::endl; - } + }*/ openfpm_finalize(); } diff --git a/example/Sequencing/CNVcaller/stat_calls.py b/example/Sequencing/CNVcaller/stat_calls.py new file mode 100644 index 0000000000000000000000000000000000000000..5ea9a4737889a6696984490dc57add5e3b7f2a1f --- /dev/null +++ b/example/Sequencing/CNVcaller/stat_calls.py @@ -0,0 +1,44 @@ +import np as np +import matplotlib.pyplot as plt + +f = open("calls_material_batch2.tsv","r") + +# read the first line +line = f.readline() +line = line.split('\t') + +samples = line[3:-2] + +stat = [] + +for i in range(0,len(line)-5): + stat.append([0,0,0,0,0,0,0]) + +for line in f.readlines(): + + line = line.split('\t') + + for i,cn in enumerate(line[4:-2]): + stat[i][round(float(cn))] += 1 + + +# print stat + +for i,sample in enumerate(samples): + print( sample + "\t" + "\t".join([str(x) for x in stat[i]])) + +# simulate truncated + +lam = 100 +samples = np.random.poisson(lam, 1000000) + +plt.hist(samples, bins=range(min(samples), max(samples) + 1, 1), alpha=0.75, color='blue', edgecolor='black') + +plt.title('Histogram of Poisson Samples') +plt.xlabel('Number of events') +plt.ylabel('Frequency') + +# Show the plot +plt.show() + +print(samples) \ No newline at end of file