diff --git a/src/NN/CellList/CellList_gpu_test.cu b/src/NN/CellList/CellList_gpu_test.cu
index 55ad9a0517b87ecfb372da57c8b7febdac72470e..2d19dd2a1f2f33488c66fb98eee03029648d8d56 100644
--- a/src/NN/CellList/CellList_gpu_test.cu
+++ b/src/NN/CellList/CellList_gpu_test.cu
@@ -83,17 +83,16 @@ void test_sub_index()
 
 	no_transform_only<dim,T> t(mt,pt);
 
+	CUDA_LAUNCH_DIM3((subindex<false,dim,T,cnt_type,ids_type,no_transform_only<dim,T>>),ite.wthr,ite.thr,div,
 									spacing,
 									off,
 									t,
-									pl.capacity(),
 									pl.size(),
-									part_ids.capacity(),
 									0,
-									static_cast<T *>(pl.template getDeviceBuffer<0>()),
-									static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()),
-									static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()));
+									pl.toKernel(),
+									cl_n.toKernel(),
+									part_ids.toKernel());
 
 	cl_n.template deviceToHost<0>();
 	part_ids.template deviceToHost<0>();
@@ -203,17 +202,16 @@ void test_sub_index2()
 
 	shift_only<dim,T> t(mt,pt);
 
+	CUDA_LAUNCH_DIM3((subindex<false,dim,T,cnt_type,ids_type,shift_only<dim,T>>),ite.wthr,ite.thr,div,
 									spacing,
 									off,
 									t,
-									pl.capacity(),
 									pl.size(),
-									part_ids.capacity(),
 									0,
-									static_cast<T *>(pl.template getDeviceBuffer<0>()),
-									static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()),
-									static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()));
+									pl.toKernel(),
+									cl_n.toKernel(),
+									part_ids.toKernel());
 
 	cl_n.template deviceToHost<0>();
 	part_ids.template deviceToHost<0>();
@@ -387,11 +385,10 @@ void test_fill_cell()
 									div_c,
 									off,
 									part_ids.size(),
-									part_ids.capacity(),
 									0,
-									static_cast<cnt_type *>(starts.template getDeviceBuffer<0>()),
-									static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()),
-									static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()) );
+									starts.toKernel(),
+									part_ids.toKernel(),
+									cells.toKernel() );
 
 	cells.template deviceToHost<0>();
@@ -666,7 +663,8 @@ void test_reorder_parts(size_t n_part)
 	// Here we test fill cell
 	CUDA_LAUNCH_DIM3((reorder_parts<decltype(parts_prp.toKernel()),
 									decltype(pl.toKernel()),
-									decltype(sort_to_not_sort.toKernel()),
+									decltype(sort_to_not_sort.toKernel()),
+									decltype(cells_out.toKernel()),
 									cnt_type,
 									shift_ph<0,cnt_type>>),ite.wthr,ite.thr,pl.size(),
 									parts_prp.toKernel(),
@@ -675,7 +673,7 @@ void test_reorder_parts(size_t n_part)
 									pl_out.toKernel(),
 									sort_to_not_sort.toKernel(),
 									non_sort_to_sort.toKernel(),
-									static_cast<cnt_type *>(cells_out.template getDeviceBuffer<0>()));
+									cells_out.toKernel());
 
 	bool check = true;
 	parts_prp_out.template deviceToHost<0>();
diff --git a/src/NN/CellList/cuda/CellList_gpu.hpp b/src/NN/CellList/cuda/CellList_gpu.hpp
index defee388cdefe38e20acf4e8a7397863c6ff2b70..5ee0e2b043520674e05d0e7a35456865dc707a29 100644
--- a/src/NN/CellList/cuda/CellList_gpu.hpp
+++ b/src/NN/CellList/cuda/CellList_gpu.hpp
@@ -261,13 +261,11 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 									spacing_c,
 									off,
 									this->getTransform(),
-									pl.capacity(),
 									pl.size(),
-									part_ids.capacity(),
 									start,
-									static_cast<T *>(pl.template getDeviceBuffer<0>()),
-									static_cast<cnt_type *>(starts.template getDeviceBuffer<0>()),
-									static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()));
+									pl.toKernel(),
+									starts.toKernel(),
+									part_ids.toKernel());
 
 		// now we construct the cells
 
@@ -306,6 +304,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 		CUDA_LAUNCH((reorder_parts<decltype(pl_prp.toKernel()),
 									decltype(pl.toKernel()),
 									decltype(sorted_to_not_sorted.toKernel()),
+									decltype(cells.toKernel()),
 									cnt_type,shift_ph<0,cnt_type>>),ite,sorted_to_not_sorted.size(),
 									pl_prp.toKernel(),
 									pl_prp_out.toKernel(),
@@ -313,7 +312,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 									pl_out.toKernel(),
 									sorted_to_not_sorted.toKernel(),
 									non_sorted_to_sorted.toKernel(),
-									static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()));
+									cells.toKernel());
 
 		if (opt == cl_construct_opt::Full)
 		{
@@ -391,13 +390,11 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 									spacing_c,
 									off,
 									this->getTransform(),
-									pl.capacity(),
 									stop,
-									part_ids.capacity(),
 									start,
-									static_cast<T *>(pl.template getDeviceBuffer<0>()),
-									static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()),
-									static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()));
+									pl.toKernel(),
+									cl_n.toKernel(),
+									part_ids.toKernel());
 
 		// now we scan
 		starts.resize(cl_n.size());
@@ -413,7 +410,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 
 		CUDA_LAUNCH((fill_cells<dim,cnt_type,ids_type,shift_ph<0,cnt_type>>),itgg,0,
 									part_ids.size(),
-									static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()) );
+									cells.toKernel() );
 
 		// sort
@@ -422,14 +419,13 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 #else
 
 		CUDA_LAUNCH((fill_cells<dim,cnt_type,ids_type,shift_ph<0,cnt_type>>),itgg,0,
-									div_c,
-									off,
-									part_ids.size(),
-									part_ids.capacity(),
-									start,
-									static_cast<cnt_type *>(starts.template getDeviceBuffer<0>()),
-									static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()),
-									static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()) );
+									div_c,
+									off,
+									part_ids.size(),
+									start,
+									starts.toKernel(),
+									part_ids.toKernel(),
+									cells.toKernel() );
 
 #endif
 
@@ -445,6 +441,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 			CUDA_LAUNCH((reorder_parts<decltype(pl_prp.toKernel()),
 									decltype(pl.toKernel()),
 									decltype(sorted_to_not_sorted.toKernel()),
+									decltype(cells.toKernel()),
 									cnt_type,shift_ph<0,cnt_type>>),ite,sorted_to_not_sorted.size(),
 									pl_prp.toKernel(),
 									pl_prp_out.toKernel(),
@@ -452,7 +449,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 									pl_out.toKernel(),
 									sorted_to_not_sorted.toKernel(),
 									non_sorted_to_sorted.toKernel(),
-									static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()));
+									cells.toKernel());
 		}
 		else
 		{
@@ -460,6 +457,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 			CUDA_LAUNCH((reorder_parts_wprp<decltype(pl_prp.toKernel()),
 									decltype(pl.toKernel()),
 									decltype(sorted_to_not_sorted.toKernel()),
+									decltype(cells.toKernel()),
 									cnt_type,shift_ph<0,cnt_type>,prp...>),ite,sorted_to_not_sorted.size(),
 									pl_prp.toKernel(),
 									pl_prp_out.toKernel(),
@@ -467,7 +465,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 									pl_out.toKernel(),
 									sorted_to_not_sorted.toKernel(),
 									non_sorted_to_sorted.toKernel(),
-									static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()));
+									cells.toKernel());
 		}
 
 		if (opt == cl_construct_opt::Full)
diff --git a/src/NN/CellList/cuda/Cuda_cell_list_util_func.hpp b/src/NN/CellList/cuda/Cuda_cell_list_util_func.hpp
index 0d7c86a9fea377d0e6f538c36c46d7db5f3b29eb..e50335e7a2837e4fba52cbdb51fc8243a76341a8 100644
--- a/src/NN/CellList/cuda/Cuda_cell_list_util_func.hpp
+++ b/src/NN/CellList/cuda/Cuda_cell_list_util_func.hpp
@@ -258,18 +258,17 @@ __device__ __host__ cnt_type encode_phase_id(cnt_type ph_id,cnt_type pid)
 
 #ifdef __NVCC__
 
 template<bool is_sparse,unsigned int dim, typename pos_type,
-         typename cnt_type, typename ids_type, typename transform>
+         typename cnt_type, typename ids_type, typename transform,
+         typename vector_pos_type, typename vector_cnt_type, typename vector_pids_type>
 __global__ void subindex(openfpm::array<ids_type,dim,cnt_type> div,
 		openfpm::array<pos_type,dim,cnt_type> spacing,
 		openfpm::array<ids_type,dim,cnt_type> off,
 		transform t,
-		int n_cap,
 		int n_part,
-		int n_cap2,
 		cnt_type start,
-		pos_type * p_pos,
-		cnt_type *counts,
-		cnt_type * p_ids)
+		vector_pos_type p_pos,
+		vector_cnt_type counts,
+		vector_pids_type p_ids)
 {
 	cnt_type i, cid, ins;
 	ids_type e[dim+1];
@@ -281,23 +280,23 @@ __global__ void subindex(openfpm::array<ids_type,dim,cnt_type> div,
 	pos_type p[dim];
 
 	for (size_t k = 0 ; k < dim ; k++)
-	{p[k] = p_pos[i+k*n_cap];}
+	{p[k] = p_pos.template get<0>(i)[k];}
 
 	cid = cid_<dim,cnt_type,ids_type,transform>::get_cid(div,spacing,off,t,p,e);
 
 	if (is_sparse == false)
 	{
-		e[dim] = atomicAdd(counts + cid, 1);
+		e[dim] = atomicAdd(&counts.template get<0>(cid), 1);
 
-		p_ids[ins] = cid;
-		{p_ids[ins+1*(n_cap2)] = e[dim];}
+		p_ids.template get<0>(ins)[0] = cid;
+		{p_ids.template get<0>(ins)[1] = e[dim];}
 	}
 	else
 	{
-		p_ids[ins] = cid;
-		{p_ids[ins+1*(n_cap2)] = e[dim];}
+		p_ids.template get<0>(ins)[0] = cid;
+		{p_ids.template get<0>(ins)[1] = e[dim];}
 
-		counts[ins] = cid;
+		counts.template get<0>(ins) = cid;
 	}
 }
@@ -353,43 +352,42 @@ __global__ void subindex_without_count(openfpm::array<ids_type,dim,cnt_type> div
 
 #ifdef MAKE_CELLLIST_DETERMINISTIC
 
-template<unsigned int dim, typename cnt_type, typename ids_type, typename ph>
+template<unsigned int dim, typename cnt_type, typename ids_type, typename ph, typename vector_cells_type>
 __global__ void fill_cells(cnt_type phase_id ,
 		cnt_type n,
-		cnt_type *cells)
+		vector_cells_type cells)
 {
 	cnt_type i;
 
 	i = threadIdx.x + blockIdx.x * blockDim.x;
 	if (i >= n) return;
 
-	cells[i] = encode_phase_id<cnt_type,ph>(phase_id,i);
+	cells.template get<0>(i) = encode_phase_id<cnt_type,ph>(phase_id,i);
 }
 
 #else
 
-template<unsigned int dim, typename cnt_type, typename ids_type, typename ph>
+template<unsigned int dim, typename cnt_type, typename ids_type, typename ph,typename vector_starts_type, typename vector_pids_type, typename vector_cells_type>
 __global__ void fill_cells(cnt_type phase_id ,
 		openfpm::array<ids_type,dim,cnt_type> div_c,
 		openfpm::array<ids_type,dim,cnt_type> off,
 		cnt_type n,
-		cnt_type n_cap,
 		cnt_type start_p,
-		const cnt_type *starts,
-		const cnt_type * p_ids,
-		cnt_type *cells)
+		vector_starts_type starts,
+		vector_pids_type p_ids,
+		vector_cells_type cells)
 {
 	cnt_type i, cid, id, start;
 
 	i = threadIdx.x + blockIdx.x * blockDim.x;
 	if (i >= n) return;
 
-	cid = p_ids[i];
+	cid = p_ids.template get<0>(i)[0];
 
-	start = starts[cid];
-	id = start + p_ids[1*n_cap+i];
+	start = starts.template get<0>(cid);
+	id = start + p_ids.template get<0>(i)[1];
 
-	cells[id] = encode_phase_id<cnt_type,ph>(phase_id,i + start_p);
+	cells.template get<0>(id) = encode_phase_id<cnt_type,ph>(phase_id,i + start_p);
 }
 
 #endif
@@ -424,7 +422,7 @@ __device__ inline void reorder_wprp(const vector_prp & input,
 	output.template set<prp ...>(dst_id,input,src_id);
 }
 
-template <typename vector_prp, typename vector_pos, typename vector_ns, typename cnt_type, typename sh>
+template <typename vector_prp, typename vector_pos, typename vector_ns, typename vector_cells_type, typename cnt_type, typename sh>
 __global__ void reorder_parts(int n,
 		const vector_prp input,
 		vector_prp output,
@@ -432,12 +430,12 @@ __global__ void reorder_parts(int n,
 		vector_pos output_pos,
 		vector_ns sorted_non_sorted,
 		vector_ns non_sorted_to_sorted,
-		const cnt_type * cells)
+		const vector_cells_type cells)
 {
 	cnt_type i = threadIdx.x + blockIdx.x * blockDim.x;
 
 	if (i >= n) return;
 
-	cnt_type code = cells[i];
+	cnt_type code = cells.template get<0>(i);
 
 	reorder(input, output, code,i);
 	reorder(input_pos,output_pos,code,i);
@@ -446,7 +444,7 @@ __global__ void reorder_parts(int n,
 	non_sorted_to_sorted.template get<0>(code) = i;
 }
 
-template <typename vector_prp, typename vector_pos, typename vector_ns, typename cnt_type, typename sh, unsigned int ... prp>
+template <typename vector_prp, typename vector_pos, typename vector_ns, typename vector_cells_type, typename cnt_type, typename sh, unsigned int ... prp>
 __global__ void reorder_parts_wprp(int n,
 		const vector_prp input,
 		vector_prp output,
@@ -454,12 +452,12 @@ __global__ void reorder_parts_wprp(int n,
 		vector_pos output_pos,
 		vector_ns sorted_non_sorted,
 		vector_ns non_sorted_to_sorted,
-		const cnt_type * cells)
+		const vector_cells_type cells)
 {
 	cnt_type i = threadIdx.x + blockIdx.x * blockDim.x;
 
 	if (i >= n) return;
 
-	cnt_type code = cells[i];
+	cnt_type code = cells.template get<0>(i);
 
 	reorder_wprp<vector_prp,cnt_type,prp...>(input, output, code,i);
 	reorder(input_pos,output_pos,code,i);
diff --git a/src/Vector/vector_subset.hpp b/src/Vector/vector_subset.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d6d78b69467911a22f3384f8d1d1a097c714a1bd
--- /dev/null
+++ b/src/Vector/vector_subset.hpp
@@ -0,0 +1,67 @@
+#ifndef VECTOR_SUBSET_HPP
+#define VECTOR_SUBSET_HPP
+
+namespace openfpm
+{
+
+template<unsigned int dim,
+         typename prop,
+         template<typename> class layout_base = memory_traits_inte>
+class vector_subset_ker
+{
+	mutable openfpm::vector_gpu_ker<typename apply_transform<layout_base,prop>::type,layout_base> v_all;
+
+	mutable openfpm::vector_gpu_ker<aggregate<int>,layout_base> indexes;
+
+public:
+
+	vector_subset_ker(openfpm::vector_gpu_ker<typename apply_transform<layout_base,prop>::type,layout_base> & v_all,
+	                  openfpm::vector_gpu_ker<aggregate<int>,layout_base> & indexes)
+	:v_all(v_all),indexes(indexes)
+	{}
+
+	// get property p of the subset element id, indirected through the indexes vector
+	template <unsigned int p>
+	__device__ __host__ inline auto get(size_t id) const -> decltype(v_all.template get<p>(0))
+	{
+		return v_all.template get<p>(indexes.template get<0>(id));
+	}
+};
+
+template<typename T, typename Memory, template<typename> class layout_base, typename grow_p, unsigned int impl>
+class vector_sub
+{
+	vector<T,Memory,layout_base,grow_p,impl> & v_all;
+
+	vector<aggregate<int>,Memory,layout_base,grow_p,impl> & indexes;
+
+public:
+
+	vector_sub(vector<T,Memory,layout_base,grow_p,impl> & v_all,
+	           vector<aggregate<int>,Memory,layout_base,grow_p,impl> & indexes)
+	:v_all(v_all),indexes(indexes)
+	{}
+
+	// To kernel
+	template<unsigned int ... prp> vector_subset_ker<dim,St,prop,layout_base> toKernel()
+	{
+		vector_subset_ker<dim,St,prop,layout_base> v(v_all.toKernel(), indexes.toKernel());
+
+		return v;
+	}
+
+	template <unsigned int p>
+	inline auto get(size_t id) const -> decltype(v_all.template get<p>(0))
+	{
+		return v_all.template get<p>(indexes.template get<0>(id));
+	}
+};
+
+}
+
+#endif
\ No newline at end of file
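
Note on the pattern applied throughout this patch (this note is not part of the patch itself): kernels stop receiving raw device pointers plus explicit capacities (n_cap, n_cap2) and instead receive the containers' toKernel() views, indexing elements through the multi-component get<0>() accessor, so the stride bookkeeping disappears from the kernel signatures. Below is a minimal sketch of a kernel written against this interface; scale_kernel, the aggregate<float> layout and the include paths are illustrative assumptions, not code from this patch:

// Illustrative sketch only; include paths are assumed, not taken from the patch
#include "Vector/map_vector.hpp"     // openfpm::vector_gpu, aggregate
#include "util/cuda_launch.hpp"      // CUDA_LAUNCH macro

// A kernel receiving a toKernel() view instead of (pointer, capacity):
// the view carries its own layout, so elements are accessed with get<0>(i).
template<typename vector_view_type>
__global__ void scale_kernel(int n, vector_view_type v, float factor)
{
	int i = threadIdx.x + blockIdx.x * blockDim.x;
	if (i >= n) return;

	v.template get<0>(i) *= factor;   // replaces ptr[i + k*n_cap] style indexing
}

// Host side: pass the view, no capacity argument is needed any more.
void scale_on_gpu(openfpm::vector_gpu<aggregate<float>> & v, float factor)
{
	auto ite = v.getGPUIterator();

	CUDA_LAUNCH(scale_kernel,ite,(int)v.size(),v.toKernel(),factor);
}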