From 2ff916ddeacf8dbdeb34a77b5faa5b3239a66317 Mon Sep 17 00:00:00 2001 From: Serhii Yaskovets <yaskovet@sbalzarini-mac-38.mpi-cbg.de> Date: Thu, 27 Apr 2023 12:10:37 +0200 Subject: [PATCH] Revert "Revert "Merge remote-tracking branch 'origin/FD_solver'"" This reverts commit c1c3e0c4e8586bf5061dbabbfa48dd0c2dc86960. --- src/Grid/grid_base_implementation.hpp | 2 +- src/Grid/grid_key.hpp | 2 +- src/Grid/grid_sm.hpp | 27 ++++-- src/NN/CellList/CellList_gpu_test.cu | 30 +++--- .../CellList/cuda/CellDecomposer_gpu_ker.cuh | 95 +++++++++++++++++-- src/NN/CellList/cuda/CellList_gpu.hpp | 89 +++++++++-------- src/NN/CellList/cuda/CellList_gpu_ker.cuh | 29 +++--- .../cuda/Cuda_cell_list_util_func.hpp | 60 ++++++------ src/Space/Shape/Point.hpp | 2 +- .../performance/performancePlots.cpp | 6 +- src/Vector/cuda/map_vector_cuda_ker.cuh | 80 +++++++++++++++- src/Vector/cuda/map_vector_std_cuda.hpp | 15 ++- src/Vector/cuda/map_vector_std_cuda_ker.cuh | 21 ++++ src/Vector/map_vector.hpp | 43 +++++++-- src/Vector/map_vector_std.hpp | 10 ++ src/Vector/vector_subset.hpp | 67 +++++++++++++ src/util/mathutil.hpp | 35 ++++--- 17 files changed, 473 insertions(+), 140 deletions(-) create mode 100644 src/Vector/vector_subset.hpp diff --git a/src/Grid/grid_base_implementation.hpp b/src/Grid/grid_base_implementation.hpp index 60392952..7a1a6693 100644 --- a/src/Grid/grid_base_implementation.hpp +++ b/src/Grid/grid_base_implementation.hpp @@ -752,7 +752,7 @@ public: { auto key = it.get(); - if (this->get_o(key) != this->get_o(key)) + if (this->get_o(key) != g.get_o(key)) return false; ++it; diff --git a/src/Grid/grid_key.hpp b/src/Grid/grid_key.hpp index 106f7ee5..90fab130 100644 --- a/src/Grid/grid_key.hpp +++ b/src/Grid/grid_key.hpp @@ -397,7 +397,7 @@ public: * */ template<typename a, typename ...T> - inline void set(a v, T...t) + __device__ __host__ inline void set(a v, T...t) { #ifdef SE_CLASS1 if (sizeof...(t) != dim -1) diff --git a/src/Grid/grid_sm.hpp b/src/Grid/grid_sm.hpp index e4511d29..651608b2 100644 --- a/src/Grid/grid_sm.hpp +++ b/src/Grid/grid_sm.hpp @@ -186,7 +186,7 @@ class grid_sm * */ - inline void Initialize(const size_t sz) + __device__ __host__ inline void Initialize(const size_t sz) { //! Initialize the basic structure for each dimension sz_s[0] = sz; @@ -217,7 +217,7 @@ class grid_sm * */ - inline void Initialize(const size_t (& sz)[N]) + __device__ __host__ inline void Initialize(const size_t (& sz)[N]) { //! Initialize the basic structure for each dimension sz_s[0] = sz[0]; @@ -353,7 +353,8 @@ public: * */ - template<typename S> inline grid_sm(const grid_sm<N,S> & g) + template<typename S> + __device__ __host__ inline grid_sm(const grid_sm<N,S> & g) { size_tot = g.size_tot; @@ -380,7 +381,7 @@ public: // Static element to calculate total size - inline size_t totalSize(const size_t (& sz)[N]) + __device__ __host__ inline size_t totalSize(const size_t (& sz)[N]) { size_t tSz = 1; @@ -415,7 +416,7 @@ public: * */ - inline grid_sm(const size_t (& sz)[N]) + __device__ __host__ inline grid_sm(const size_t (& sz)[N]) : size_tot(totalSize(sz)) { Initialize(sz); @@ -625,7 +626,7 @@ public: } //! Destructor - ~grid_sm() {}; + __device__ __host__ ~grid_sm() {}; /*! 
\brief Return the size of the grid * @@ -713,7 +714,7 @@ public: * */ - inline size_t size_s(unsigned int i) const + __device__ __host__ inline size_t size_s(unsigned int i) const { return sz_s[i]; } @@ -737,11 +738,21 @@ public: * \return get the size of the grid as an array * */ - inline const size_t (& getSize() const)[N] + __device__ __host__ inline const size_t (& getSize() const)[N] { return sz; } + /*! \brief Returns the size of the grid in the passed array + * + * \return the size of the grid in the passed array + * + */ + __device__ __host__ inline void getSize(size_t (&size) [N]) const + { + for (size_t i = 0; i < N; ++i) size[i] = sz[i]; + } + /*! \brief Return a sub-grid iterator * * Return a sub-grid iterator, to iterate through the grid diff --git a/src/NN/CellList/CellList_gpu_test.cu b/src/NN/CellList/CellList_gpu_test.cu index 55ad9a05..2d19dd2a 100644 --- a/src/NN/CellList/CellList_gpu_test.cu +++ b/src/NN/CellList/CellList_gpu_test.cu @@ -83,17 +83,16 @@ void test_sub_index() no_transform_only<dim,T> t(mt,pt); + CUDA_LAUNCH_DIM3((subindex<false,dim,T,cnt_type,ids_type,no_transform_only<dim,T>>),ite.wthr,ite.thr,div, spacing, off, t, - pl.capacity(), pl.size(), - part_ids.capacity(), 0, - static_cast<T *>(pl.template getDeviceBuffer<0>()), - static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()), - static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>())); + pl.toKernel(), + cl_n.toKernel(), + part_ids.toKernel()); cl_n.template deviceToHost<0>(); part_ids.template deviceToHost<0>(); @@ -203,17 +202,16 @@ void test_sub_index2() shift_only<dim,T> t(mt,pt); + CUDA_LAUNCH_DIM3((subindex<false,dim,T,cnt_type,ids_type,shift_only<dim,T>>),ite.wthr,ite.thr,div, spacing, off, t, - pl.capacity(), pl.size(), - part_ids.capacity(), 0, - static_cast<T *>(pl.template getDeviceBuffer<0>()), - static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()), - static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>())); + pl.toKernel(), + cl_n.toKernel(), + part_ids.toKernel()); cl_n.template deviceToHost<0>(); part_ids.template deviceToHost<0>(); @@ -387,11 +385,10 @@ void test_fill_cell() div_c, off, part_ids.size(), - part_ids.capacity(), 0, - static_cast<cnt_type *>(starts.template getDeviceBuffer<0>()), - static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()), - static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()) ); + starts.toKernel(), + part_ids.toKernel(), + cells.toKernel() ); cells.template deviceToHost<0>(); @@ -666,7 +663,8 @@ void test_reorder_parts(size_t n_part) // Here we test fill cell CUDA_LAUNCH_DIM3((reorder_parts<decltype(parts_prp.toKernel()), decltype(pl.toKernel()), - decltype(sort_to_not_sort.toKernel()), + decltype(sort_to_not_sort.toKernel()), + decltype(cells_out.toKernel()), cnt_type, shift_ph<0,cnt_type>>),ite.wthr,ite.thr,pl.size(), parts_prp.toKernel(), @@ -675,7 +673,7 @@ void test_reorder_parts(size_t n_part) pl_out.toKernel(), sort_to_not_sort.toKernel(), non_sort_to_sort.toKernel(), - static_cast<cnt_type *>(cells_out.template getDeviceBuffer<0>())); + cells_out.toKernel()); bool check = true; parts_prp_out.template deviceToHost<0>(); diff --git a/src/NN/CellList/cuda/CellDecomposer_gpu_ker.cuh b/src/NN/CellList/cuda/CellDecomposer_gpu_ker.cuh index fc108d88..3981ce07 100644 --- a/src/NN/CellList/cuda/CellDecomposer_gpu_ker.cuh +++ b/src/NN/CellList/cuda/CellDecomposer_gpu_ker.cuh @@ -29,30 +29,71 @@ class CellDecomposer_gpu_ker //! transformation transform t; + //! 
Unit box of the Cell list + SpaceBox<dim,T> box_unit; + + //! Grid structure of the Cell list + grid_sm<dim,void> gr_cell; + + //! cell_shift + Point<dim,long int> cell_shift; public: __device__ __host__ CellDecomposer_gpu_ker() {} - __device__ __host__ CellDecomposer_gpu_ker(openfpm::array<T,dim,cnt_type> & spacing_c, - openfpm::array<ids_type,dim,cnt_type> & div_c, - openfpm::array<ids_type,dim,cnt_type> & off, - const transform & t) - :spacing_c(spacing_c),div_c(div_c),off(off),t(t) + __device__ __host__ CellDecomposer_gpu_ker( + openfpm::array<T,dim,cnt_type> & spacing_c, + openfpm::array<ids_type,dim,cnt_type> & div_c, + openfpm::array<ids_type,dim,cnt_type> & off, + const transform & t) + : spacing_c(spacing_c),div_c(div_c),off(off),t(t) {} - __host__ grid_sm<dim,void> getGrid() + __device__ __host__ CellDecomposer_gpu_ker( + openfpm::array<T,dim,cnt_type> & spacing_c, + openfpm::array<ids_type,dim,cnt_type> & div_c, + openfpm::array<ids_type,dim,cnt_type> & off, + const transform & t, + SpaceBox<dim,T> box_unit, + grid_sm<dim,void> gr_cell, + Point<dim,long int> cell_shift) + : spacing_c(spacing_c),div_c(div_c),off(off),t(t), + box_unit(box_unit), gr_cell(gr_cell), cell_shift(cell_shift) + {} + + __device__ __host__ grid_sm<dim,void> getGrid() { size_t sz[dim]; for (size_t i = 0 ; i < dim ; i++) { - sz[i] = div_c[i] + 2*off[i]; + sz[i] = div_c[i]; } return grid_sm<dim,void> (sz); } + __device__ __host__ void getGridSize(size_t (& sz)[dim]) const + { + for (size_t i = 0 ; i < dim ; i++) + { + sz[i] = div_c[i] + 2*off[i]; + } + } + + template<typename ids_type2> + __device__ __host__ mem_id getGridLinId(const grid_key_dx<dim,ids_type2> & gk) const + { + mem_id lid = gk.get(0); + for (mem_id i = 1 ; i < dim ; i++) + { + lid += gk.get(i) * (div_c[i-1] + 2*off[i-1]); + } + + return lid; + } + __device__ __host__ inline grid_key_dx<dim,ids_type> getCell(const Point<dim,T> & xp) const { return cid_<dim,cnt_type,ids_type,transform>::get_cid_key(spacing_c,off,t,xp); @@ -82,6 +123,46 @@ public: { return t; } + + __device__ __host__ inline grid_key_dx<dim> getCellGrid(const T (& pos)[dim]) const + { + grid_key_dx<dim> key; + key.set_d(0,ConvertToID(pos,0)); + + for (size_t s = 1 ; s < dim ; s++) + { + key.set_d(s,ConvertToID(pos,s)); + } + + return key; + } + + __device__ __host__ inline grid_key_dx<dim> getCellGrid(const Point<dim,T> & pos) const + { + grid_key_dx<dim> key; + key.set_d(0,ConvertToID(pos,0)); + + for (size_t s = 1 ; s < dim ; s++) + { + key.set_d(s,ConvertToID(pos,s)); + } + + return key; + } + + __device__ __host__ inline size_t ConvertToID(const T (&x)[dim] ,size_t s) const + { + size_t id = openfpm::math::size_t_floor(t.transform(x,s) / box_unit.getHigh(s)) + off[s]; + id = (id >= gr_cell.size(s))?(gr_cell.size(s)-1-cell_shift.get(s)):id-cell_shift.get(s); + return id; + } + + __device__ __host__ inline size_t ConvertToID(const Point<dim,T> & x ,size_t s, size_t sc = 0) const + { + size_t id = openfpm::math::size_t_floor(t.transform(x,s) / box_unit.getHigh(s)) + off[s]; + id = (id >= gr_cell.size(s))?(gr_cell.size(s)-1-cell_shift.get(s)):id-cell_shift.get(s); + return id; + } }; #endif /* CELLDECOMPOSER_GPU_KER_HPP_ */ diff --git a/src/NN/CellList/cuda/CellList_gpu.hpp b/src/NN/CellList/cuda/CellList_gpu.hpp index c7623583..5ee0e2b0 100644 --- a/src/NN/CellList/cuda/CellList_gpu.hpp +++ b/src/NN/CellList/cuda/CellList_gpu.hpp @@ -44,7 +44,10 @@ struct CellList_gpu_ker_selector openfpm::array<ids_type,dim,cnt_type> & div_c, openfpm::array<ids_type,dim,cnt_type> & off, const 
transform & t, - unsigned int g_m) + unsigned int g_m, + const SpaceBox<dim,T>& box_unit, + const grid_sm<dim,void>& gr_cell, + const Point<dim,long int>& cell_shift) { return CellList_gpu_ker<dim,T,cnt_type,ids_type,transform,is_sparse>(starts.toKernel(), sorted_to_not_sorted.toKernel(), @@ -54,7 +57,10 @@ struct CellList_gpu_ker_selector div_c, off, t, - g_m); + g_m, + box_unit, + gr_cell, + cell_shift); } }; @@ -73,10 +79,14 @@ struct CellList_gpu_ker_selector<dim,T,cnt_type,ids_type,Memory,transform,vector vector_cnt_type & dprt, openfpm::vector<aggregate<int>,Memory,memory_traits_inte> & nnc_rad, openfpm::array<T,dim,cnt_type> & spacing_c, - openfpm::array<ids_type,dim,cnt_type> & div_c, - openfpm::array<ids_type,dim,cnt_type> & off, - const transform & t, - unsigned int g_m) + openfpm::array<ids_type,dim,cnt_type> & div_c, + openfpm::array<ids_type,dim,cnt_type> & off, + const transform & t, + unsigned int g_m, + const SpaceBox<dim,T>& box_unit, + const grid_sm<dim,void>& gr_cell, + const Point<dim,long int>& cell_shift) + { return CellList_gpu_ker<dim,T,cnt_type,ids_type,transform,true>(cell_nn.toKernel(), cell_nn_list.toKernel(), @@ -86,7 +96,11 @@ struct CellList_gpu_ker_selector<dim,T,cnt_type,ids_type,Memory,transform,vector spacing_c, div_c, off, - t,g_m); + t, + g_m, + box_unit, + gr_cell, + cell_shift); } }; @@ -247,13 +261,11 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform> spacing_c, off, this->getTransform(), - pl.capacity(), pl.size(), - part_ids.capacity(), start, - static_cast<T *>(pl.template getDeviceBuffer<0>()), - static_cast<cnt_type *>(starts.template getDeviceBuffer<0>()), - static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>())); + pl.toKernel(), + starts.toKernel(), + part_ids.toKernel()); // now we construct the cells @@ -292,6 +304,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform> CUDA_LAUNCH((reorder_parts<decltype(pl_prp.toKernel()), decltype(pl.toKernel()), decltype(sorted_to_not_sorted.toKernel()), + decltype(cells.toKernel()), cnt_type,shift_ph<0,cnt_type>>),ite,sorted_to_not_sorted.size(), pl_prp.toKernel(), pl_prp_out.toKernel(), @@ -299,7 +312,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform> pl_out.toKernel(), sorted_to_not_sorted.toKernel(), non_sorted_to_sorted.toKernel(), - static_cast<cnt_type *>(cells.template getDeviceBuffer<0>())); + cells.toKernel()); if (opt == cl_construct_opt::Full) { @@ -377,13 +390,11 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform> spacing_c, off, this->getTransform(), - pl.capacity(), stop, - part_ids.capacity(), start, - static_cast<T *>(pl.template getDeviceBuffer<0>()), - static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()), - static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>())); + pl.toKernel(), + cl_n.toKernel(), + part_ids.toKernel()); // now we scan starts.resize(cl_n.size()); @@ -399,7 +410,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform> CUDA_LAUNCH((fill_cells<dim,cnt_type,ids_type,shift_ph<0,cnt_type>>),itgg,0, part_ids.size(), - static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()) ); + cells.toKernel()) ); // sort @@ -408,14 +419,13 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform> #else CUDA_LAUNCH((fill_cells<dim,cnt_type,ids_type,shift_ph<0,cnt_type>>),itgg,0, - div_c, - off, - part_ids.size(), - part_ids.capacity(), - start, - static_cast<cnt_type *>(starts.template getDeviceBuffer<0>()), - static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()), - 
static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()) ); + div_c, + off, + part_ids.size(), + start, + starts.toKernel(), + part_ids.toKernel(), + cells.toKernel() ); #endif @@ -431,6 +441,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform> CUDA_LAUNCH((reorder_parts<decltype(pl_prp.toKernel()), decltype(pl.toKernel()), decltype(sorted_to_not_sorted.toKernel()), + decltype(cells.toKernel()), cnt_type,shift_ph<0,cnt_type>>),ite,sorted_to_not_sorted.size(), pl_prp.toKernel(), pl_prp_out.toKernel(), @@ -438,7 +449,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform> pl_out.toKernel(), sorted_to_not_sorted.toKernel(), non_sorted_to_sorted.toKernel(), - static_cast<cnt_type *>(cells.template getDeviceBuffer<0>())); + cells.toKernel()); } else { @@ -446,6 +457,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform> CUDA_LAUNCH((reorder_parts_wprp<decltype(pl_prp.toKernel()), decltype(pl.toKernel()), decltype(sorted_to_not_sorted.toKernel()), + decltype(cells.toKernel()), cnt_type,shift_ph<0,cnt_type>,prp...>),ite,sorted_to_not_sorted.size(), pl_prp.toKernel(), pl_prp_out.toKernel(), @@ -453,7 +465,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform> pl_out.toKernel(), sorted_to_not_sorted.toKernel(), non_sorted_to_sorted.toKernel(), - static_cast<cnt_type *>(cells.template getDeviceBuffer<0>())); + cells.toKernel()); } if (opt == cl_construct_opt::Full) @@ -648,14 +660,17 @@ public: cells_nn, cells_nn_list, cl_sparse, - sorted_to_not_sorted, - sorted_domain_particles_ids, - nnc_rad, - spacing_c, - div_c, - off, - this->getTransform(), - g_m); + sorted_to_not_sorted, + sorted_domain_particles_ids, + nnc_rad, + spacing_c, + div_c, + off, + this->getTransform(), + g_m, + this->box_unit, + this->gr_cell, + this->cell_shift); } /*! 
\brief Clear the structure diff --git a/src/NN/CellList/cuda/CellList_gpu_ker.cuh b/src/NN/CellList/cuda/CellList_gpu_ker.cuh index 004716a7..becb50c5 100644 --- a/src/NN/CellList/cuda/CellList_gpu_ker.cuh +++ b/src/NN/CellList/cuda/CellList_gpu_ker.cuh @@ -449,11 +449,14 @@ public: openfpm::vector_gpu_ker<aggregate<cnt_type>,memory_traits_inte> dprt, openfpm::vector_gpu_ker<aggregate<int>,memory_traits_inte> rad_cells, openfpm::array<T,dim,cnt_type> & spacing_c, - openfpm::array<ids_type,dim,cnt_type> & div_c, - openfpm::array<ids_type,dim,cnt_type> & off, - const transform & t, - unsigned int g_m) - :CellDecomposer_gpu_ker<dim,T,cnt_type,ids_type,transform>(spacing_c,div_c,off,t), + openfpm::array<ids_type,dim,cnt_type> & div_c, + openfpm::array<ids_type,dim,cnt_type> & off, + const transform & t, + unsigned int g_m, + SpaceBox<dim,T> box_unit, + grid_sm<dim,void> gr_cell, + Point<dim,long int> cell_shift) + :CellDecomposer_gpu_ker<dim,T,cnt_type,ids_type,transform>(spacing_c,div_c,off,t,box_unit,gr_cell,cell_shift), starts(starts),srt(srt),dprt(dprt),rad_cells(rad_cells),g_m(g_m) { } @@ -616,12 +619,16 @@ public: openfpm::vector_gpu_ker<aggregate<cnt_type>,memory_traits_inte> srt, openfpm::vector_gpu_ker<aggregate<cnt_type>,memory_traits_inte> dprt, openfpm::array<T,dim,cnt_type> & spacing_c, - openfpm::array<ids_type,dim,cnt_type> & div_c, - openfpm::array<ids_type,dim,cnt_type> & off, - const transform & t, - unsigned int g_m) - :CellDecomposer_gpu_ker<dim,T,cnt_type,ids_type,transform>(spacing_c,div_c,off,t),cell_nn(cell_nn),cell_nn_list(cell_nn_list),srt(srt),dprt(dprt), - cl_sparse(cl_sparse),g_m(g_m) + openfpm::array<ids_type,dim,cnt_type> & div_c, + openfpm::array<ids_type,dim,cnt_type> & off, + const transform & t, + unsigned int g_m, + SpaceBox<dim,T> box_unit, + grid_sm<dim,void> gr_cell, + Point<dim,long int> cell_shift) + + :CellDecomposer_gpu_ker<dim,T,cnt_type,ids_type,transform>(spacing_c,div_c,off,t,box_unit,gr_cell,cell_shift), + cell_nn(cell_nn),cell_nn_list(cell_nn_list),srt(srt),dprt(dprt),cl_sparse(cl_sparse),g_m(g_m) { } diff --git a/src/NN/CellList/cuda/Cuda_cell_list_util_func.hpp b/src/NN/CellList/cuda/Cuda_cell_list_util_func.hpp index 0d7c86a9..e50335e7 100644 --- a/src/NN/CellList/cuda/Cuda_cell_list_util_func.hpp +++ b/src/NN/CellList/cuda/Cuda_cell_list_util_func.hpp @@ -258,18 +258,17 @@ __device__ __host__ cnt_type encode_phase_id(cnt_type ph_id,cnt_type pid) #ifdef __NVCC__ template<bool is_sparse,unsigned int dim, typename pos_type, - typename cnt_type, typename ids_type, typename transform> + typename cnt_type, typename ids_type, typename transform, + typename vector_pos_type, typename vector_cnt_type, typename vector_pids_type> __global__ void subindex(openfpm::array<ids_type,dim,cnt_type> div, openfpm::array<pos_type,dim,cnt_type> spacing, openfpm::array<ids_type,dim,cnt_type> off, transform t, - int n_cap, int n_part, - int n_cap2, cnt_type start, - pos_type * p_pos, - cnt_type *counts, - cnt_type * p_ids) + vector_pos_type p_pos, + vector_cnt_type counts, + vector_pids_type p_ids) { cnt_type i, cid, ins; ids_type e[dim+1]; @@ -281,23 +280,23 @@ __global__ void subindex(openfpm::array<ids_type,dim,cnt_type> div, pos_type p[dim]; for (size_t k = 0 ; k < dim ; k++) - {p[k] = p_pos[i+k*n_cap];} + {p[k] = p_pos.template get<0>(i)[k];} cid = cid_<dim,cnt_type,ids_type,transform>::get_cid(div,spacing,off,t,p,e); if (is_sparse == false) { - e[dim] = atomicAdd(counts + cid, 1); + e[dim] = atomicAdd(&counts.template get<0>(cid), 1); - p_ids[ins] = cid; - 
{p_ids[ins+1*(n_cap2)] = e[dim];} + p_ids.template get<0>(ins)[0] = cid; + {p_ids.template get<0>(ins)[1] = e[dim];} } else { - p_ids[ins] = cid; - {p_ids[ins+1*(n_cap2)] = e[dim];} + p_ids.template get<0>(ins)[0] = cid; + {p_ids.template get<0>(ins)[1] = e[dim];} - counts[ins] = cid; + counts.template get<0>(ins) = cid; } } @@ -353,43 +352,42 @@ __global__ void subindex_without_count(openfpm::array<ids_type,dim,cnt_type> div #ifdef MAKE_CELLLIST_DETERMINISTIC -template<unsigned int dim, typename cnt_type, typename ids_type, typename ph> +template<unsigned int dim, typename cnt_type, typename ids_type, typename ph, typename vector_cells_type> __global__ void fill_cells(cnt_type phase_id , cnt_type n, - cnt_type *cells) + vector_cells_type cells) { cnt_type i; i = threadIdx.x + blockIdx.x * blockDim.x; if (i >= n) return; - cells[i] = encode_phase_id<cnt_type,ph>(phase_id,i); + cells.template get<0>(i) = encode_phase_id<cnt_type,ph>(phase_id,i); } #else -template<unsigned int dim, typename cnt_type, typename ids_type, typename ph> +template<unsigned int dim, typename cnt_type, typename ids_type, typename ph,typename vector_starts_type, typename vector_pids_type, typename vector_cells_type> __global__ void fill_cells(cnt_type phase_id , openfpm::array<ids_type,dim,cnt_type> div_c, openfpm::array<ids_type,dim,cnt_type> off, cnt_type n, - cnt_type n_cap, cnt_type start_p, - const cnt_type *starts, - const cnt_type * p_ids, - cnt_type *cells) + vector_starts_type starts, + vector_pids_type p_ids, + vector_cells_type cells) { cnt_type i, cid, id, start; i = threadIdx.x + blockIdx.x * blockDim.x; if (i >= n) return; - cid = p_ids[i]; + cid = p_ids.template get<0>(i)[0]; - start = starts[cid]; - id = start + p_ids[1*n_cap+i]; + start = starts.template get<0>(cid); + id = start + p_ids.template get<0>(i)[1]; - cells[id] = encode_phase_id<cnt_type,ph>(phase_id,i + start_p); + cells.template get<0>(id) = encode_phase_id<cnt_type,ph>(phase_id,i + start_p); } #endif @@ -424,7 +422,7 @@ __device__ inline void reorder_wprp(const vector_prp & input, output.template set<prp ...>(dst_id,input,src_id); } -template <typename vector_prp, typename vector_pos, typename vector_ns, typename cnt_type, typename sh> +template <typename vector_prp, typename vector_pos, typename vector_ns, typename vector_cells_type, typename cnt_type, typename sh> __global__ void reorder_parts(int n, const vector_prp input, vector_prp output, @@ -432,12 +430,12 @@ __global__ void reorder_parts(int n, vector_pos output_pos, vector_ns sorted_non_sorted, vector_ns non_sorted_to_sorted, - const cnt_type * cells) + const vector_cells_type cells) { cnt_type i = threadIdx.x + blockIdx.x * blockDim.x; if (i >= n) return; - cnt_type code = cells[i]; + cnt_type code = cells.template get<0>(i); reorder(input, output, code,i); reorder(input_pos,output_pos,code,i); @@ -446,7 +444,7 @@ __global__ void reorder_parts(int n, non_sorted_to_sorted.template get<0>(code) = i; } -template <typename vector_prp, typename vector_pos, typename vector_ns, typename cnt_type, typename sh, unsigned int ... prp> +template <typename vector_prp, typename vector_pos, typename vector_ns, typename vector_cells_type, typename cnt_type, typename sh, unsigned int ... 
prp> __global__ void reorder_parts_wprp(int n, const vector_prp input, vector_prp output, @@ -454,12 +452,12 @@ __global__ void reorder_parts_wprp(int n, vector_pos output_pos, vector_ns sorted_non_sorted, vector_ns non_sorted_to_sorted, - const cnt_type * cells) + const vector_cells_type cells) { cnt_type i = threadIdx.x + blockIdx.x * blockDim.x; if (i >= n) return; - cnt_type code = cells[i]; + cnt_type code = cells.template get<0>(i); reorder_wprp<vector_prp,cnt_type,prp...>(input, output, code,i); reorder(input_pos,output_pos,code,i); diff --git a/src/Space/Shape/Point.hpp b/src/Space/Shape/Point.hpp index 91f62252..6e49d0e9 100644 --- a/src/Space/Shape/Point.hpp +++ b/src/Space/Shape/Point.hpp @@ -416,7 +416,7 @@ template<unsigned int dim ,typename T> class Point * \return the reference * */ - T & value(size_t i) + __device__ __host__ T & value(size_t i) { return get(i); } diff --git a/src/SparseGridGpu/performance/performancePlots.cpp b/src/SparseGridGpu/performance/performancePlots.cpp index 01a70924..3cd3b0e4 100644 --- a/src/SparseGridGpu/performance/performancePlots.cpp +++ b/src/SparseGridGpu/performance/performancePlots.cpp @@ -45,6 +45,8 @@ void write_test_report(report_sparse_grid_tests &report_sparsegrid_funcs, std::s plotGetSingle2D(report_sparsegrid_funcs, testSet, plotCounter); plotGetNeighbourhood2D(report_sparsegrid_funcs, testSet, plotCounter); + + std::string file_xml_results(test_dir); file_xml_results += std::string("/") + std::string(perfResultsXmlFile); @@ -310,7 +312,7 @@ void plotDense3D(report_sparse_grid_tests &report_sparsegrid_funcs, std::set<std base + ".gridScaling(#).GFlops.mean"); report_sparsegrid_funcs.graphs.add("graphs.graph(" + std::to_string(plotCounter) + ").x.data(0).source", base + ".gridScaling(#).gridSize.x"); - report_sparsegrid_funcs.graphs.add("graphs.graph(" + std::to_string(plotCounter) + ").options.log_x", true); + report_sparsegrid_funcs.graphs.add("graphs.graph(" + std::to_string(plotCounter) + ").options.log_x", true); int bes = static_cast<int>( report_sparsegrid_funcs.graphs.template get<double>( base + ".gridScaling(0).blockSize")); report_sparsegrid_funcs.graphs.add("graphs.graph(" + std::to_string(plotCounter) + ").y.data(0).title", @@ -330,7 +332,7 @@ void plotDense3D(report_sparse_grid_tests &report_sparsegrid_funcs, std::set<std base + ".blockScaling(#).GFlops.mean"); report_sparsegrid_funcs.graphs.add("graphs.graph(" + std::to_string(plotCounter) + ").x.data(0).source", base + ".blockScaling(#).blockSize"); - report_sparsegrid_funcs.graphs.add("graphs.graph(" + std::to_string(plotCounter) + ").options.log_x",true); + report_sparsegrid_funcs.graphs.add("graphs.graph(" + std::to_string(plotCounter) + ").options.log_x",true); int ges = static_cast<int>( report_sparsegrid_funcs.graphs.template get<double>( base + ".blockScaling(0).gridSize.x")); report_sparsegrid_funcs.graphs.add("graphs.graph(" + std::to_string(plotCounter) + ").y.data(0).title", diff --git a/src/Vector/cuda/map_vector_cuda_ker.cuh b/src/Vector/cuda/map_vector_cuda_ker.cuh index ea202a7f..fef74515 100644 --- a/src/Vector/cuda/map_vector_cuda_ker.cuh +++ b/src/Vector/cuda/map_vector_cuda_ker.cuh @@ -172,6 +172,11 @@ namespace openfpm return v_size; } + __host__ __device__ size_t size_local() const + { + return size(); + } + /*! \brief return the maximum capacity of the vector before reallocation * * \return the capacity of the vector @@ -204,6 +209,38 @@ namespace openfpm return base.template get<p>(key); } + /*! 
\brief Get an element of the vector + * + * Get an element of the vector + * + * \tparam p Property to get + * \param id Element to get + * + * \return the element value requested + * + */ + template <unsigned int p> + __device__ __host__ inline auto getProp(unsigned int id) const -> decltype(base.template get<p>(grid_key_dx<1>(0))) + { + return this->get<p>(id); + } + + + /*! \brief Get an element of the vector + * + * Get an element of the vector + * + * \tparam p Property to get + * \param id Element to get + * + * \return the element value requested + * + */ + template <unsigned int p, typename key_type> + __device__ __host__ inline auto getProp(key_type id) const -> decltype(base.template get<p>(grid_key_dx<1>(0))) + { + return this->get<p>(id.getKey()); + } /*! \brief Get an element of the vector * @@ -488,7 +525,22 @@ namespace openfpm return base.getGPUIterator(start,stop,n_thr); } - + /*! \brief Get a domain iterator for the GPU + * + * + */ + ite_gpu<1> getDomainIteratorGPU(size_t n_thr = default_kernel_wg_threads_) const + { + return getGPUIterator(n_thr); + } + + //Stub for some expression + void init() const {} + + __host__ __device__ auto value(unsigned int p) -> decltype(base.template get<0>(grid_key_dx<1>(0))) + { + return get<0>(p); + } /*! \brief Get an iterator for the GPU * * @@ -501,6 +553,7 @@ namespace openfpm return base.getGPUIterator(start,stop_,n_thr); } + /*! \brief operator= this operator absorb the pointers, consider that this object wrap device pointers * * \param object to copy @@ -514,6 +567,16 @@ namespace openfpm return *this; } + __device__ __host__ vector_gpu_ker<T,layout_base> & getVector() + { + return *this; + } + + __device__ __host__ const vector_gpu_ker<T,layout_base> & getVector() const + { + return *this; + } + /*! \brief Return the base * * \return the base @@ -596,6 +659,11 @@ namespace openfpm return vref.size(); } + __host__ __device__ size_t size_local() const + { + return size(); + } + __device__ __host__ unsigned int capacity() const { return vref.capacity; @@ -692,6 +760,16 @@ namespace openfpm return vref.getGPUItertatorTo(stop,n_thr); } + vector_gpu_ker<T,layout_base> & getVector() + { + return *this; + } + + const vector_gpu_ker<T,layout_base> & getVector() const + { + return *this; + } + __host__ vector_gpu_ker_ref<T,layout_base> & operator=(const vector_gpu_ker<T,layout_base> & v) { vref.operator=(v); diff --git a/src/Vector/cuda/map_vector_std_cuda.hpp b/src/Vector/cuda/map_vector_std_cuda.hpp index 319add7b..d384a51f 100644 --- a/src/Vector/cuda/map_vector_std_cuda.hpp +++ b/src/Vector/cuda/map_vector_std_cuda.hpp @@ -277,8 +277,11 @@ public: * */ vector(const std::initializer_list<T> & v) - :base(v) + // :base(v) { + // causes error if simply passed to base + for (T t: v) + add(t); } //! Constructor from another vector @@ -288,6 +291,16 @@ public: base.swap(v.base); } + //! Constructor from iterators + template< class InputIt > + vector(InputIt first, InputIt last) + { + while(first != last) { + add(*first); + ++first; + } + } + //! destructor ~vector() noexcept { diff --git a/src/Vector/cuda/map_vector_std_cuda_ker.cuh b/src/Vector/cuda/map_vector_std_cuda_ker.cuh index cb1b054f..ea583826 100644 --- a/src/Vector/cuda/map_vector_std_cuda_ker.cuh +++ b/src/Vector/cuda/map_vector_std_cuda_ker.cuh @@ -126,6 +126,27 @@ public: return base.template get<0>(id); } + /*! 
\brief Return the pointer to the chunk of memory + * + * \return the pointer to the chunk of memory + * + */ + __device__ __host__ void * getPointer() + { + return &base.template get<0>(0); + } + + /*! \brief Return the pointer to the chunk of memory + * + * \return the pointer to the chunk of memory + * + */ + __device__ __host__ const void * getPointer() const + { + return &base.template get<0>(0); + } + + vector_custd_ker() {} diff --git a/src/Vector/map_vector.hpp b/src/Vector/map_vector.hpp index 953fdeab..078ffd17 100644 --- a/src/Vector/map_vector.hpp +++ b/src/Vector/map_vector.hpp @@ -336,7 +336,7 @@ namespace openfpm * * \return the size * - */ + */ //remove host device size_t size_local() const { return v_size; @@ -1359,7 +1359,21 @@ namespace openfpm return base.get_o(key); } - + /*! \brief Get an element of the vector + * + * Get an element of the vector + * + * \tparam p Property to get + * \param id Element to get + * + * \return the element value requested + * + */ //remove device host + template <unsigned int p> + inline auto getProp(const unsigned int & id) -> decltype(base.template get<p>(grid_key_dx<1>(0))) + { //uncomment this + return this->template get<p>(id); + } /*! \brief Get an element of the vector * * Get an element of the vector @@ -1369,12 +1383,12 @@ namespace openfpm * * \return the element value requested * - */ - - template <unsigned int p,typename KeyType> - inline auto getProp(const KeyType & id) -> decltype(base.template get<p>(grid_key_dx<1>(0))) + *///remove host device + template <unsigned int p,typename KeyType> + inline auto getProp(const KeyType & id) -> decltype(base.template get<p>(grid_key_dx<1>(0))) { - return this->template get<p>(id.getKey()); + //uncomment this + return this->template get<p>(id.getKey()); } /*! \brief Get an element of the vector @@ -1386,10 +1400,10 @@ namespace openfpm * * \return the element value requested * - */ + */ //remove device host template <unsigned int p, typename keyType> - inline auto getProp(const keyType & id) const -> decltype(base.template get<p>(grid_key_dx<1>(0))) - { + inline auto getProp(const keyType & id) const -> decltype(base.template get<p>(grid_key_dx<1>(0))) + { //uncomment this return this->template get<p>(id.getKey()); } @@ -1930,6 +1944,7 @@ namespace openfpm return base.getGPUIterator(start,stop_,n_thr); } + #endif /*! \brief Get the vector elements iterator @@ -1987,6 +2002,14 @@ namespace openfpm return base.getGPUIterator(start,stop,n_thr); } + /*! \brief Get a domain iterator for the GPU + * + * + */ + ite_gpu<1> getDomainIteratorGPU(size_t n_thr = default_kernel_wg_threads_) const + { + return getGPUIterator(n_thr); + } #endif /*! \brief Return the size of the message needed to pack this object diff --git a/src/Vector/map_vector_std.hpp b/src/Vector/map_vector_std.hpp index 2cf717e2..ae0c08f1 100644 --- a/src/Vector/map_vector_std.hpp +++ b/src/Vector/map_vector_std.hpp @@ -644,6 +644,16 @@ public: base.swap(v.base); } + //! Constructor from iterators + template< class InputIt > + vector(InputIt first, InputIt last) + { + while(first != last) { + add(*first); + ++first; + } + } + //! 
destructor ~vector() noexcept { diff --git a/src/Vector/vector_subset.hpp b/src/Vector/vector_subset.hpp new file mode 100644 index 00000000..d6d78b69 --- /dev/null +++ b/src/Vector/vector_subset.hpp @@ -0,0 +1,67 @@ +#ifndef VECTOR_SUBSET_HPP +#define VECTOR_SUBSET_HPP + +namespace openfpm +{ + +template<unsigned int dim, + typename prop, + template<typename> class layout_base = memory_traits_inte> + class vector_subset_ker + { + mutable openfpm::vector_gpu_ker<typename apply_transform<layout_base,prop>::type,layout_base> v_all; + + mutable openfpm::vector_gpu_ker<aggregate<int>,layout_base> indexes; + + public: + + vector_subset_ker(openfpm::vector_gpu_ker<typename apply_transform<layout_base,prop>::type,layout_base> & v_all, + openfpm::vector_gpu_ker<aggregate<int>,layout_base> & indexes) + :v_all(v_all),indexes(indexes) + {} + + // get the + + template <unsigned int p> + __device__ __host__ inline auto get(size_t id) const -> decltype(v_all.template get<p>(0)) + { + return v_all.template get<p>(indexes.template get<0>(id)); + } + } + +public: + + template<typename T, typename Memory, template<typename> class layout_base, typename grow_p, unsigned int impl> + class vector_sub + { + vector<T,Memory,layout_base,grow_p,impl> & v_all; + + vector<aggregate<int>,Memory,layout_base,grow_p,impl> & indexes; + + public: + + vector(vector<T,Memory,layout_base,grow_p,impl> & v_all, + vector<aggregate<int>,Memory,layout_base,grow_p,impl> & indexes) + :v_all(v_all),indexes(indexes) + {} + + + // To kernel + template<unsigned int ... prp> vector_dist_ker<dim,St,prop,layout_base> toKernel() + { + vector_subset_ker<dim,St,prop,layout_base> v(v_all.toKernel(), indexes.toKernel()); + + return v; + } + + template <unsigned int p> + inline auto get(size_t id) const -> decltype(v_all.template get<p>(0)) + { + return v_all.template get<p>(indexes.template get<0>(id)); + } + + } + +}; + +#endif \ No newline at end of file diff --git a/src/util/mathutil.hpp b/src/util/mathutil.hpp index 1b05785a..e813f30b 100644 --- a/src/util/mathutil.hpp +++ b/src/util/mathutil.hpp @@ -19,7 +19,7 @@ namespace openfpm * */ template <typename T> - int sgn(T val) + __host__ __device__ int sgn(T val) { return (T(0) < val) - (val < T(0)); } @@ -35,7 +35,8 @@ namespace openfpm * \return the factorial * */ - static inline size_t factorial(size_t f) + __host__ __device__ static inline + size_t factorial(size_t f) { size_t fc = 1; @@ -60,7 +61,8 @@ namespace openfpm * \param k * */ - static inline size_t C(size_t n, size_t k) + __host__ __device__ static inline + size_t C(size_t n, size_t k) { return factorial(n)/(factorial(k)*factorial(n-k)); } @@ -75,7 +77,8 @@ namespace openfpm * * */ - static inline size_t round_big_2(size_t n) + __host__ __device__ static inline + size_t round_big_2(size_t n) { n--; n |= n >> 1; // Divide by 2^k for consecutive doublings of k up to 32, @@ -102,14 +105,15 @@ namespace openfpm */ template<class T> - inline constexpr size_t pow(const T base, unsigned const exponent) + __host__ __device__ inline constexpr + size_t pow(const T base, unsigned const exponent) { // (parentheses not required in next line) return (exponent == 0) ? 
1 : (base * pow(base, exponent-1)); } template<class T> - double intpowlog(const T x, unsigned const e) + __host__ __device__ double intpowlog(const T x, unsigned const e) { if (e == 0) return 1.0; if (e % 2 == 0) @@ -135,7 +139,8 @@ namespace openfpm * \param n modulo * */ - static inline long int positive_modulo(long int i, long int n) + __host__ __device__ static inline + long int positive_modulo(long int i, long int n) { return (i % n + n) % n; } @@ -155,7 +160,9 @@ namespace openfpm * \return the bound number * */ - template<typename T> static inline T periodic(const T & pos, const T & p2, const T & p1) + template<typename T> + __host__ __device__ static inline + T periodic(const T & pos, const T & p2, const T & p1) { T pos_tmp; @@ -182,7 +189,9 @@ namespace openfpm * \return the bound number * */ - template<typename T> __device__ __host__ static inline T periodic_l(const T & pos, const T & p2, const T & p1) + template<typename T> + __device__ __host__ static inline + T periodic_l(const T & pos, const T & p2, const T & p1) { T pos_tmp = pos; @@ -207,7 +216,7 @@ namespace openfpm * * */ - inline long int size_t_floor(double x) + __host__ __device__ inline long int size_t_floor(double x) { size_t i = (long int)x; /* truncate */ return i - ( i > x ); /* convert trunc to floor */ @@ -217,7 +226,7 @@ namespace openfpm * * */ - inline long int size_t_floor(float x) + __host__ __device__ inline long int size_t_floor(float x) { size_t i = (long int)x; /* truncate */ return i - ( i > x ); /* convert trunc to floor */ @@ -258,7 +267,7 @@ namespace openfpm * \param value * */ - inline int log2_64 (uint64_t value) + __host__ __device__ inline int log2_64 (uint64_t value) { value |= value >> 1; value |= value >> 2; @@ -276,7 +285,7 @@ namespace openfpm * * */ - inline long int size_t_floor(boost::multiprecision::float128 x) + __host__ __device__ inline long int size_t_floor(boost::multiprecision::float128 x) { size_t i = (long int)x; /* truncate */ return i - ( i > x ); /* convert trunc to floor */ -- GitLab
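
A recurring change in this patch is that the GPU cell-list kernels (subindex, fill_cells, reorder_parts, reorder_parts_wprp) no longer receive raw device pointers plus capacity strides (pl.capacity(), part_ids.capacity(), n_cap, n_cap2); they now take the kernel views returned by toKernel(), which carry the size and the per-property layout themselves. The following is a minimal, host-only sketch of that calling-convention change; pair_view, store_old and store_new are hypothetical stand-ins, not openfpm types.

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Hypothetical stand-in for what pl.toKernel() / part_ids.toKernel() return:
    // the view carries the data pointer, the element count and the capacity
    // (stride), so the kernel no longer needs separate n_cap / n_cap2 arguments.
    struct pair_view
    {
        int * data;          // component c of element i lives at data[c*cap + i]
        std::size_t n;       // number of elements
        std::size_t cap;     // allocated capacity (the stride between components)

        std::size_t size() const { return n; }
        int & get(std::size_t i, std::size_t c) { return data[c * cap + i]; }
    };

    // Old convention (the removed lines): raw pointer plus capacity,
    // the caller has to know the stride.
    void store_old(int * p_ids, std::size_t n_cap, std::size_t i, int cid, int off)
    {
        p_ids[i] = cid;             // component 0
        p_ids[i + n_cap] = off;     // component 1, strided by the capacity
    }

    // New convention (the added lines): the view hides the layout.
    void store_new(pair_view p_ids, std::size_t i, int cid, int off)
    {
        p_ids.get(i, 0) = cid;
        p_ids.get(i, 1) = off;
    }

    int main()
    {
        std::vector<int> buf(2 * 8, 0);      // capacity 8, two components
        pair_view view{buf.data(), 4, 8};

        store_old(buf.data(), 8, 1, 42, 3);
        store_new(view, 2, 7, 1);

        std::printf("%d %d %d %d\n", view.get(1, 0), view.get(1, 1),
                                     view.get(2, 0), view.get(2, 1));   // 42 3 7 1
    }

Both functions write the same layout; the point of the change in the patch is that the layout and capacity travel with the view instead of being threaded through every kernel launch.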
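
The patch also extends CellDecomposer_gpu_ker with box_unit, gr_cell and cell_shift so that getCellGrid() and ConvertToID() can run in device code. The sketch below reproduces that index computation with plain scalars; cell_index and its parameters are illustrative stand-ins for t.transform(x,s), box_unit.getHigh(s), off[s], gr_cell.size(s) and cell_shift.get(s), not the library API.

    #include <cstdio>

    long cell_index(double x, double cell_edge, long padding, long grid_size, long cell_shift)
    {
        double r = x / cell_edge;
        long   i = static_cast<long>(r);   // truncate ...
        i -= (i > r);                      // ... and correct to floor for negative r
        long id = i + padding;             // shift into the padded (ghost) grid
        if (id >= grid_size)               // clamp to the last cell of the decomposition
            id = grid_size - 1;
        return id - cell_shift;            // index relative to the local sub-grid
    }

    int main()
    {
        // 10 interior cells of edge 0.1 plus one padding layer per side -> 12 cells.
        std::printf("%ld\n", cell_index(0.25, 0.1, 1, 12, 0));  // 3
        std::printf("%ld\n", cell_index(9.99, 0.1, 1, 12, 0));  // out of range, clamped to 11
    }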
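
Finally, the functions in src/util/mathutil.hpp (sgn, factorial, round_big_2, periodic, size_t_floor, log2_64, ...) are only re-qualified as __host__ __device__; their bodies are unchanged. size_t_floor in particular avoids a libm call by truncating and then correcting, which is what makes it trivially usable in device code. A standalone sketch of that idea, with floor_trick as a hypothetical name rather than the library function:

    #include <cstdio>

    // Cast truncates toward zero, so for negative non-integers the result is one
    // too high; subtracting the comparison (i > x) corrects it to a true floor.
    long int floor_trick(double x)
    {
        long int i = static_cast<long int>(x);  // truncate toward zero
        return i - (i > x);                     // fix up negative non-integers
    }

    int main()
    {
        std::printf("%ld %ld %ld\n", floor_trick(2.7), floor_trick(-2.7), floor_trick(-3.0));
        // prints: 2 -3 -3
    }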