From 635c72dee226ba2acf97cd2f3bbfeb1e875e05e7 Mon Sep 17 00:00:00 2001 From: Pietro Incardona <incardon@mpi-cbg.de> Date: Sat, 2 Nov 2019 20:03:32 +0100 Subject: [PATCH] Adding funcionalities + speed up some others --- example/Vector/7_SPH_dlb_gpu/main.cu | 2 +- example/Vector/7_SPH_dlb_gpu_opt/main.cu | 2 +- openfpm_data | 2 +- src/Vector/cuda/vector_dist_cuda_funcs.cuh | 104 ++++++++++-------- src/Vector/cuda/vector_dist_gpu_unit_tests.cu | 80 +++++--------- src/Vector/vector_dist.hpp | 36 ++++-- src/Vector/vector_dist_kernel.hpp | 4 + 7 files changed, 116 insertions(+), 114 deletions(-) diff --git a/example/Vector/7_SPH_dlb_gpu/main.cu b/example/Vector/7_SPH_dlb_gpu/main.cu index fbf069a98..08d91fbe2 100644 --- a/example/Vector/7_SPH_dlb_gpu/main.cu +++ b/example/Vector/7_SPH_dlb_gpu/main.cu @@ -91,7 +91,7 @@ const real_number MassBound = 0.000614125; #ifdef TEST_RUN const real_number t_end = 0.001; #else -const real_number t_end = 0.003; +const real_number t_end = 1.5; #endif // Gravity acceleration diff --git a/example/Vector/7_SPH_dlb_gpu_opt/main.cu b/example/Vector/7_SPH_dlb_gpu_opt/main.cu index fa6c0ea2a..74b6f3c32 100644 --- a/example/Vector/7_SPH_dlb_gpu_opt/main.cu +++ b/example/Vector/7_SPH_dlb_gpu_opt/main.cu @@ -99,7 +99,7 @@ const real_number MassBound = 0.0000767656; #ifdef TEST_RUN const real_number t_end = 0.001; #else -const real_number t_end = 1.5; +const real_number t_end = 0.003; #endif // Gravity acceleration diff --git a/openfpm_data b/openfpm_data index f8ea1b875..a83fa7208 160000 --- a/openfpm_data +++ b/openfpm_data @@ -1 +1 @@ -Subproject commit f8ea1b875c24392c1a6991f4faa000e2981e96d1 +Subproject commit a83fa72085b240324d58ee5583d53ca241a608a8 diff --git a/src/Vector/cuda/vector_dist_cuda_funcs.cuh b/src/Vector/cuda/vector_dist_cuda_funcs.cuh index e986746e5..b42cb9351 100644 --- a/src/Vector/cuda/vector_dist_cuda_funcs.cuh +++ b/src/Vector/cuda/vector_dist_cuda_funcs.cuh @@ -298,8 +298,8 @@ __global__ void create_index(vector_type vd) vd.template get<0>(i) = i; } -template<unsigned int dim, typename vector_pos_type, typename vector_prp_type, typename scan_type> -__global__ void copy_new_to_old(vector_pos_type vd_pos_dst, vector_prp_type vd_prp_dst, vector_pos_type vd_pos_src, vector_prp_type vd_prp_src, scan_type idx) +template<unsigned int dim, typename vector_pos_type, typename vector_prp_type, typename ids_type> +__global__ void copy_new_to_old(vector_pos_type vd_pos_dst, vector_prp_type vd_prp_dst, vector_pos_type vd_pos_src, vector_prp_type vd_prp_src, ids_type idx) { int i = threadIdx.x + blockIdx.x * blockDim.x; @@ -311,6 +311,34 @@ __global__ void copy_new_to_old(vector_pos_type vd_pos_dst, vector_prp_type vd_p vd_prp_dst.set(i,vd_prp_src,idx.template get<0>(i)); } +template<unsigned int dim, unsigned int prp, typename vector_pos_type, typename vector_prp_type, typename scan_type> +__global__ void copy_new_to_old_by_scan(vector_pos_type vd_pos_dst, vector_prp_type vd_prp_dst, vector_pos_type vd_pos_src, vector_prp_type vd_prp_src, scan_type scan) +{ + int i = threadIdx.x + blockIdx.x * blockDim.x; + + if (i >= scan.size()) return; + + auto sc = scan.template get<0>(i); + + if (vd_prp_src.template get<prp>(i) == 0) return; + + for (unsigned int k = 0 ; k < dim ; k++) + {vd_pos_dst.template get<0>(sc)[k] = vd_pos_src.template get<0>(i)[k];} + + vd_prp_dst.set(sc,vd_prp_src,i); +} + + +template<unsigned int prp,typename vector_type> +__global__ void flip_one_to_zero(vector_type vd) +{ + int i = threadIdx.x + blockIdx.x * blockDim.x; + + if (i >= vd.size_local()) return; + + vd.template getProp<prp>(i) = (vd.template getProp<prp>(i) == 0); +} + /*! \brief Remove the particles marked on the properties prp (particles marked has has property set to 1, the others to 0) * * \warning the function is destructive on prp, it mean that after destruction the prp of the particles can contain garbage @@ -336,65 +364,49 @@ void remove_marked(vector_type & vd) typedef typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type remove_type; - // first we do a scan of the property - openfpm::vector_gpu<aggregate<unsigned int>> idx; + // because we mark the one to remove we flip the one to zero and the zeros to one - idx.setMemory(mem_tmp); - idx.resize(vd.size_local()); + auto ite = vd.getDomainIteratorGPU(); - auto ite = idx.getGPUIterator(); + CUDA_LAUNCH((flip_one_to_zero<prp>),ite,vd.toKernel()); - CUDA_LAUNCH(create_index,ite,idx.toKernel()); + // first we scan - // sort particles, so the particles to remove stay at the end - mergesort((remove_type *)vd.getPropVector().template getDeviceBuffer<prp>(),(unsigned int *)idx.template getDeviceBuffer<0>(), idx.size(), mgpu::template less_t<remove_type>(), vd.getVC().getmgpuContext()); + openfpm::vector_gpu<aggregate<remove_type>> idx; - openfpm::vector_gpu<aggregate<int>> mark; - mark.resize(1); + idx.setMemory(mem_tmp); + idx.resize(vd.size_local()); - CudaMemory mem; - mem.allocate(sizeof(int)); - mem.fill(0); + openfpm::scan((remove_type *)vd.getPropVector().template getDeviceBuffer<prp>(),vd.size_local(),(remove_type *)idx.template getDeviceBuffer<0>(),vd.getVC().getmgpuContext()); - // mark point, particle that stay and to remove - CUDA_LAUNCH((find_buffer_offsets_no_prc<prp,decltype(vd.getPropVector().toKernel()),decltype(mark.toKernel())>),ite, - vd.getPropVector().toKernel(),(int *)mem.getDevicePointer(),mark.toKernel(),vd.size_local()); + // Check if we marked something - mem.deviceToHost(); + idx.template deviceToHost<0>(idx.size()-1,idx.size()-1); + vd.template deviceToHostProp<prp>(vd.size_local()-1,vd.size_local()-1); - // we have no particles to remove - if (*(int *)mem.getPointer() != 1) + int n_marked = vd.size_local() - (vd.template getProp<prp>(vd.size_local()-1) + idx.template get<0>(idx.size()-1)); + + if (n_marked == 0) { - if (*(int *)mem.getPointer() >= 2) - { - std::cout << __FILE__ << ":" << __LINE__ << " error: removing marked particle. Carefull particle must be marked with 1 or 0, no other numbers" << std::endl; - } + // No particle has been marked Nothing to do + return; } - // Get the mark point - mark.template deviceToHost<0>(); - - // than create an equivalent buffer prop and pos + ///////////////////////////////// typename std::remove_reference<decltype(vd.getPosVector())>::type vd_pos_new; typename std::remove_reference<decltype(vd.getPropVector())>::type vd_prp_new; // resize them - vd_pos_new.resize(mark.template get<0>(0)); - vd_prp_new.resize(mark.template get<0>(0)); + vd_pos_new.resize(vd.size_local() - n_marked); + vd_prp_new.resize(vd.size_local() - n_marked); auto & vd_pos_old = vd.getPosVector(); auto & vd_prp_old = vd.getPropVector(); - // now we copy from the old vector to the new one - - ite = vd_pos_old.getGPUIterator(); - - CUDA_LAUNCH((copy_new_to_old<vector_type::dims>),ite,vd_pos_new.toKernel(),vd_prp_new.toKernel(),vd_pos_old.toKernel(),vd_prp_old.toKernel(),idx.toKernel()); - - // and we swap + CUDA_LAUNCH((copy_new_to_old_by_scan<vector_type::dims,prp>),ite,vd_pos_new.toKernel(),vd_prp_new.toKernel(),vd_pos_old.toKernel(),vd_prp_old.toKernel(),idx.toKernel()); vd.set_g_m(vd_pos_new.size()); @@ -405,9 +417,11 @@ void remove_marked(vector_type & vd) template<unsigned int prp, typename functor, typename particles_type, typename out_type> __global__ void mark_indexes(particles_type vd, out_type out) { - auto a = GET_PARTICLE(vd); + unsigned int p = threadIdx.x + blockIdx.x * blockDim.x; + + if (p >= vd.size()) {return;} - out.template getProp<0>(a) = functor::check(vd.template getProp<prp>(a)) == true; + out.template get<0>(p) = functor::check(vd.template get<prp>(p)) == true; } template<typename out_type, typename ids_type> @@ -435,20 +449,20 @@ __global__ void fill_indexes(out_type scan, ids_type ids) * \param vd distributed vector * */ -template<typename functor, typename vector_type, typename ids_type> -void get_indexes_sorted(vector_type & vd, ids_type & ids, mgpu::ofp_context_t & context) +template<unsigned int prp, typename functor, typename vector_type, typename ids_type> +void get_indexes_by_type(vector_type & vd, ids_type & ids, mgpu::ofp_context_t & context) { // first we do a scan of the property openfpm::vector_gpu<aggregate<unsigned int>> scan; scan.setMemory(mem_tmp); - scan.resize(vd.size_local_with_ghost()+1); + scan.resize(vd.size()+1); auto ite = scan.getGPUIterator(); - CUDA_LAUNCH(mark_indexes,ite,vd.toKernel(),scan.toKernel()); + CUDA_LAUNCH((mark_indexes<prp,functor>),ite,vd.toKernel(),scan.toKernel()); - openfpm::scan(scan.template getDeviceBuffer<0>(),scan.size(),scan.template getDeviceBuffer<0>(),context); + openfpm::scan((unsigned int *)scan.template getDeviceBuffer<0>(),scan.size(),(unsigned int *)scan.template getDeviceBuffer<0>(),context); // get the number of marked particles scan.template deviceToHost<0>(scan.size()-1,scan.size()-1); diff --git a/src/Vector/cuda/vector_dist_gpu_unit_tests.cu b/src/Vector/cuda/vector_dist_gpu_unit_tests.cu index daa90b634..56994bb9a 100644 --- a/src/Vector/cuda/vector_dist_gpu_unit_tests.cu +++ b/src/Vector/cuda/vector_dist_gpu_unit_tests.cu @@ -1488,6 +1488,14 @@ BOOST_AUTO_TEST_CASE(vector_dist_keep_prop_on_cuda) } } +struct type_is_one +{ + __device__ static bool check(int c) + { + return c == 1; + } +}; + BOOST_AUTO_TEST_CASE(vector_dist_get_index_set) { Box<3,double> domain({0.0,0.0,0.0},{1.0,1.0,1.0}); @@ -1497,6 +1505,8 @@ BOOST_AUTO_TEST_CASE(vector_dist_get_index_set) if (create_vcluster().size() >= 16) {return;} + Vcluster<> & v_cl = create_vcluster(); + vector_dist_gpu<3,double,aggregate<int,double>> vdg(10000,domain,bc,g,DEC_GRAN(128)); auto it = vdg.getDomainIterator(); @@ -1521,70 +1531,30 @@ BOOST_AUTO_TEST_CASE(vector_dist_get_index_set) vdg.hostToDeviceProp<0,1>(); vdg.hostToDevicePos(); -/* bool test = vdg.compareHostAndDevicePos(0.00001,0.00000001); - BOOST_REQUIRE_EQUAL(test,true); - - vdg.getPos(100)[0] = 0.99999999; - - test = vdg.compareHostAndDevicePos(0.00001,0.00000001); - BOOST_REQUIRE_EQUAL(test,false); - - vdg.hostToDevicePos(); - vdg.getPos(100)[0] = 0.99999999; - - test = vdg.compareHostAndDevicePos(0.00001,0.00000001); - BOOST_REQUIRE_EQUAL(test,true); - - ////////////////////////////////////////////////// PROP VECTOR + auto cl = vdg.getCellListGPU(0.1); - test = vdg.compareHostAndDeviceProp<1>(0.00001,0.00000001); - BOOST_REQUIRE_EQUAL(test,true); - - vdg.getProp<1>(103)[0] = 0.99999999; + // than we get a cell-list to force reorder - test = vdg.compareHostAndDeviceProp<1>(0.00001,0.00000001); - BOOST_REQUIRE_EQUAL(test,false); + openfpm::vector_gpu<aggregate<unsigned int>> ids; - vdg.hostToDeviceProp<1>(); - vdg.getProp<1>(103)[0] = 0.99999999; + get_indexes_by_type<0,type_is_one>(vdg.getPropVectorSort(),ids,v_cl.getmgpuContext()); - test = vdg.compareHostAndDeviceProp<1>(0.00001,0.00000001); - BOOST_REQUIRE_EQUAL(test,true); + // test - ////////////////////////////////////////////////// PROP scalar + ids.template deviceToHost<0>(); + auto & vs = vdg.getPropVectorSort(); + vs.template deviceToHost<0>(); - test = vdg.compareHostAndDeviceProp<0>(0.00001,0.00000001); - BOOST_REQUIRE_EQUAL(test,true); - - vdg.getProp<0>(105) = 0.99999999; - - test = vdg.compareHostAndDeviceProp<0>(0.00001,0.00000001); - BOOST_REQUIRE_EQUAL(test,false); - - vdg.hostToDeviceProp<0>(); - vdg.getProp<0>(105) = 0.99999999; - - test = vdg.compareHostAndDeviceProp<0>(0.00001,0.00000001); - BOOST_REQUIRE_EQUAL(test,true); - - - ////////////////////////////////////////////////// PROP scalar - - - test = vdg.compareHostAndDeviceProp<2>(0.00001,0.00000001); - BOOST_REQUIRE_EQUAL(test,true); - - vdg.getProp<2>(108)[1][2] = 0.99999999; - - test = vdg.compareHostAndDeviceProp<2>(0.00001,0.00000001); - BOOST_REQUIRE_EQUAL(test,false); + bool match = true; - vdg.hostToDeviceProp<2>(); - vdg.getProp<2>(108)[1][2] = 0.99999999; + for (int i = 0 ; i < ids.size() ; i++) + { + if (vs.template get<0>(ids.template get<0>(i)) != 1) + {match = false;} + } - test = vdg.compareHostAndDeviceProp<2>(0.00001,0.00000001); - BOOST_REQUIRE_EQUAL(test,true);*/ + BOOST_REQUIRE_EQUAL(match,true); } BOOST_AUTO_TEST_CASE(vector_dist_compare_host_device) diff --git a/src/Vector/vector_dist.hpp b/src/Vector/vector_dist.hpp index 73c6c4f79..d69adf2a8 100644 --- a/src/Vector/vector_dist.hpp +++ b/src/Vector/vector_dist.hpp @@ -74,7 +74,7 @@ constexpr int GCL_NON_SYMMETRIC = 0; constexpr int GCL_SYMMETRIC = 1; constexpr int GCL_HILBERT = 2; -template<bool is_gpu_celllist> +template<bool is_gpu_celllist, unsigned int ... prp> struct gcl_standard_no_symmetric_impl { template<unsigned int dim, typename St, typename CellL, typename Vector, unsigned int impl> @@ -84,18 +84,18 @@ struct gcl_standard_no_symmetric_impl } }; -template<> -struct gcl_standard_no_symmetric_impl<true> +template<unsigned int ... prp> +struct gcl_standard_no_symmetric_impl<true,prp...> { template<unsigned int dim, typename St, typename CellL, typename Vector, unsigned int impl> static inline CellL get(Vector & vd, const St & r_cut, const Ghost<dim,St> & g) { - return vd.template getCellListGPU<CellL>(r_cut); + return vd.template getCellListGPU<CellL,prp...>(r_cut); } }; //! General function t get a cell-list -template<unsigned int dim, typename St, typename CellL, typename Vector, unsigned int impl> +template<unsigned int dim, typename St, typename CellL, typename Vector, unsigned int impl, unsigned int ... prp> struct gcl { /*! \brief Get the Cell list based on the type @@ -109,7 +109,7 @@ struct gcl */ static inline CellL get(Vector & vd, const St & r_cut, const Ghost<dim,St> & g) { - return gcl_standard_no_symmetric_impl<is_gpu_celllist<CellL>::value>::template get<dim,St,CellL,Vector,impl>(vd,r_cut,g); + return gcl_standard_no_symmetric_impl<is_gpu_celllist<CellL>::value,prp ...>::template get<dim,St,CellL,Vector,impl>(vd,r_cut,g); } }; @@ -1209,7 +1209,7 @@ public: * \return the Cell list * */ - template<typename CellType = CellList_gpu<dim,St,CudaMemory,shift_only<dim, St>>> + template<typename CellType = CellList_gpu<dim,St,CudaMemory,shift_only<dim, St>>,unsigned int ... prp> CellType getCellListGPU(St r_cut, bool no_se3 = false) { #ifdef SE_CLASS3 @@ -1243,7 +1243,7 @@ public: * \return the CellList * */ - template<typename CellType = CellList_gpu<dim,St,CudaMemory,shift_only<dim, St>>> + template<typename CellType = CellList_gpu<dim,St,CudaMemory,shift_only<dim, St>>, unsigned int ... prp> CellType getCellListGPU(St r_cut, const Ghost<dim, St> & enlarge, bool no_se3 = false) { #ifdef SE_CLASS3 @@ -1265,7 +1265,7 @@ public: v_prp_out.resize(v_pos.size()); v_pos_out.resize(v_pos.size()); - cell_list.template construct<decltype(v_pos),decltype(v_prp)>(v_pos,v_pos_out,v_prp,v_prp_out,v_cl.getmgpuContext(),g_m); + cell_list.template construct<decltype(v_pos),decltype(v_prp),prp ...>(v_pos,v_pos_out,v_prp,v_prp_out,v_cl.getmgpuContext(),g_m); cell_list.set_ndec(getDecomposition().get_ndec()); cell_list.set_gm(g_m); @@ -1349,7 +1349,8 @@ public: * \param no_se3 avoid se class 3 checking * */ - template<typename CellL> void updateCellList(CellL & cell_list, bool no_se3 = false, cl_construct_opt opt = cl_construct_opt::Full) + template<unsigned int ... prp,typename CellL> + void updateCellList(CellL & cell_list, bool no_se3 = false, cl_construct_opt opt = cl_construct_opt::Full) { #ifdef SE_CLASS3 if (no_se3 == false) @@ -1366,7 +1367,7 @@ public: if (to_reconstruct == false) { - populate_cell_list(v_pos,v_pos_out,v_prp,v_prp_out,cell_list,v_cl.getmgpuContext(false),g_m,CL_NON_SYMMETRIC,opt); + populate_cell_list<dim,St,prop,Memory,layout_base,CellL,prp ...>(v_pos,v_pos_out,v_prp,v_prp_out,cell_list,v_cl.getmgpuContext(false),g_m,CL_NON_SYMMETRIC,opt); cell_list.set_gm(g_m); } @@ -2989,6 +2990,19 @@ public: v_prp.template deviceToHost<prp ...>(); } + /*! \brief Move the memory from the device to host memory + * + * \tparam property to move use POS_PROP for position property + * + * \param start point + * \param stop point (included) + * + */ + template<unsigned int ... prp> void deviceToHostProp(size_t start, size_t stop) + { + v_prp.template deviceToHost<prp ...>(start,stop); + } + /*! \brief Move the memory from the device to host memory * * \tparam property to move use POS_PROP for position property diff --git a/src/Vector/vector_dist_kernel.hpp b/src/Vector/vector_dist_kernel.hpp index 7c99a50da..4e825223c 100644 --- a/src/Vector/vector_dist_kernel.hpp +++ b/src/Vector/vector_dist_kernel.hpp @@ -16,6 +16,10 @@ #define GET_PARTICLE_SORT(p,NN) if (blockDim.x*blockIdx.x + threadIdx.x >= NN.get_g_m()) {return;}\ else{p = NN.getDomainSortIds().template get<0>(blockDim.x*blockIdx.x + threadIdx.x);} + +#define GET_PARTICLE_BY_ID(p,ids) if (blockDim.x*blockIdx.x + threadIdx.x >= ids.size()) {return;}\ + else{p = ids.template get<0>(blockDim.x*blockIdx.x + threadIdx.x);} + /*! \brief this class is a functor for "for_each" algorithm * * This class is a functor for "for_each" algorithm. For each -- GitLab