From 81c07b29155a72a27a915f193d9014e8bfa3ef54 Mon Sep 17 00:00:00 2001
From: Incardona Pietro <incardon@mpi-cbg.de>
Date: Thu, 25 Nov 2021 17:34:56 +0100
Subject: [PATCH] Removing garbage + fixing test for CUDA on OpenMP

---
 .../Vector/3_molecular_dynamic_gpu/main.cu    |  8 +--
 .../3_molecular_dynamic_gpu_opt/main_gpu.cu   |  8 +--
 example/Vector/7_SPH_dlb_gpu_more_opt/main.cu |  4 +-
 src/Vector/vector_dist_comm.hpp               | 67 -------------------
 4 files changed, 10 insertions(+), 77 deletions(-)

diff --git a/example/Vector/3_molecular_dynamic_gpu/main.cu b/example/Vector/3_molecular_dynamic_gpu/main.cu
index f0b28e427..10d9a6331 100644
--- a/example/Vector/3_molecular_dynamic_gpu/main.cu
+++ b/example/Vector/3_molecular_dynamic_gpu/main.cu
@@ -235,7 +235,7 @@ template<typename CellList> void calc_forces(vector_dist_gpu<3,real_number, aggr
 	// Get an iterator over particles
 	auto it2 = vd.getDomainIteratorGPU();
 
-	calc_force_gpu<<<it2.wthr,it2.thr>>>(vd.toKernel(),NN.toKernel(),sigma12,sigma6,r_cut2);
+	CUDA_LAUNCH(calc_force_gpu,it2,vd.toKernel(),NN.toKernel(),sigma12,sigma6,r_cut2);
 }
 
 //! \cond [calc_forces] \endcond
@@ -249,7 +249,7 @@ template<typename CellList> real_number calc_energy(vector_dist_gpu<3,real_numbe
 
 	auto it2 = vd.getDomainIteratorGPU();
 
-	particle_energy<<<it2.wthr,it2.thr>>>(vd.toKernel(),NN.toKernel(),sigma12,sigma6,shift,r_cut2);
+	CUDA_LAUNCH(particle_energy,it2,vd.toKernel(),NN.toKernel(),sigma12,sigma6,shift,r_cut2);
 
 	//! \cond [calc_energy_red] \endcond
@@ -343,7 +343,7 @@ int main(int argc, char* argv[])
 		// Get the iterator
 		auto it3 = vd.getDomainIteratorGPU();
 
-		update_velocity_position<<<it3.wthr,it3.thr>>>(vd.toKernel(),dt);
+		CUDA_LAUNCH(update_velocity_position,it3,vd.toKernel(),dt);
 
 		//! \cond [run_on_device] \endcond
@@ -360,7 +360,7 @@ int main(int argc, char* argv[])
 		// Integrate the velocity Step 3
 		auto it4 = vd.getDomainIteratorGPU();
 
-		update_velocity<<<it4.wthr,it4.thr>>>(vd.toKernel(),dt);
+		CUDA_LAUNCH(update_velocity,it4,vd.toKernel(),dt);
 
 		// After every iteration collect some statistic about the configuration
 		if (i % 100 == 0)
diff --git a/example/Vector/3_molecular_dynamic_gpu_opt/main_gpu.cu b/example/Vector/3_molecular_dynamic_gpu_opt/main_gpu.cu
index 2d22c56cc..8d20134e5 100644
--- a/example/Vector/3_molecular_dynamic_gpu_opt/main_gpu.cu
+++ b/example/Vector/3_molecular_dynamic_gpu_opt/main_gpu.cu
@@ -262,7 +262,7 @@ template<typename CellList> void calc_forces(vector_dist_gpu<3,real_number, aggr
 
 	//! \cond [calc_force_sorted] \endcond
 
-	calc_force_gpu<<<it2.wthr,it2.thr>>>(vd.toKernel_sorted(),NN.toKernel(),sigma12,sigma6,r_cut2);
+	CUDA_LAUNCH(calc_force_gpu,it2,vd.toKernel_sorted(),NN.toKernel(),sigma12,sigma6,r_cut2);
 
 	//! \cond [calc_force_sorted] \endcond
@@ -282,7 +282,7 @@ template<typename CellList> real_number calc_energy(vector_dist_gpu<3,real_numbe
 
 	auto it2 = vd.getDomainIteratorGPU();
 
-	particle_energy<<<it2.wthr,it2.thr>>>(vd.toKernel_sorted(),NN.toKernel(),sigma12,sigma6,shift,r_cut2);
+	CUDA_LAUNCH(particle_energy,it2,vd.toKernel_sorted(),NN.toKernel(),sigma12,sigma6,shift,r_cut2);
 
 	vd.merge_sort<energy>(NN);
@@ -378,7 +378,7 @@ int main(int argc, char* argv[])
 		// Get the iterator
 		auto it3 = vd.getDomainIteratorGPU();
 
-		update_velocity_position<<<it3.wthr,it3.thr>>>(vd.toKernel(),dt);
+		CUDA_LAUNCH(update_velocity_position,it3,vd.toKernel(),dt);
 
 		// Because we moved the particles in space we have to map them and re-sync the ghost
 		vd.map(RUN_ON_DEVICE);
@@ -390,7 +390,7 @@ int main(int argc, char* argv[])
 		// Integrate the velocity Step 3
 		auto it4 = vd.getDomainIteratorGPU();
 
-		update_velocity<<<it4.wthr,it4.thr>>>(vd.toKernel(),dt);
+		CUDA_LAUNCH(update_velocity,it4,vd.toKernel(),dt);
 
 		// After every iteration collect some statistic about the configuration
 		if (i % 1000 == 0)
diff --git a/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu b/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu
index 453530190..7e7eab9d9 100644
--- a/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu
+++ b/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu
@@ -531,7 +531,7 @@ void max_acceleration_and_velocity(particles & vd, real_number & max_acc, real_n
 	// Calculate the maximum acceleration
 	auto part = vd.getDomainIteratorGPU();
 
-	max_acceleration_and_velocity_gpu<<<part.wthr,part.thr>>>(vd.toKernel());
+	CUDA_LAUNCH(max_acceleration_and_velocity_gpu,part,vd.toKernel());
 
 	max_acc = reduce_local<red,_max_>(vd);
 	max_vel = reduce_local<red2,_max_>(vd);
@@ -714,7 +714,7 @@ void euler_int(particles & vd, real_number dt)
 
 	real_number dt205 = dt*dt*0.5;
 
-	euler_int_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),dt,dt205);
+	CUDA_LAUNCH(euler_int_gpu,part,vd.toKernel(),dt,dt205);
 
 	// remove the particles
 	remove_marked<red>(vd);
diff --git a/src/Vector/vector_dist_comm.hpp b/src/Vector/vector_dist_comm.hpp
index f31f4538a..8dc777dba 100644
--- a/src/Vector/vector_dist_comm.hpp
+++ b/src/Vector/vector_dist_comm.hpp
@@ -1269,42 +1269,6 @@ class vector_dist_comm
 
 	// The first part of m_opart and prc_sz contain the local particles
 
-	#ifndef TEST1
-
-	v_pos_tmp.resize(prc_sz.template get<0>(0));
-	v_prp_tmp.resize(prc_sz.template get<0>(0));
-
-	auto ite = v_pos_tmp.getGPUIterator();
-
-	// fill v_pos_tmp and v_prp_tmp with local particles
-	process_map_particles<decltype(m_opart.toKernel()),decltype(v_pos_tmp.toKernel()),decltype(v_prp_tmp.toKernel()),
-						  decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>
-	<<<ite.wthr,ite.thr>>>
-	(m_opart.toKernel(),v_pos_tmp.toKernel(), v_prp_tmp.toKernel(),
-	 v_pos.toKernel(),v_prp.toKernel(),0);
-
-	size_t offset = prc_sz.template get<0>(0);
-
-	// Fill the sending buffers
-	for (size_t i = 0 ; i < m_pos.size() ; i++)
-	{
-		auto ite = m_pos.get(i).getGPUIterator();
-
-		process_map_particles<decltype(m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()),
-							  decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>
-		<<<ite.wthr,ite.thr>>>
-		(m_opart.toKernel(),m_pos.get(i).toKernel(), m_prp.get(i).toKernel(),
-		 v_pos.toKernel(),v_prp.toKernel(),offset);
-
-		offset += prc_sz_r.size();
-	}
-
-	// old local particles with the actual local particles
-	v_pos_tmp.swap(v_pos);
-	v_prp_tmp.swap(v_prp);
-
-	#else
-
 	int rank = v_cl.rank();
 
 	v_pos_tmp.resize(prc_sz.template get<0>(rank));
@@ -1350,7 +1314,6 @@ class vector_dist_comm
 		v_pos_tmp.swap(v_pos);
 		v_prp_tmp.swap(v_prp);
 
-	#endif
 #else
 
 		std::cout << __FILE__ << ":" << __LINE__ << " error RUN_ON_DEVICE require that you compile with NVCC, but it seem compiled with a normal compiler" << std::endl;
@@ -1478,35 +1441,6 @@ class vector_dist_comm
 			ite,
 			dec.toKernel(),v_pos.toKernel(),lbl_p.toKernel(),prc_sz.toKernel(),v_cl.rank());
 
-
-	#ifndef TEST1
-
-	// sort particles
-	mergesort((int *)lbl_p.template getDeviceBuffer<1>(),(int *)lbl_p.template getDeviceBuffer<0>(), lbl_p.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());
-
-	mem.allocate(sizeof(int));
-	mem.fill(0);
-
-	// Find the buffer bases
-	find_buffer_offsets<1,decltype(lbl_p.toKernel()),decltype(prc_sz.toKernel())><<<ite.wthr,ite.thr>>>
-	(lbl_p.toKernel(),(int *)mem.getDevicePointer(),prc_sz.toKernel());
-
-#error "should not be here"
-
-	// Trasfer the number of offsets on CPU
-	mem.deviceToHost();
-	prc_sz.template deviceToHost<0,1>();
-	// get also the last element from lbl_p;
-	lbl_p.template deviceToHost<1>(lbl_p.size()-1,lbl_p.size()-1);
-
-	mem.deviceToHost();
-	int noff = *(int *)mem.getPointer();
-	prc_sz.resize(noff+1);
-	prc_sz.template get<0>(prc_sz.size()-1) = lbl_p.size();
-	prc_sz.template get<1>(prc_sz.size()-1) = lbl_p.template get<1>(lbl_p.size()-1);
-
-	#else
-
 	starts.resize(v_cl.size());
 	openfpm::scan((unsigned int *)prc_sz.template getDeviceBuffer<0>(), prc_sz.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());
@@ -1518,7 +1452,6 @@ class vector_dist_comm
 
 		// we order lbl_p
 		CUDA_LAUNCH((reorder_lbl<decltype(lbl_p.toKernel()),decltype(starts.toKernel())>),ite,lbl_p.toKernel(),starts.toKernel());
 
-	#endif
 #else
--
GitLab
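
The recurring change in this patch replaces raw kernel launches of the form
kernel<<<ite.wthr,ite.thr>>>(args...) with the OpenFPM CUDA_LAUNCH(kernel,ite,args...)
macro, so that the same call site can also be compiled for the CUDA-on-OpenMP
backend mentioned in the subject line. Below is a minimal, hypothetical sketch
of such a launch wrapper; the names MY_CUDA_LAUNCH, launch_ite and scale_gpu
are invented for illustration and are not OpenFPM's actual implementation.

// Hypothetical sketch of a CUDA_LAUNCH-style wrapper (not OpenFPM code).
// It assumes the iterator object carries the grid size in .wthr and the block
// size in .thr, as the getDomainIteratorGPU() objects used in this patch do.
#include <cstdio>
#include <cuda_runtime.h>

struct launch_ite
{
	dim3 wthr;   // number of blocks
	dim3 thr;    // threads per block
};

// On a CUDA build this expands to a plain kernel launch; a CPU/OpenMP backend
// could redefine the same macro to run the kernel body in a host loop instead,
// which is the point of funnelling every launch through one macro.
#define MY_CUDA_LAUNCH(kernel, ite, ...) \
	kernel<<<(ite).wthr, (ite).thr>>>(__VA_ARGS__)

__global__ void scale_gpu(float * v, int n, float a)
{
	int p = blockIdx.x * blockDim.x + threadIdx.x;
	if (p < n)  {v[p] *= a;}
}

int main()
{
	const int n = 1024;
	float * v = nullptr;
	cudaMallocManaged(&v, n * sizeof(float));
	for (int i = 0 ; i < n ; i++)  {v[i] = 1.0f;}

	launch_ite ite;
	ite.thr  = dim3(128, 1, 1);
	ite.wthr = dim3((n + 127) / 128, 1, 1);

	// Same call shape as in the patch: MACRO(kernel, iterator, kernel arguments...)
	MY_CUDA_LAUNCH(scale_gpu, ite, v, n, 2.0f);
	cudaDeviceSynchronize();

	printf("v[0] = %f (expected 2.0)\n", v[0]);
	cudaFree(v);
	return 0;
}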