From 8a5e9361f60418e9c13e5f3f8fad70842fc17c8c Mon Sep 17 00:00:00 2001 From: Pietro Incardona <incardon@mpi-cbg.de> Date: Tue, 27 Nov 2018 19:22:09 +0100 Subject: [PATCH] Fixing GPU local ghost --- .../cuda/vector_dist_comm_util_funcs.cuh | 7 ++++ src/Vector/cuda/vector_dist_cuda_funcs.cuh | 8 ++++ src/Vector/vector_dist_comm.hpp | 40 ++++++++++++------- 3 files changed, 40 insertions(+), 15 deletions(-) diff --git a/src/Vector/cuda/vector_dist_comm_util_funcs.cuh b/src/Vector/cuda/vector_dist_comm_util_funcs.cuh index 14b81d7b9..841232874 100644 --- a/src/Vector/cuda/vector_dist_comm_util_funcs.cuh +++ b/src/Vector/cuda/vector_dist_comm_util_funcs.cuh @@ -79,6 +79,10 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,sca auto ite = v_pos.getGPUIterator(); + // no work to do return + if (ite.wthr.x == 0) + {return;} + // First we have to see how many entry each particle produce num_proc_ghost_each_part<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(proc_id_out.toKernel())> <<<ite.wthr,ite.thr>>> @@ -263,6 +267,9 @@ struct local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,true> // Label the internal (assigned) particles ite = v_pos.getGPUIteratorTo(g_m); + // resize o_part_loc + o_part_loc.resize(total); + shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()), decltype(v_pos.toKernel()),decltype(v_prp.toKernel()), decltype(starts.toKernel()),decltype(shifts.toKernel()), diff --git a/src/Vector/cuda/vector_dist_cuda_funcs.cuh b/src/Vector/cuda/vector_dist_cuda_funcs.cuh index 658f6002e..74b42c0a8 100644 --- a/src/Vector/cuda/vector_dist_cuda_funcs.cuh +++ b/src/Vector/cuda/vector_dist_cuda_funcs.cuh @@ -249,9 +249,17 @@ __global__ void shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_ for (unsigned int j = 0 ; j < dim ; j++) { v_pos.template get<0>(base+n)[j] = xp.get(j) - shifts.template get<0>(shift_actual)[j]; + } + + if (base_o + n < output.size()) + { output.template get<0>(base_o+n) = p; output.template get<1>(base_o+n) = shift_actual; } + else + { + printf("OVERFLOW \n"); + } v_prp.set(base+n,v_prp.get(p)); diff --git a/src/Vector/vector_dist_comm.hpp b/src/Vector/vector_dist_comm.hpp index f6078f4e6..3f5a9c464 100644 --- a/src/Vector/vector_dist_comm.hpp +++ b/src/Vector/vector_dist_comm.hpp @@ -948,12 +948,16 @@ class vector_dist_comm starts.template deviceToHost<0>(); size_t offset = starts.template get<0>(rank); - // fill v_pos_tmp and v_prp_tmp with local particles - process_map_particles<decltype(m_opart.toKernel()),decltype(v_pos_tmp.toKernel()),decltype(v_prp_tmp.toKernel()), + // no work to do + if (ite.wthr.x != 0) + { + // fill v_pos_tmp and v_prp_tmp with local particles + process_map_particles<decltype(m_opart.toKernel()),decltype(v_pos_tmp.toKernel()),decltype(v_prp_tmp.toKernel()), decltype(v_pos.toKernel()),decltype(v_prp.toKernel())> - <<<ite.wthr,ite.thr>>> - (m_opart.toKernel(),v_pos_tmp.toKernel(), v_prp_tmp.toKernel(), + <<<ite.wthr,ite.thr>>> + (m_opart.toKernel(),v_pos_tmp.toKernel(), v_prp_tmp.toKernel(), v_pos.toKernel(),v_prp.toKernel(),offset); + } // Fill the sending buffers for (size_t i = 0 ; i < m_pos.size() ; i++) @@ -962,11 +966,17 @@ class vector_dist_comm auto ite = m_pos.get(i).getGPUIterator(); - process_map_particles<decltype(m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()), + // no work to do + if (ite.wthr.x != 0) + { + + process_map_particles<decltype(m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()), decltype(v_pos.toKernel()),decltype(v_prp.toKernel())> - <<<ite.wthr,ite.thr>>> - (m_opart.toKernel(),m_pos.get(i).toKernel(), m_prp.get(i).toKernel(), + <<<ite.wthr,ite.thr>>> + (m_opart.toKernel(),m_pos.get(i).toKernel(), m_prp.get(i).toKernel(), v_pos.toKernel(),v_prp.toKernel(),offset); + + } } // old local particles with the actual local particles @@ -1075,11 +1085,17 @@ class vector_dist_comm prc_sz.template fill<0>(0); + auto ite = v_pos.getGPUIterator(); + if (ite.wthr.x == 0) + { + starts.resize(v_cl.size()); + starts.template fill<0>(0); + return; + } + // we have one process we can skip ... if (v_cl.size() == 1) { - auto ite = v_pos.getGPUIterator(); - // ... but we have to apply the boundary conditions periodicity_int<dim> bc; @@ -1091,8 +1107,6 @@ class vector_dist_comm return; } - auto ite = v_pos.getGPUIterator(); - // label particle processor process_id_proc_each_part<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(lbl_p.toKernel()),decltype(prc_sz.toKernel())> <<<ite.wthr,ite.thr>>> @@ -1220,10 +1234,6 @@ class vector_dist_comm size_t & g_m, size_t opt) { -#ifdef EXTREA_TRACE_PRE_COMM - Extrae_user_function (1); -#endif - // Buffer that contain for each processor the id of the particle to send prc_sz.clear(); g_opart.clear(); -- GitLab