From 8a5e9361f60418e9c13e5f3f8fad70842fc17c8c Mon Sep 17 00:00:00 2001
From: Pietro Incardona <incardon@mpi-cbg.de>
Date: Tue, 27 Nov 2018 19:22:09 +0100
Subject: [PATCH] Fixing GPU local ghost

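Skip GPU kernel launches when the launch iterator reports an empty grid (no
particles to process): in labelParticlesGhost_impl, around the
process_map_particles launches in vector_dist_comm.hpp, and in the particle
labeling, where the early exit still resizes and zero-fills the starts buffer.
In local_ghost_from_dec_impl, resize o_part_loc before shift_ghost_each_part
fills it, and bounds-check the writes into the output buffer, printing a
diagnostic on overflow. Also remove the Extrae tracing block.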
---
 .../cuda/vector_dist_comm_util_funcs.cuh      |  7 ++++
 src/Vector/cuda/vector_dist_cuda_funcs.cuh    |  8 ++++
 src/Vector/vector_dist_comm.hpp               | 40 ++++++++++++-------
 3 files changed, 40 insertions(+), 15 deletions(-)

diff --git a/src/Vector/cuda/vector_dist_comm_util_funcs.cuh b/src/Vector/cuda/vector_dist_comm_util_funcs.cuh
index 14b81d7b9..841232874 100644
--- a/src/Vector/cuda/vector_dist_comm_util_funcs.cuh
+++ b/src/Vector/cuda/vector_dist_comm_util_funcs.cuh
@@ -79,6 +79,10 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,sca
 
 			auto ite = v_pos.getGPUIterator();
 
+			// no work to do, return early
+			if (ite.wthr.x == 0)
+			{return;}
+
 			// First we have to see how many entry each particle produce
 			num_proc_ghost_each_part<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(proc_id_out.toKernel())>
 			<<<ite.wthr,ite.thr>>>
@@ -263,6 +267,9 @@ struct local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,true>
 		// Label the internal (assigned) particles
 		ite = v_pos.getGPUIteratorTo(g_m);
 
+		// resize o_part_loc so the shift_ghost_each_part launch below can fill it
+		o_part_loc.resize(total);
+
 		shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),
 									 decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),
 									 decltype(starts.toKernel()),decltype(shifts.toKernel()),
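
For context, the guard added in the first hunk avoids launching a kernel with a
zero-sized grid, which CUDA rejects as an invalid configuration when there are
no particles. A minimal standalone sketch of the same pattern in plain CUDA
(illustrative names, not OpenFPM code):

    __global__ void scale(float * x, int n, float s)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
        {x[i] *= s;}
    }

    void scale_all(float * d_x, int n, float s)
    {
        const int thr = 128;
        const int wthr = (n + thr - 1) / thr;   // number of blocks, 0 when n == 0

        // no work to do, return early (same check as ite.wthr.x == 0 above)
        if (wthr == 0)
        {return;}

        scale<<<wthr,thr>>>(d_x,n,s);
    }
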
diff --git a/src/Vector/cuda/vector_dist_cuda_funcs.cuh b/src/Vector/cuda/vector_dist_cuda_funcs.cuh
index 658f6002e..74b42c0a8 100644
--- a/src/Vector/cuda/vector_dist_cuda_funcs.cuh
+++ b/src/Vector/cuda/vector_dist_cuda_funcs.cuh
@@ -249,9 +249,17 @@ __global__ void shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_
     		for (unsigned int j = 0 ; j < dim ; j++)
     		{
     			v_pos.template get<0>(base+n)[j] = xp.get(j) - shifts.template get<0>(shift_actual)[j];
+    		}
+
+    		if (base_o + n < output.size())
+    		{
     			output.template get<0>(base_o+n) = p;
     			output.template get<1>(base_o+n) = shift_actual;
     		}
+    		else
+    		{
+    			printf("shift_ghost_each_part: output overflow\n");
+    		}
 
     		v_prp.set(base+n,v_prp.get(p));
 
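
The change above moves the (particle id, shift index) writes out of the
per-dimension loop and performs them only when the destination index fits in
output, which the o_part_loc resize in the previous file is meant to
guarantee; the printf is a device-side diagnostic for the case where it does
not. A standalone sketch of a bounds-checked device write (plain CUDA,
illustrative names):

    #include <cstdio>

    __global__ void fill_ids(int * out, int out_size, int n)
    {
        int p = blockIdx.x * blockDim.x + threadIdx.x;
        if (p >= n)
        {return;}

        if (p < out_size)
        {out[p] = p;}
        else
        {printf("fill_ids: output overflow at %d (size %d)\n",p,out_size);}
    }
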
diff --git a/src/Vector/vector_dist_comm.hpp b/src/Vector/vector_dist_comm.hpp
index f6078f4e6..3f5a9c464 100644
--- a/src/Vector/vector_dist_comm.hpp
+++ b/src/Vector/vector_dist_comm.hpp
@@ -948,12 +948,16 @@ class vector_dist_comm
 			starts.template deviceToHost<0>();
 			size_t offset = starts.template get<0>(rank);
 
-			// fill v_pos_tmp and v_prp_tmp with local particles
-			process_map_particles<decltype(m_opart.toKernel()),decltype(v_pos_tmp.toKernel()),decltype(v_prp_tmp.toKernel()),
+			// launch only if there is work to do
+			if (ite.wthr.x != 0)
+			{
+				// fill v_pos_tmp and v_prp_tmp with local particles
+				process_map_particles<decltype(m_opart.toKernel()),decltype(v_pos_tmp.toKernel()),decltype(v_prp_tmp.toKernel()),
 					                                           decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>
-			<<<ite.wthr,ite.thr>>>
-			(m_opart.toKernel(),v_pos_tmp.toKernel(), v_prp_tmp.toKernel(),
+				<<<ite.wthr,ite.thr>>>
+				(m_opart.toKernel(),v_pos_tmp.toKernel(), v_prp_tmp.toKernel(),
 					            v_pos.toKernel(),v_prp.toKernel(),offset);
+			}
 
 			// Fill the sending buffers
 			for (size_t i = 0 ; i < m_pos.size() ; i++)
@@ -962,11 +966,17 @@ class vector_dist_comm
 
 				auto ite = m_pos.get(i).getGPUIterator();
 
-				process_map_particles<decltype(m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()),
+				// launch only if there is work to do
+				if (ite.wthr.x != 0)
+				{
+
+					process_map_particles<decltype(m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()),
 						                                           decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>
-				<<<ite.wthr,ite.thr>>>
-				(m_opart.toKernel(),m_pos.get(i).toKernel(), m_prp.get(i).toKernel(),
+					<<<ite.wthr,ite.thr>>>
+					(m_opart.toKernel(),m_pos.get(i).toKernel(), m_prp.get(i).toKernel(),
 						            v_pos.toKernel(),v_prp.toKernel(),offset);
+
+				}
 			}
 
 			// old local particles with the actual local particles
@@ -1075,11 +1085,17 @@ class vector_dist_comm
 
 			prc_sz.template fill<0>(0);
 
+			auto ite = v_pos.getGPUIterator();
+			if (ite.wthr.x == 0)
+			{
+				starts.resize(v_cl.size());
+				starts.template fill<0>(0);
+				return;
+			}
+
 			// we have one process we can skip ...
 			if (v_cl.size() == 1)
 			{
-				auto ite = v_pos.getGPUIterator();
-
 				// ... but we have to apply the boundary conditions
 
 				periodicity_int<dim> bc;
@@ -1091,8 +1107,6 @@ class vector_dist_comm
 				return;
 			}
 
-			auto ite = v_pos.getGPUIterator();
-
 			// label particle processor
 			process_id_proc_each_part<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(lbl_p.toKernel()),decltype(prc_sz.toKernel())>
 			<<<ite.wthr,ite.thr>>>
@@ -1220,10 +1234,6 @@ class vector_dist_comm
 			                 size_t & g_m,
 			                 size_t opt)
 	{
-#ifdef EXTREA_TRACE_PRE_COMM
-		Extrae_user_function (1);
-#endif
-
 		// Buffer that contain for each processor the id of the particle to send
 		prc_sz.clear();
 		g_opart.clear();
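
The labeling hunk above hoists the iterator query and returns early when there
are no particles, while still resizing and zero-filling starts, presumably so
that later stages reading the per-processor offsets see a consistent empty
result. A standalone sketch of that pattern (plain C++, illustrative names, not
the OpenFPM API):

    #include <cstddef>
    #include <vector>

    void label_particles(std::size_t n_particles,
                         std::size_t n_processors,
                         std::vector<std::size_t> & starts)   // per-processor offsets
    {
        if (n_particles == 0)
        {
            // no kernel work, but leave the offsets in a valid all-zero state
            starts.assign(n_processors,0);
            return;
        }

        // ... label each particle and scan the per-processor counts here ...
    }
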
-- 
GitLab