From 1974b8bcb89eeb0d4b006eded77dfa712f312ec9 Mon Sep 17 00:00:00 2001
From: Pietro Incardona <incardon@mpi-cbg.de>
Date: Mon, 12 Nov 2018 19:00:43 +0100
Subject: [PATCH] Fixing GPU tests for pdata

---
 CMakeLists.txt                                |   1 +
 configure                                     |   5 +
 src/CMakeLists.txt                            |   4 +
 .../cuda/CartDecomposition_gpu.cuh            |   2 +-
 .../cuda/decomposition_cuda_tests.cu          |  10 +-
 src/Decomposition/ie_ghost.hpp                |  13 +-
 .../cuda/vector_dist_comm_util_funcs.cuh      |  30 +++--
 src/Vector/cuda/vector_dist_cuda_func_test.cu | 121 +++++++++---------
 src/Vector/cuda/vector_dist_cuda_funcs.cuh    |  33 +++--
 src/Vector/cuda/vector_dist_gpu_unit_tests.cu |  36 +++---
 src/Vector/vector_dist.hpp                    |   1 +
 src/Vector/vector_dist_comm.hpp               |  48 +++++--
 src/Vector/vector_dist_kernel.hpp             |   2 +-
 13 files changed, 193 insertions(+), 113 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b6d6967f..0c1b0193 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,7 @@
 cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
 project(openfpm_pdata LANGUAGES C CXX)
 
+enable_testing()
 
 list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake_modules/)
 
diff --git a/configure b/configure
index d04dd1f6..090620f0 100755
--- a/configure
+++ b/configure
@@ -215,6 +215,7 @@ do
     case $ac_useropt in
     debug)
        conf_options="$conf_options -DCMAKE_BUILD_TYPE=Debug"
+       debug_mode=1
        ;;
     se_class1)
        conf_options="$conf_options -DSE_CLASS1=ON"
@@ -524,6 +525,10 @@ Try \`$0 --help' for more information"
   esac
 done
 
+if [ x"$debug_mode" != x"1" ]; then
+	conf_options+="$conf_options -DCMAKE_BUILD_TYPE=Release"
+fi
+
 cd build
 
 ## remove enerything
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 41f96ed9..31317c79 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,5 +1,6 @@
 cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
 
+
 ########################### Executables
 
 if(CUDA_FOUND)
@@ -16,6 +17,9 @@ endif()
 
 add_library(ofpm_pdata STATIC lib/pdata.cpp)
 
+add_test(NAME pdata_3_proc COMMAND mpirun -np 3 ./pdata)
+add_test(NAME pdata_4_proc COMMAND mpirun -np 4 ./pdata)
+
 ###########################
 
 if (CUDA_FOUND)
diff --git a/src/Decomposition/cuda/CartDecomposition_gpu.cuh b/src/Decomposition/cuda/CartDecomposition_gpu.cuh
index d986d1d7..de38fe76 100644
--- a/src/Decomposition/cuda/CartDecomposition_gpu.cuh
+++ b/src/Decomposition/cuda/CartDecomposition_gpu.cuh
@@ -105,7 +105,7 @@ public:
 	}
 
 	CartDecomposition_gpu(const CartDecomposition_gpu<dim,T,Memory,layout_base> & dec)
-	:ie_ghost_gpu<dim,T,Memory,layout_base>(dec),clk(dec.clk),domain(dec.domain)
+	:ie_ghost_gpu<dim,T,Memory,layout_base>(dec),clk(dec.clk),domain(dec.domain),sub_domains_global(dec.sub_domains_global)
 	{
 		for (int s = 0 ; s < dim ; s++)
 		{this->bc[s] = dec.bc[s];}
diff --git a/src/Decomposition/cuda/decomposition_cuda_tests.cu b/src/Decomposition/cuda/decomposition_cuda_tests.cu
index 2d40d636..81ae0b4a 100644
--- a/src/Decomposition/cuda/decomposition_cuda_tests.cu
+++ b/src/Decomposition/cuda/decomposition_cuda_tests.cu
@@ -41,7 +41,7 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
 	// Vcluster
 	Vcluster<> & vcl = create_vcluster();
 
-	CartDecomposition<3, double> dec(vcl);
+	CartDecomposition<3, double, CudaMemory,memory_traits_inte> dec(vcl);
 
 	size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};
 
@@ -87,6 +87,8 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
 			mem2.allocate(2*sizeof(unsigned int));
 			test_ghost_n<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer());
 
+			mem2.deviceToHost();
+
 			unsigned int tot = ((unsigned int *)mem2.getPointer())[0] + ((unsigned int *)mem2.getPointer())[1];
 
 			openfpm::vector_gpu<aggregate<int,int>> vd;
@@ -119,12 +121,16 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
 
 			test_proc_idbc<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem.getDevicePointer());
 
+			mem.deviceToHost();
+
 			BOOST_REQUIRE(((unsigned int *)mem.getPointer())[0] < vcl.size());
 			BOOST_REQUIRE(((unsigned int *)mem.getPointer())[1] < vcl.size());
 
 			mem2.allocate(2*sizeof(unsigned int));
 			test_ghost_n<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer());
 
+			mem2.deviceToHost();
+
 			tot = ((unsigned int *)mem2.getPointer())[0] + ((unsigned int *)mem2.getPointer())[1];
 
 			vd.resize(tot);
@@ -132,7 +138,7 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
 
 			if (((unsigned int *)mem.getPointer())[0] != ((unsigned int *)mem.getPointer())[1])
 			{
-				if (vcl.rank() == ((unsigned int *)mem.getPointer())[2])
+				if (vcl.rank() == ((unsigned int *)mem.getPointer())[1])
 				{
 					BOOST_REQUIRE(((unsigned int *)mem2.getPointer())[1] != 0);
 					BOOST_REQUIRE(((unsigned int *)mem2.getPointer())[0] == 0);
diff --git a/src/Decomposition/ie_ghost.hpp b/src/Decomposition/ie_ghost.hpp
index a6df7964..f78b40c8 100755
--- a/src/Decomposition/ie_ghost.hpp
+++ b/src/Decomposition/ie_ghost.hpp
@@ -448,12 +448,17 @@ protected:
 		reorder_geo_cell();
 	}
 
-	/*! \brief in this function we reorder the cells by processors
+	/*! \brief in this function we reorder the list in each cells by processor id
 	 *
-	 * In practice every processor in the list is ordered. the geo_cell give
+	 * suppose in one cell we have 7 boxes each box contain the processor id
 	 *
-	 * 7 boxes the first 2 boxes are related to processor 0 and the next 2 to processor 4, the other 3 must me related
-	 * to another processor different from 0 and 4. This simplify the procedure to get a unique list of processor ids
+	 * 1,5,9,5,1,1,6
+	 *
+	 * after reorder we have the following sequence
+	 *
+	 * 1,1,1,5,5,6,9
+	 *
+	 * This simplify the procedure to get a unique list of processor ids
 	 * indicating on which processor a particle must be replicated as ghost
 	 *
 	 */
diff --git a/src/Vector/cuda/vector_dist_comm_util_funcs.cuh b/src/Vector/cuda/vector_dist_comm_util_funcs.cuh
index a5988379..14b81d7b 100644
--- a/src/Vector/cuda/vector_dist_comm_util_funcs.cuh
+++ b/src/Vector/cuda/vector_dist_comm_util_funcs.cuh
@@ -108,29 +108,39 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,sca
 			mem.fill(0);
 			prc_offset.resize(v_cl.size());
 
+			ite = g_opart_device.getGPUIterator();
+
 			// Find the buffer bases
 			find_buffer_offsets<0,decltype(g_opart_device.toKernel()),decltype(prc_offset.toKernel())><<<ite.wthr,ite.thr>>>
 					           (g_opart_device.toKernel(),(int *)mem.getDevicePointer(),prc_offset.toKernel());
 
 			// Trasfer the number of offsets on CPU
 			mem.deviceToHost();
-			prc_offset.template deviceToHost<0,1>();
-			if (g_opart_device.size() != 0)
-			{g_opart_device.template deviceToHost<0>(g_opart_device.size()-1,g_opart_device.size()-1);}
-
 			int noff = *(int *)mem.getPointer();
 
-			// In this case we do not have communications at all
-			if (g_opart_device.size() == 0)
-			{noff = -1;}
+			// create the terminal of prc_offset
+			prc_offset.resize(noff+1,DATA_ON_DEVICE);
 
-			prc_offset.resize(noff+1);
+			// Move the last processor index on device (id)
+			if (g_opart_device.size() != 0)
+			{g_opart_device.template deviceToHost<0>(g_opart_device.size()-1,g_opart_device.size()-1);}
 			prc_offset.template get<0>(prc_offset.size()-1) = g_opart_device.size();
 			if (g_opart_device.size() != 0)
 			{prc_offset.template get<1>(prc_offset.size()-1) = g_opart_device.template get<0>(g_opart_device.size()-1);}
 			else
 			{prc_offset.template get<1>(prc_offset.size()-1) = 0;}
 
+			prc_offset.template hostToDevice<0,1>(prc_offset.size()-1,prc_offset.size()-1);
+
+			// Here we reorder the offsets in ascending order
+			mergesort((int *)prc_offset.template getDeviceBuffer<0>(),(int *)prc_offset.template getDeviceBuffer<1>(), prc_offset.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());
+
+			prc_offset.template deviceToHost<0,1>();
+
+			// In this case we do not have communications at all
+			if (g_opart_device.size() == 0)
+			{noff = -1;}
+
 			prc.resize(noff+1);
 			prc_sz.resize(noff+1);
 
@@ -236,9 +246,9 @@ struct local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,true>
 		auto ite = v_pos.getGPUIteratorTo(g_m);
 
 		// label particle processor
-		num_shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())>
+		num_shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())>
 		<<<ite.wthr,ite.thr>>>
-		(box_f_dev.toKernel(),v_pos.toKernel(),o_part_loc.toKernel());
+		(box_f_dev.toKernel(),box_f_sv.toKernel(),v_pos.toKernel(),o_part_loc.toKernel(),g_m);
 
 		starts.resize(o_part_loc.size());
 		mgpu::scan((unsigned int *)o_part_loc.template getDeviceBuffer<0>(), o_part_loc.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());
diff --git a/src/Vector/cuda/vector_dist_cuda_func_test.cu b/src/Vector/cuda/vector_dist_cuda_func_test.cu
index 243d6fd0..b866e63e 100644
--- a/src/Vector/cuda/vector_dist_cuda_func_test.cu
+++ b/src/Vector/cuda/vector_dist_cuda_func_test.cu
@@ -98,9 +98,9 @@ BOOST_AUTO_TEST_CASE( vector_ghost_process_local_particles )
 	v_prp.hostToDevice<0,1,2>();
 
 	// label particle processor
-	num_shift_ghost_each_part<3,float,decltype(box_f_dev.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())>
+	num_shift_ghost_each_part<3,float,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())>
 	<<<ite.wthr,ite.thr>>>
-	(box_f_dev.toKernel(),v_pos.toKernel(),o_part_loc.toKernel());
+	(box_f_dev.toKernel(),box_f_sv.toKernel(),v_pos.toKernel(),o_part_loc.toKernel(),v_pos.size());
 
 	o_part_loc.deviceToHost<0>();
 
@@ -734,11 +734,16 @@ BOOST_AUTO_TEST_CASE( decomposition_to_gpu_test_use )
 	proc_id_out.resize(vg.size());
 
 	openfpm::vector_gpu<aggregate<int,int,int>> dev_counter;
+	dev_counter.resize(10);
+	dev_counter.fill<0>(0);
+	dev_counter.fill<1>(0);
+	dev_counter.fill<2>(0);
 
 	process_id_proc_each_part<3,float,decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(proc_id_out.toKernel()),decltype(dev_counter.toKernel())>
 	<<<ite.wthr,ite.thr>>>
 	(dec.toKernel(),vg.toKernel(),proc_id_out.toKernel(),dev_counter.toKernel(),v_cl.rank());
 
+
 	proc_id_out.deviceToHost<0>();
 
 	bool match = true;
@@ -852,33 +857,33 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort)
 
 	for (int i = 0 ; i < 10000 ; i++) // <------ particle id
 	{
-		v_pos.template get<0>(i)[0] = i;
-		v_pos.template get<0>(i)[1] = i+10000;
-		v_pos.template get<0>(i)[2] = i+20000;
+		v_pos_out.template get<0>(i)[0] = i;
+		v_pos_out.template get<0>(i)[1] = i+10000;
+		v_pos_out.template get<0>(i)[2] = i+20000;
 
-		v_prp.template get<0>(i)[0] = i+60123;
-		v_prp.template get<0>(i)[1] = i+73543;
-		v_prp.template get<0>(i)[2] = i+82432;
+		v_prp_out.template get<0>(i)[0] = i+60123;
+		v_prp_out.template get<0>(i)[1] = i+73543;
+		v_prp_out.template get<0>(i)[2] = i+82432;
 
-		v_prp.template get<1>(i)[0] = i+80123;
-		v_prp.template get<1>(i)[1] = i+93543;
-		v_prp.template get<1>(i)[2] = i+102432;
+		v_prp_out.template get<1>(i)[0] = i+80123;
+		v_prp_out.template get<1>(i)[1] = i+93543;
+		v_prp_out.template get<1>(i)[2] = i+102432;
 
-		v_prp.template get<2>(i)[0] = i+110123;
-		v_prp.template get<2>(i)[1] = i+123543;
-		v_prp.template get<2>(i)[2] = i+132432;
+		v_prp_out.template get<2>(i)[0] = i+110123;
+		v_prp_out.template get<2>(i)[1] = i+123543;
+		v_prp_out.template get<2>(i)[2] = i+132432;
 
-		v_prp_out.template get<0>(i)[0] = 0;
-		v_prp_out.template get<0>(i)[1] = 0;
-		v_prp_out.template get<0>(i)[2] = 0;
+		v_prp.template get<0>(i)[0] = 0;
+		v_prp.template get<0>(i)[1] = 0;
+		v_prp.template get<0>(i)[2] = 0;
 
-		v_prp_out.template get<1>(i)[0] = 0;
-		v_prp_out.template get<1>(i)[1] = 0;
-		v_prp_out.template get<1>(i)[2] = 0;
+		v_prp.template get<1>(i)[0] = 0;
+		v_prp.template get<1>(i)[1] = 0;
+		v_prp.template get<1>(i)[2] = 0;
 
-		v_prp_out.template get<2>(i)[0] = 0;
-		v_prp_out.template get<2>(i)[1] = 0;
-		v_prp_out.template get<2>(i)[2] = 0;
+		v_prp.template get<2>(i)[0] = 0;
+		v_prp.template get<2>(i)[1] = 0;
+		v_prp.template get<2>(i)[2] = 0;
 
 		ns_to_s.template get<0>(i) = 10000-i-1;
 	}
@@ -895,7 +900,7 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort)
 																								 v_pos_out.toKernel(),v_prp_out.toKernel(),
 																								 ns_to_s.toKernel());
 
-	v_prp_out.template deviceToHost<0,1,2>();
+	v_prp.template deviceToHost<0,1,2>();
 
 	bool match = true;
 	for (int i = 0 ; i < 10000 ; i++) // <------ particle id
@@ -904,13 +909,13 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort)
 		match &= v_prp_out.template get<0>(10000-i-1)[1] == v_prp.template get<0>(i)[1];
 		match &= v_prp_out.template get<0>(10000-i-1)[2] == v_prp.template get<0>(i)[2];
 
-		match &= v_prp_out.template get<1>(10000-i-1)[0] == 0;
-		match &= v_prp_out.template get<1>(10000-i-1)[1] == 0;
-		match &= v_prp_out.template get<1>(10000-i-1)[2] == 0;
+		match &= v_prp.template get<1>(10000-i-1)[0] == 0;
+		match &= v_prp.template get<1>(10000-i-1)[1] == 0;
+		match &= v_prp.template get<1>(10000-i-1)[2] == 0;
 
-		match &= v_prp_out.template get<2>(10000-i-1)[0] == 0;
-		match &= v_prp_out.template get<2>(10000-i-1)[1] == 0;
-		match &= v_prp_out.template get<2>(10000-i-1)[2] == 0;
+		match &= v_prp.template get<2>(10000-i-1)[0] == 0;
+		match &= v_prp.template get<2>(10000-i-1)[1] == 0;
+		match &= v_prp.template get<2>(10000-i-1)[2] == 0;
 	}
 
 	BOOST_REQUIRE_EQUAL(match,true);
@@ -919,8 +924,8 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort)
 																								 v_pos_out.toKernel(),v_prp_out.toKernel(),
 																								 ns_to_s.toKernel());
 
-	v_prp_out.template deviceToHost<0,1,2>();
-	v_pos_out.template deviceToHost<0>();
+	v_prp.template deviceToHost<0,1,2>();
+	v_pos.template deviceToHost<0>();
 
 	for (int i = 0 ; i < 10000 ; i++) // <------ particle id
 	{
@@ -937,9 +942,9 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort)
 		match &= v_prp_out.template get<2>(10000-i-1)[2] == v_prp.template get<2>(i)[2];
 
 
-		match &= v_pos_out.template get<0>(10000-i-1)[0] == 0;
-		match &= v_pos_out.template get<0>(10000-i-1)[1] == 0;
-		match &= v_pos_out.template get<0>(10000-i-1)[2] == 0;
+		match &= v_pos.template get<0>(10000-i-1)[0] == 0;
+		match &= v_pos.template get<0>(10000-i-1)[1] == 0;
+		match &= v_pos.template get<0>(10000-i-1)[2] == 0;
 	}
 
 	BOOST_REQUIRE_EQUAL(match,true);
@@ -948,11 +953,13 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort)
 																								 v_pos_out.toKernel(),v_prp_out.toKernel(),
 																								 ns_to_s.toKernel());
 
-	v_prp_out.template deviceToHost<0,1,2>();
-	v_pos_out.template deviceToHost<0>();
+	v_prp.template deviceToHost<0,1,2>();
+	v_pos.template deviceToHost<0>();
 
 	for (int i = 0 ; i < 10000 ; i++) // <------ particle id
 	{
+
+
 		match &= v_prp_out.template get<0>(10000-i-1)[0] == v_prp.template get<0>(i)[0];
 		match &= v_prp_out.template get<0>(10000-i-1)[1] == v_prp.template get<0>(i)[1];
 		match &= v_prp_out.template get<0>(10000-i-1)[2] == v_prp.template get<0>(i)[2];
@@ -1010,8 +1017,8 @@ BOOST_AUTO_TEST_CASE(vector_dist_gpu_map_fill_send_buffer_test)
     	v_prp.template get<2>(i)[2][2] = 340.0 + (float)rand()/RAND_MAX;
 
     	int seg = i / 10000;
-    	m_opart.template get<0>(i) = seg;
-    	m_opart.template get<1>(i) = (9999 - i%10000) + seg * 10000;
+    	m_opart.template get<1>(i) = seg;
+    	m_opart.template get<0>(i) = (9999 - i%10000) + seg * 10000;
     }
 
     m_pos.resize(10);
@@ -1045,24 +1052,24 @@ BOOST_AUTO_TEST_CASE(vector_dist_gpu_map_fill_send_buffer_test)
 
 		for (size_t j = 0 ; j < m_pos.get(i).size() ; j++)
 		{
-			match &= (m_pos.get(i).template get<0>(j)[0] == v_pos.template get<0>(m_opart.template get<1>(offset+j))[0]);
-			match &= (m_pos.get(i).template get<0>(j)[1] == v_pos.template get<0>(m_opart.template get<1>(offset+j))[1]);
-			match &= (m_pos.get(i).template get<0>(j)[2] == v_pos.template get<0>(m_opart.template get<1>(offset+j))[2]);
-
-			match &= (m_prp.get(i).template get<0>(j) == v_prp.template get<0>(m_opart.template get<1>(offset+j)));
-
-			match &= (m_prp.get(i).template get<1>(j)[0] == v_prp.template get<1>(m_opart.template get<1>(offset+j))[0]);
-			match &= (m_prp.get(i).template get<1>(j)[1] == v_prp.template get<1>(m_opart.template get<1>(offset+j))[1]);
-
-			match &= (m_prp.get(i).template get<2>(j)[0][0] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[0][0]);
-			match &= (m_prp.get(i).template get<2>(j)[0][1] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[0][1]);
-			match &= (m_prp.get(i).template get<2>(j)[0][2] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[0][2]);
-			match &= (m_prp.get(i).template get<2>(j)[1][0] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[1][0]);
-			match &= (m_prp.get(i).template get<2>(j)[1][1] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[1][1]);
-			match &= (m_prp.get(i).template get<2>(j)[1][2] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[1][2]);
-			match &= (m_prp.get(i).template get<2>(j)[2][0] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[2][0]);
-			match &= (m_prp.get(i).template get<2>(j)[2][1] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[2][1]);
-			match &= (m_prp.get(i).template get<2>(j)[2][2] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[2][2]);
+			match &= (m_pos.get(i).template get<0>(j)[0] == v_pos.template get<0>(m_opart.template get<0>(offset+j))[0]);
+			match &= (m_pos.get(i).template get<0>(j)[1] == v_pos.template get<0>(m_opart.template get<0>(offset+j))[1]);
+			match &= (m_pos.get(i).template get<0>(j)[2] == v_pos.template get<0>(m_opart.template get<0>(offset+j))[2]);
+
+			match &= (m_prp.get(i).template get<0>(j) == v_prp.template get<0>(m_opart.template get<0>(offset+j)));
+
+			match &= (m_prp.get(i).template get<1>(j)[0] == v_prp.template get<1>(m_opart.template get<0>(offset+j))[0]);
+			match &= (m_prp.get(i).template get<1>(j)[1] == v_prp.template get<1>(m_opart.template get<0>(offset+j))[1]);
+
+			match &= (m_prp.get(i).template get<2>(j)[0][0] == v_prp.template get<2>(m_opart.template get<0>(offset+j))[0][0]);
+			match &= (m_prp.get(i).template get<2>(j)[0][1] == v_prp.template get<2>(m_opart.template get<0>(offset+j))[0][1]);
+			match &= (m_prp.get(i).template get<2>(j)[0][2] == v_prp.template get<2>(m_opart.template get<0>(offset+j))[0][2]);
+			match &= (m_prp.get(i).template get<2>(j)[1][0] == v_prp.template get<2>(m_opart.template get<0>(offset+j))[1][0]);
+			match &= (m_prp.get(i).template get<2>(j)[1][1] == v_prp.template get<2>(m_opart.template get<0>(offset+j))[1][1]);
+			match &= (m_prp.get(i).template get<2>(j)[1][2] == v_prp.template get<2>(m_opart.template get<0>(offset+j))[1][2]);
+			match &= (m_prp.get(i).template get<2>(j)[2][0] == v_prp.template get<2>(m_opart.template get<0>(offset+j))[2][0]);
+			match &= (m_prp.get(i).template get<2>(j)[2][1] == v_prp.template get<2>(m_opart.template get<0>(offset+j))[2][1]);
+			match &= (m_prp.get(i).template get<2>(j)[2][2] == v_prp.template get<2>(m_opart.template get<0>(offset+j))[2][2]);
 		}
 
 		BOOST_REQUIRE_EQUAL(match,true);
diff --git a/src/Vector/cuda/vector_dist_cuda_funcs.cuh b/src/Vector/cuda/vector_dist_cuda_funcs.cuh
index 1e5a0f45..658f6002 100644
--- a/src/Vector/cuda/vector_dist_cuda_funcs.cuh
+++ b/src/Vector/cuda/vector_dist_cuda_funcs.cuh
@@ -63,7 +63,7 @@ __global__ void merge_sort_part(vector_pos_type vd_pos, vector_prp_type vd_prp,
 		vd_pos.template set<0>(p,v_pos_ord,nss.template get<0>(p));
 	}
 
-	vd_prp.template set<prp...>(p,vd_prp_ord,nss.template get<0>(p));
+	vd_prp.template set<prp ...>(p,vd_prp_ord,nss.template get<0>(p));
 }
 
 template<unsigned int dim, typename St, typename cartdec_gpu, typename particles_type, typename vector_out, typename prc_sz_type>
@@ -99,7 +99,7 @@ __global__  void find_buffer_offsets(vector_type vd, int * cnt, vector_type_offs
 	{
     	int i = atomicAdd(cnt, 1);
     	offs.template get<0>(i) = p+1;
-    	offs.template get<1>(i) = vd.template get<1>(p);
+    	offs.template get<1>(i) = vd.template get<prp_off>(p);
 	}
 }
 
@@ -184,12 +184,13 @@ __global__ void process_ghost_particles_local(vector_g_opart_type g_opart, vecto
     v_prp.set(base+i,v_prp.get(pid));
 }
 
-template<unsigned int dim, typename St, typename vector_of_box, typename vector_type,  typename output_type>
-__global__ void num_shift_ghost_each_part(vector_of_box box_f, vector_type vd,  output_type out)
+template<unsigned int dim, typename St, typename vector_of_box, typename vector_of_shifts, typename vector_type,  typename output_type>
+__global__ void num_shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_f_sv, vector_type vd,  output_type out, unsigned int g_m)
 {
+	unsigned int old_shift = (unsigned int)-1;
 	int p = threadIdx.x + blockIdx.x * blockDim.x;
 
-    if (p >= vd.size()) return;
+    if (p >= g_m) return;
 
     Point<dim,St> xp = vd.template get<0>(p);
 
@@ -197,8 +198,14 @@ __global__ void num_shift_ghost_each_part(vector_of_box box_f, vector_type vd,
 
     for (unsigned int i = 0 ; i < box_f.size() ; i++)
     {
-    	if (Box<dim,St>(box_f.get(i)).isInsideNP(xp) == true)
-    	{n++;}
+    	unsigned int shift_actual = box_f_sv.template get<0>(i);
+    	bool sw = (old_shift == shift_actual)?true:false;
+
+    	if (Box<dim,St>(box_f.get(i)).isInsideNP(xp) == true && sw == false)
+    	{
+    		old_shift = shift_actual;
+    		n++;
+    	}
     }
 
     out.template get<0>(p) = n;
@@ -217,6 +224,7 @@ __global__ void shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_
 		                              start_type start, shifts_type shifts,
 		                              output_type output, unsigned int offset)
 {
+	unsigned int old_shift = (unsigned int)-1;
 	int p = threadIdx.x + blockIdx.x * blockDim.x;
 
     if (p >= v_pos.size()) return;
@@ -231,20 +239,23 @@ __global__ void shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_
 
     for (unsigned int i = 0 ; i < box_f.size() ; i++)
     {
-    	if (Box<dim,St>(box_f.get(i)).isInsideNP(xp) == true)
+    	unsigned int shift_actual = box_f_sv.template get<0>(i);
+    	bool sw = (old_shift == shift_actual)?true:false;
+
+    	if (Box<dim,St>(box_f.get(i)).isInsideNP(xp) == true && sw == false)
     	{
-    		unsigned int shift_id = box_f_sv.template get<0>(i);
 
 #pragma unroll
     		for (unsigned int j = 0 ; j < dim ; j++)
     		{
-    			v_pos.template get<0>(base+n)[j] = xp.get(j) - shifts.template get<0>(shift_id)[j];
+    			v_pos.template get<0>(base+n)[j] = xp.get(j) - shifts.template get<0>(shift_actual)[j];
     			output.template get<0>(base_o+n) = p;
-    			output.template get<1>(base_o+n) = shift_id;
+    			output.template get<1>(base_o+n) = shift_actual;
     		}
 
     		v_prp.set(base+n,v_prp.get(p));
 
+    		old_shift = shift_actual;
     		n++;
     	}
     }
diff --git a/src/Vector/cuda/vector_dist_gpu_unit_tests.cu b/src/Vector/cuda/vector_dist_gpu_unit_tests.cu
index 03150957..988c735d 100644
--- a/src/Vector/cuda/vector_dist_gpu_unit_tests.cu
+++ b/src/Vector/cuda/vector_dist_gpu_unit_tests.cu
@@ -50,8 +50,6 @@ __global__  void calculate_force(vector_dist_ker<3, float, aggregate<float, floa
 
     auto it = cl.getNNIterator(cl.getCell(xp));
 
-    auto cell = cl.getCell(xp);
-
     Point<3,float> force1({0.0,0.0,0.0});
     Point<3,float> force2({0.0,0.0,0.0});
 
@@ -95,14 +93,10 @@ __global__  void calculate_force_full_sort(vector_dist_ker<3, float, aggregate<f
 	unsigned int p;
 	GET_PARTICLE_SORT(p,cl);
 
-	unsigned int ns_id = cl.getSortToNonSort().template get<0>(p);
-
 	Point<3,float> xp = vd.getPos(p);
 
     auto it = cl.getNNIterator(cl.getCell(xp));
 
-    auto cell = cl.getCell(xp);
-
     Point<3,float> force1({0.0,0.0,0.0});
 
     while (it.isNext())
@@ -159,6 +153,12 @@ bool check_force(CellList_type & NN_cpu, vector_type & vd)
 
 	    	// Normalize
 
+	    	if (r2.norm() == 0)
+	    	{
+	    		int debug = 0;
+	    		debug++;
+	    	}
+
 	    	r2 /= r2.norm();
 	    	force += vd.template getProp<0>(q)*r2;
 
@@ -314,15 +314,23 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
 
 	vector_dist_gpu<3,float,aggregate<float,float[3],float[3]>> vd(10000,domain,bc,g);
 
+	srand(55067*create_vcluster().rank());
+
 	auto it = vd.getDomainIterator();
 
 	while (it.isNext())
 	{
 		auto p = it.get();
 
-		vd.getPos(p)[0] = (float)rand() / RAND_MAX;
-		vd.getPos(p)[1] = (float)rand() / RAND_MAX;
-		vd.getPos(p)[2] = (float)rand() / RAND_MAX;
+		int x = rand();
+		int y = rand();
+		int z = rand();
+
+		vd.getPos(p)[0] = (float)x / RAND_MAX;
+		vd.getPos(p)[1] = (float)y / RAND_MAX;
+		vd.getPos(p)[2] = (float)z / RAND_MAX;
+
+		Point<3,float> xp = vd.getPos(p);
 
 		++it;
 	}
@@ -358,8 +366,6 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
 	BOOST_REQUIRE_EQUAL(noOut,true);
 	BOOST_REQUIRE_EQUAL(cnt,vd.size_local());
 
-	vd.write("test_out_gpu");
-
 	// now we offload all the properties
 
 	auto it3 = vd.getDomainIteratorGPU();
@@ -426,6 +432,7 @@ void vdist_calc_gpu_test()
 
 	vector_dist_gpu<3,St,aggregate<St,St[3],St[3]>> vd(1000,domain,bc,g);
 
+	srand(v_cl.rank()*10000);
 	auto it = vd.getDomainIterator();
 
 	while (it.isNext())
@@ -459,8 +466,6 @@ void vdist_calc_gpu_test()
 	vd.deviceToHostPos();
 	vd.template deviceToHostProp<0,1,2>();
 
-	vd.write("write_start");
-
 	// Reset the host part
 
 	auto it3 = vd.getDomainIterator();
@@ -542,9 +547,6 @@ void vdist_calc_gpu_test()
 		vd.deviceToHostPos();
 		vd.template deviceToHostProp<0,1,2>();
 
-		vd.write_frame("write_ggg",i);
-
-
 		// To test we copy on a cpu distributed vector and we do a map
 
 		vector_dist<3,St,aggregate<St,St[3],St[3]>> vd_cpu(vd.getDecomposition().template duplicate_convert<HeapMemory,memory_traits_lin>(),0);
@@ -620,6 +622,8 @@ void vdist_calc_gpu_test()
 		cpu_sort.resize(vd_cpu.size_local_with_ghost() - vd_cpu.size_local());
 		gpu_sort.resize(vd.size_local_with_ghost() - vd.size_local());
 
+		BOOST_REQUIRE_EQUAL(cpu_sort.size(),gpu_sort.size());
+
 		size_t cnt = 0;
 
 		auto itc2 = vd.getGhostIterator();
diff --git a/src/Vector/vector_dist.hpp b/src/Vector/vector_dist.hpp
index a3287a83..7efbcfa5 100644
--- a/src/Vector/vector_dist.hpp
+++ b/src/Vector/vector_dist.hpp
@@ -1192,6 +1192,7 @@ public:
 		cell_list.template construct<decltype(v_pos),decltype(v_prp)>(v_pos,v_pos_out,v_prp,v_prp_out,v_cl.getmgpuContext(),g_m);
 
 		cell_list.set_ndec(getDecomposition().get_ndec());
+		cell_list.set_gm(g_m);
 
 		return cell_list;
 	}
diff --git a/src/Vector/vector_dist_comm.hpp b/src/Vector/vector_dist_comm.hpp
index 3acb334f..5f9dd10c 100644
--- a/src/Vector/vector_dist_comm.hpp
+++ b/src/Vector/vector_dist_comm.hpp
@@ -8,7 +8,7 @@
 #ifndef SRC_VECTOR_VECTOR_DIST_COMM_HPP_
 #define SRC_VECTOR_VECTOR_DIST_COMM_HPP_
 
-//#define TEST1
+#define TEST1
 
 #if defined(CUDA_GPU) && defined(__NVCC__)
 #include "util/cuda/moderngpu/kernel_mergesort.hxx"
@@ -285,6 +285,21 @@ class vector_dist_comm
 		if (shift_box_ndec == (long int)dec.get_ndec())
 			return;
 
+		struct sh_box
+		{
+			size_t shift_id;
+
+			unsigned int box_f_sv;
+			Box<dim,St> box_f_dev;
+
+			bool operator<(const sh_box & tmp)
+			{
+				return shift_id < tmp.shift_id;
+			}
+
+		};
+		openfpm::vector<struct sh_box> reord_shift;
+
 		// Add local particles coming from periodic boundary, the only boxes that count are the one
 		// touching the border
 		for (size_t i = 0; i < dec.getNLocalSub(); i++)
@@ -307,23 +322,32 @@ class vector_dist_comm
 					box_f.last().add(dec.getLocalIGhostBox(i, j));
 					box_cmb.add(dec.getLocalIGhostPos(i, j));
 					map_cmb[dec.getLocalIGhostPos(i, j).lin()] = box_f.size() - 1;
-
-					box_f_dev.add(dec.getLocalIGhostBox(i, j));
-					box_f_sv.add();
-					box_f_sv.template get<0>(box_f_sv.size()-1) = dec.convertShift(dec.getLocalIGhostPos(i, j));
 				}
 				else
 				{
 					// we have it
 					box_f.get(it->second).add(dec.getLocalIGhostBox(i, j));
-
-					box_f_dev.add(dec.getLocalIGhostBox(i, j));
-					box_f_sv.template get<0>(box_f_sv.size()-1) = dec.convertShift(dec.getLocalIGhostPos(i, j));
 				}
 
+				reord_shift.add();
+				reord_shift.last().shift_id = dec.getLocalIGhostPos(i, j).lin();
+				reord_shift.last().box_f_dev = dec.getLocalIGhostBox(i, j);
+				reord_shift.last().box_f_sv = dec.convertShift(dec.getLocalIGhostPos(i, j));
 			}
 		}
 
+		// now we sort box_f by shift_id, the reason is that we have to avoid duplicated particles
+		reord_shift.sort();
+
+		box_f_dev.resize(reord_shift.size());
+		box_f_sv.resize(reord_shift.size());
+
+		for (size_t i = 0 ; i < reord_shift.size() ; i++)
+		{
+			box_f_dev.get(i) = reord_shift.get(i).box_f_dev;
+			box_f_sv.template get<0>(i) = reord_shift.get(i).box_f_sv;
+		}
+
 #ifdef CUDA_GPU
 
 		// move box_f_dev and box_f_sv to device
@@ -547,7 +571,7 @@ class vector_dist_comm
 		const openfpm::vector<Point<dim,St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & shifts = dec.getShiftVectors();
 
 		// create a number of send buffers equal to the near processors
-		g_pos_send.resize(g_opart.size());
+		g_pos_send.resize(prc_sz.size());
 
 		resize_retained_buffer(hsmem,g_pos_send.size());
 
@@ -894,7 +918,7 @@ class vector_dist_comm
 
 			size_t offset = prc_sz.template get<0>(0);
 
-			// Fill the sending fuffers
+			// Fill the sending buffers
 			for (size_t i = 0 ; i < m_pos.size() ; i++)
 			{
 				auto ite = m_pos.get(i).getGPUIterator();
@@ -1074,6 +1098,7 @@ class vector_dist_comm
 			<<<ite.wthr,ite.thr>>>
 			(dec.toKernel(),v_pos.toKernel(),lbl_p.toKernel(),prc_sz.toKernel(),v_cl.rank());
 
+
 			#ifndef TEST1
 
 			// sort particles
@@ -1123,9 +1148,10 @@ class vector_dist_comm
 		{
 			// reset lbl_p
 			lbl_p.clear();
+			prc_sz_gg.clear();
 			o_part_loc.clear();
 			g_opart.clear();
-			g_opart.resize(dec.getNNProcessors());
+			prc_g_opart.clear();
 
 			// resize the label buffer
 			prc_sz.template fill<0>(0);
diff --git a/src/Vector/vector_dist_kernel.hpp b/src/Vector/vector_dist_kernel.hpp
index e3c332cf..d4f0a883 100644
--- a/src/Vector/vector_dist_kernel.hpp
+++ b/src/Vector/vector_dist_kernel.hpp
@@ -12,7 +12,7 @@
 
 #define POS_PROP -1
 
-#define GET_PARTICLE(vd) blockDim.x*blockIdx.x + threadIdx.x; if (blockDim.x*blockIdx.x + threadIdx.x > vd.size()) {return;};
+#define GET_PARTICLE(vd) blockDim.x*blockIdx.x + threadIdx.x; if (blockDim.x*blockIdx.x + threadIdx.x >= vd.size()) {return;};
 #define GET_PARTICLE_SORT(p,NN) if (blockDim.x*blockIdx.x + threadIdx.x >= NN.get_g_m()) {return;}\
 							  else{p = NN.getDomainSortIds().template get<0>(blockDim.x*blockIdx.x + threadIdx.x);}
 
-- 
GitLab