From b8e2251bb308666030f72d9c17e1d1c31f29ff04 Mon Sep 17 00:00:00 2001
From: Pietro Incardona <incardon@mpi-cbg.de>
Date: Tue, 3 Dec 2019 02:55:34 +0100
Subject: [PATCH] ghost_get for sgrid_dist_id on GPU working

---
 openfpm_data                                  |  2 +-
 src/Grid/grid_dist_id.hpp                     | 34 +++----
 src/Grid/grid_dist_id_comm.hpp                |  8 ++
 src/Grid/grid_dist_key.hpp                    | 16 +--
 .../tests/sgrid_dist_id_gpu_unit_tests.cu     | 98 +++++++++++++++++--
 5 files changed, 115 insertions(+), 43 deletions(-)

diff --git a/openfpm_data b/openfpm_data
index bcdcfeb82..aab95db8e 160000
--- a/openfpm_data
+++ b/openfpm_data
@@ -1 +1 @@
-Subproject commit bcdcfeb82154f588c992b0c36ff1013a87a650e6
+Subproject commit aab95db8e340d0e9a8ce0094c0a5b832e0f05b8d
diff --git a/src/Grid/grid_dist_id.hpp b/src/Grid/grid_dist_id.hpp
index e3dcad616..504683791 100644
--- a/src/Grid/grid_dist_id.hpp
+++ b/src/Grid/grid_dist_id.hpp
@@ -124,8 +124,8 @@ class grid_dist_id : public grid_dist_id_comm<dim,St,T,Decomposition,Memory,devi
 	//! Structure that divide the space into cells
 	CellDecomposer_sm<dim,St,shift<dim,St>> cd_sm;
 
-	//! size of the insert pool on gpu
-	size_t gpu_insert_pool_size;
+	//! number of insert each GPU thread does
+	size_t gpu_n_insert_thread;
 
 	//! Communicator class
 	Vcluster<> & v_cl;
@@ -2679,21 +2679,24 @@ public:
 
 #ifdef __NVCC__
 
-	/*! \brief Set the size of the gpu insert buffer pool
+	/*! \brief Set the number inserts each GPU thread do
 	 *
-	 * Indicate the maximum number of inserts each GPU block can do
-	 *
-	 * \param size of the insert pool
+	 * \param n_ins number of insert per thread
 	 *
 	 */
-	void setInsertBuffer(size_t n_pool)
+	void setNumberOfInsertPerThread(size_t n_ins)
 	{
-		gpu_insert_pool_size = n_pool;
+		gpu_n_insert_thread = n_ins;
 	}
 
 	template<typename func_t,typename it_t, typename ... args_t>
 	void iterateGridGPU(it_t & it, args_t ... args)
 	{
+		// setGPUInsertBuffer must be called in anycase even with 0 points to insert
+		// the loop "it.isNextGrid()" does not guarantee to call it for all local grids
+		for (size_t i = 0 ; i < loc_grid.size() ; i++)
+		{loc_grid.get(i).setGPUInsertBuffer(0ul,1ul);}
+
 		while(it.isNextGrid())
 		{
 			Box<dim,size_t> b = it.getGridBox();
@@ -2702,7 +2705,7 @@ public:
 
 			auto ite = loc_grid.get(i).getGridGPUIterator(b.getKP1int(),b.getKP2int());
 
-			loc_grid.get(i).setGPUInsertBuffer(ite.nblocks(),gpu_insert_pool_size);
+			loc_grid.get(i).setGPUInsertBuffer(ite.nblocks(),ite.nthrs());
 			loc_grid.get(i).initializeGPUInsertBuffer();
 
 			ite_gpu_dist<dim> itd = ite;
@@ -2713,23 +2716,12 @@ public:
 				itd.start_base.set_d(j,0);
 			}
 
-			grid_apply_functor<<<ite.wthr,ite.thr>>>(loc_grid.get(i).toKernel(),itd,func_t(),args...);
+			CUDA_LAUNCH((grid_apply_functor),ite,loc_grid.get(i).toKernel(),itd,func_t(),args...);
 
 			it.nextGrid();
 		}
 	}
 
-	template<typename func_t, typename ... args_t>
-	void iterateGPU(args_t ... args)
-	{
-		for (int i = 0 ; i < loc_grid.size() ; i++)
-		{
-			auto & sp = loc_grid.get(i);
-
-			// TODO Launch a kernel on every sparse grid GPU
-		}
-	}
-
 	/*! \brief Move the memory from the device to host memory
 	 *
 	 */
diff --git a/src/Grid/grid_dist_id_comm.hpp b/src/Grid/grid_dist_id_comm.hpp
index 2a11a3797..ea419e4e5 100644
--- a/src/Grid/grid_dist_id_comm.hpp
+++ b/src/Grid/grid_dist_id_comm.hpp
@@ -201,6 +201,9 @@ class grid_dist_id_comm
 											  const grid_sm<dim,void> & ginfo,
 											  bool use_bx_def)
 	{
+		for (size_t i = 0 ; i < loc_grid.size() ; i++)
+		{loc_grid.get(i).packReset();}
+
 		grid_key_dx<dim> cnt[1];
 		cnt[0].zero();
 
@@ -253,6 +256,11 @@ class grid_dist_id_comm
 				}
 			}
 		}
+
+		for (size_t i = 0 ; i < loc_grid.size() ; i++)
+		{
+			loc_grid.get(i).template removeCopyToFinalize<prp ...>(v_cl.getmgpuContext());
+		}
 	}
 
 	/*! \brief Sync the local ghost part
diff --git a/src/Grid/grid_dist_key.hpp b/src/Grid/grid_dist_key.hpp
index 96224dc12..666e2bd32 100644
--- a/src/Grid/grid_dist_key.hpp
+++ b/src/Grid/grid_dist_key.hpp
@@ -9,10 +9,10 @@ template<bool impl, typename grid_key_base, unsigned int dim>
 class move_impl
 {
 public:
-	static grid_dist_key_dx<dim> move(grid_key_base & key, size_t sub, size_t i, size_t s)
+	static grid_dist_key_dx<dim,grid_key_base> move(grid_key_base & key, size_t sub, size_t i, int s)
 	{
 		key.set_d(i,key.get(i) + s);
-		return grid_dist_key_dx<dim>(sub,key);
+		return grid_dist_key_dx<dim,grid_key_base>(sub,key);
 	}
 };
 
@@ -20,7 +20,7 @@ template<typename grid_key_base, unsigned int dim>
 class move_impl<false,grid_key_base,dim>
 {
 public:
-	static grid_dist_key_dx<dim> move(grid_key_base & key, size_t sub, size_t i, size_t s)
+	static grid_dist_key_dx<dim> move(grid_key_base & key, size_t sub, size_t i, int s)
 	{
 		std::cout << __FILE__ << ":" << __LINE__ << " Error move a key is not supported"
 				" directly acting on the grid key, please use the move function from the grid method" << std::endl;
@@ -119,19 +119,13 @@ public:
 	 * \return new key
 	 *
 	 */
-	inline grid_dist_key_dx<dim> move(size_t i,size_t s) const
+	inline grid_dist_key_dx<dim,base_key> move(size_t i,int s) const
 	{
-//		std::is_same<base_key,grid_key_dx<dim>>::value
-
 		auto key = getKey();
 
-		return move_impl<std::is_same<base_key,grid_key_dx<dim>>::value,
+		return move_impl<has_set_d<base_key>::value,
 				 decltype(this->getKey()),
 				 dim>::move(key,getSub(),i,s);
-
-/*		grid_key_dx<dim> key = getKey();
-		key.set_d(i,key.get(i) + s);
-		return grid_dist_key_dx<dim>(getSub(),key);*/
 	}
 
 	/*! \brief Create a new key moving the old one
diff --git a/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu b/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
index eb1ecd752..f4a3d0b87 100644
--- a/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
+++ b/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
@@ -71,8 +71,6 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_base )
 
 	/////// GPU Run kernel
 
-	gdist.setInsertBuffer(1);
-
 	float c = 5.0;
 
 	gdist.template iterateGridGPU<insert_kernel2D<0>>(it,c);
@@ -110,7 +108,6 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_base )
 	Box<2,size_t> box3({3,3},{11,11});
 
 	auto it3 = gdist.getGridIterator(box3.getKP1(),box3.getKP2());
-	gdist.setInsertBuffer(128);
 
 	gdist.template iterateGridGPU<insert_kernel2D<0>>(it3,c);
 	gdist.template flush<smax_<0>>(flush_type::FLUSH_ON_DEVICE);
@@ -173,8 +170,6 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_output )
 
 	/////// GPU Run kernel
 
-	gdist.setInsertBuffer(128);
-
 	float c = 5.0;
 
 	gdist.template iterateGridGPU<insert_kernel2D<0>>(it,c);
@@ -192,9 +187,8 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_output )
 	BOOST_REQUIRE_EQUAL(true,test);
 }
 
-BOOST_AUTO_TEST_CASE( sgrid_gpu_test_ghost_get )
+void sgrid_ghost_get(size_t (& sz)[2],size_t (& sz2)[2])
 {
-	size_t sz[2] = {17,17};
 	periodicity<2> bc = {PERIODIC,PERIODIC};
 
 	Ghost<2,long int> g(1);
@@ -207,13 +201,11 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_ghost_get )
 
 	/////// GPU insert + flush
 
-	Box<2,size_t> box({1,1},{15,15});
+	Box<2,size_t> box({1,1},{sz2[0],sz2[1]});
 	auto it = gdist.getGridIterator(box.getKP1(),box.getKP2());
 
 	/////// GPU Run kernel
 
-	gdist.setInsertBuffer(225);
-
 	float c = 5.0;
 
 	gdist.template iterateGridGPU<insert_kernel2D<0>>(it,c);
@@ -225,7 +217,93 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_ghost_get )
 	gdist.template ghost_get<0>(RUN_ON_DEVICE);
 
 	gdist.template deviceToHost<0>();
+
+	// Now we check that ghost is correct
+
+	auto it2 = gdist.getDomainIterator();
+
+	bool match = true;
+
+	while (it2.isNext())
+	{
+		auto p = it2.get();
+
+		auto key = it2.getGKey(p);
+
+		auto p_xp1 = p.move(0,1);
+		auto p_xm1 = p.move(0,-1);
+		auto p_yp1 = p.move(1,1);
+		auto p_ym1 = p.move(1,-1);
+
+		auto key_xp1 = key.move(0,1);
+		auto key_xm1 = key.move(0,-1);
+		auto key_yp1 = key.move(1,1);
+		auto key_ym1 = key.move(1,-1);
+
+		if (box.isInside(key_xp1.toPoint()))
+		{
+			match &= gdist.template get<0>(p_xp1) == c + key_xp1.get(0) + key_xp1.get(1);
+
+			if (match == false)
+			{
+				std::cout << gdist.template get<0>(p_xp1) << "   " << c + key_xp1.get(0) + key_xp1.get(1) << std::endl;
+				break;
+			}
+		}
+
+		if (box.isInside(key_xm1.toPoint()))
+		{
+			match &= gdist.template get<0>(p_xm1) == c + key_xm1.get(0) + key_xm1.get(1);
+
+			if (match == false)
+			{
+				std::cout << gdist.template get<0>(p_xm1) << "   " << c + key_xm1.get(0) + key_xm1.get(1) << std::endl;
+				break;
+			}
+		}
+
+		if (box.isInside(key_yp1.toPoint()))
+		{
+			match &= gdist.template get<0>(p_yp1) == c + key_yp1.get(0) + key_yp1.get(1);
+
+			if (match == false)
+			{
+				std::cout << gdist.template get<0>(p_yp1) << "   " << c + key_yp1.get(0) + key_yp1.get(1) << std::endl;
+				break;
+			}
+		}
+
+		if (box.isInside(key_ym1.toPoint()))
+		{
+			match &= gdist.template get<0>(p_ym1) == c + key_ym1.get(0) + key_ym1.get(1);
+
+			if (match == false)
+			{
+				std::cout << gdist.template get<0>(p_ym1) << "   " << c + key_ym1.get(0) + key_ym1.get(1) << std::endl;
+				break;
+			}
+		}
+
+		++it2;
+	}
+
 	gdist.write("after_ghost");
+
+	BOOST_REQUIRE_EQUAL(match,true);
+}
+
+BOOST_AUTO_TEST_CASE( sgrid_gpu_test_ghost_get )
+{
+	size_t sz[2] = {17,17};
+	size_t sz6[2] = {15,15};
+	sgrid_ghost_get(sz,sz6);
+
+	size_t sz2[2] = {170,170};
+	size_t sz3[2] = {15,15};
+	sgrid_ghost_get(sz2,sz3);
+
+	size_t sz4[2] = {168,168};
+	sgrid_ghost_get(sz2,sz4);
 }
 
 
-- 
GitLab