From 635c72dee226ba2acf97cd2f3bbfeb1e875e05e7 Mon Sep 17 00:00:00 2001
From: Pietro Incardona <incardon@mpi-cbg.de>
Date: Sat, 2 Nov 2019 20:03:32 +0100
Subject: [PATCH] Adding funcionalities + speed up some others

---
 example/Vector/7_SPH_dlb_gpu/main.cu          |   2 +-
 example/Vector/7_SPH_dlb_gpu_opt/main.cu      |   2 +-
 openfpm_data                                  |   2 +-
 src/Vector/cuda/vector_dist_cuda_funcs.cuh    | 104 ++++++++++--------
 src/Vector/cuda/vector_dist_gpu_unit_tests.cu |  80 +++++---------
 src/Vector/vector_dist.hpp                    |  36 ++++--
 src/Vector/vector_dist_kernel.hpp             |   4 +
 7 files changed, 116 insertions(+), 114 deletions(-)

diff --git a/example/Vector/7_SPH_dlb_gpu/main.cu b/example/Vector/7_SPH_dlb_gpu/main.cu
index fbf069a98..08d91fbe2 100644
--- a/example/Vector/7_SPH_dlb_gpu/main.cu
+++ b/example/Vector/7_SPH_dlb_gpu/main.cu
@@ -91,7 +91,7 @@ const real_number MassBound = 0.000614125;
 #ifdef TEST_RUN
 const real_number t_end = 0.001;
 #else
-const real_number t_end = 0.003;
+const real_number t_end = 1.5;
 #endif
 
 // Gravity acceleration
diff --git a/example/Vector/7_SPH_dlb_gpu_opt/main.cu b/example/Vector/7_SPH_dlb_gpu_opt/main.cu
index fa6c0ea2a..74b6f3c32 100644
--- a/example/Vector/7_SPH_dlb_gpu_opt/main.cu
+++ b/example/Vector/7_SPH_dlb_gpu_opt/main.cu
@@ -99,7 +99,7 @@ const real_number MassBound = 0.0000767656;
 #ifdef TEST_RUN
 const real_number t_end = 0.001;
 #else
-const real_number t_end = 1.5;
+const real_number t_end = 0.003;
 #endif
 
 // Gravity acceleration
diff --git a/openfpm_data b/openfpm_data
index f8ea1b875..a83fa7208 160000
--- a/openfpm_data
+++ b/openfpm_data
@@ -1 +1 @@
-Subproject commit f8ea1b875c24392c1a6991f4faa000e2981e96d1
+Subproject commit a83fa72085b240324d58ee5583d53ca241a608a8
diff --git a/src/Vector/cuda/vector_dist_cuda_funcs.cuh b/src/Vector/cuda/vector_dist_cuda_funcs.cuh
index e986746e5..b42cb9351 100644
--- a/src/Vector/cuda/vector_dist_cuda_funcs.cuh
+++ b/src/Vector/cuda/vector_dist_cuda_funcs.cuh
@@ -298,8 +298,8 @@ __global__ void create_index(vector_type vd)
     vd.template get<0>(i) = i;
 }
 
-template<unsigned int dim, typename vector_pos_type, typename vector_prp_type, typename scan_type>
-__global__ void copy_new_to_old(vector_pos_type vd_pos_dst, vector_prp_type vd_prp_dst, vector_pos_type vd_pos_src, vector_prp_type vd_prp_src, scan_type idx)
+template<unsigned int dim, typename vector_pos_type, typename vector_prp_type, typename ids_type>
+__global__ void copy_new_to_old(vector_pos_type vd_pos_dst, vector_prp_type vd_prp_dst, vector_pos_type vd_pos_src, vector_prp_type vd_prp_src, ids_type idx)
 {
     int i = threadIdx.x + blockIdx.x * blockDim.x;
 
@@ -311,6 +311,34 @@ __global__ void copy_new_to_old(vector_pos_type vd_pos_dst, vector_prp_type vd_p
     vd_prp_dst.set(i,vd_prp_src,idx.template get<0>(i));
 }
 
+template<unsigned int dim, unsigned int prp, typename vector_pos_type, typename vector_prp_type, typename scan_type>
+__global__ void copy_new_to_old_by_scan(vector_pos_type vd_pos_dst, vector_prp_type vd_prp_dst, vector_pos_type vd_pos_src, vector_prp_type vd_prp_src, scan_type scan)
+{
+    int i = threadIdx.x + blockIdx.x * blockDim.x;
+
+    if (i >= scan.size()) return;
+
+    auto sc = scan.template get<0>(i);
+
+    if (vd_prp_src.template get<prp>(i) == 0) return;
+
+    for (unsigned int k = 0 ; k < dim ; k++)
+    {vd_pos_dst.template get<0>(sc)[k] = vd_pos_src.template get<0>(i)[k];}
+
+    vd_prp_dst.set(sc,vd_prp_src,i);
+}
+
+
+template<unsigned int prp,typename vector_type>
+__global__ void flip_one_to_zero(vector_type vd)
+{
+    int i = threadIdx.x + blockIdx.x * blockDim.x;
+
+    if (i >= vd.size_local()) return;
+
+    vd.template getProp<prp>(i) = (vd.template getProp<prp>(i) == 0);
+}
+
 /*! \brief Remove the particles marked on the properties prp (particles marked has has property set to 1, the others to 0)
  *
  * \warning the function is destructive on prp, it mean that after destruction the prp of the particles can contain garbage
@@ -336,65 +364,49 @@ void remove_marked(vector_type & vd)
 
 	typedef typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<prp>>::type remove_type;
 
-	// first we do a scan of the property
-	openfpm::vector_gpu<aggregate<unsigned int>> idx;
+	// because we mark the one to remove we flip the one to zero and the zeros to one
 
-	idx.setMemory(mem_tmp);
-	idx.resize(vd.size_local());
+	auto ite = vd.getDomainIteratorGPU();
 
-	auto ite = idx.getGPUIterator();
+	CUDA_LAUNCH((flip_one_to_zero<prp>),ite,vd.toKernel());
 
-	CUDA_LAUNCH(create_index,ite,idx.toKernel());
+	// first we scan
 
-	// sort particles, so the particles to remove stay at the end
-	mergesort((remove_type *)vd.getPropVector().template getDeviceBuffer<prp>(),(unsigned int *)idx.template getDeviceBuffer<0>(), idx.size(), mgpu::template less_t<remove_type>(), vd.getVC().getmgpuContext());
+	openfpm::vector_gpu<aggregate<remove_type>> idx;
 
-	openfpm::vector_gpu<aggregate<int>> mark;
-	mark.resize(1);
+	idx.setMemory(mem_tmp);
+	idx.resize(vd.size_local());
 
-	CudaMemory mem;
-	mem.allocate(sizeof(int));
-	mem.fill(0);
+	openfpm::scan((remove_type *)vd.getPropVector().template getDeviceBuffer<prp>(),vd.size_local(),(remove_type *)idx.template getDeviceBuffer<0>(),vd.getVC().getmgpuContext());
 
-	// mark point, particle that stay and to remove
-	CUDA_LAUNCH((find_buffer_offsets_no_prc<prp,decltype(vd.getPropVector().toKernel()),decltype(mark.toKernel())>),ite,
-			     vd.getPropVector().toKernel(),(int *)mem.getDevicePointer(),mark.toKernel(),vd.size_local());
+	// Check if we marked something
 
-	mem.deviceToHost();
+	idx.template deviceToHost<0>(idx.size()-1,idx.size()-1);
+	vd.template deviceToHostProp<prp>(vd.size_local()-1,vd.size_local()-1);
 
-	// we have no particles to remove
-	if (*(int *)mem.getPointer() != 1)
+	int n_marked = vd.size_local() - (vd.template getProp<prp>(vd.size_local()-1) + idx.template get<0>(idx.size()-1));
+
+	if (n_marked == 0)
 	{
-		if (*(int *)mem.getPointer() >= 2)
-		{
-			std::cout << __FILE__ << ":" << __LINE__ << " error: removing marked particle. Carefull particle must be marked with 1 or 0, no other numbers" << std::endl;
-		}
+		// No particle has been marked Nothing to do
+
 		return;
 	}
 
-	// Get the mark point
-	mark.template deviceToHost<0>();
-
-	// than create an equivalent buffer prop and pos
+	/////////////////////////////////
 
 	typename std::remove_reference<decltype(vd.getPosVector())>::type vd_pos_new;
 	typename std::remove_reference<decltype(vd.getPropVector())>::type vd_prp_new;
 
 	// resize them
 
-	vd_pos_new.resize(mark.template get<0>(0));
-	vd_prp_new.resize(mark.template get<0>(0));
+	vd_pos_new.resize(vd.size_local() - n_marked);
+	vd_prp_new.resize(vd.size_local() - n_marked);
 
 	auto & vd_pos_old = vd.getPosVector();
 	auto & vd_prp_old = vd.getPropVector();
 
-	// now we copy from the old vector to the new one
-
-	ite = vd_pos_old.getGPUIterator();
-
-	CUDA_LAUNCH((copy_new_to_old<vector_type::dims>),ite,vd_pos_new.toKernel(),vd_prp_new.toKernel(),vd_pos_old.toKernel(),vd_prp_old.toKernel(),idx.toKernel());
-
-	// and we swap
+	CUDA_LAUNCH((copy_new_to_old_by_scan<vector_type::dims,prp>),ite,vd_pos_new.toKernel(),vd_prp_new.toKernel(),vd_pos_old.toKernel(),vd_prp_old.toKernel(),idx.toKernel());
 
 	vd.set_g_m(vd_pos_new.size());
 
@@ -405,9 +417,11 @@ void remove_marked(vector_type & vd)
 template<unsigned int prp, typename functor, typename particles_type, typename out_type>
 __global__ void mark_indexes(particles_type vd, out_type out)
 {
-	auto a = GET_PARTICLE(vd);
+	unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;
+
+	if (p >= vd.size())	{return;}
 
-	out.template getProp<0>(a) = functor::check(vd.template getProp<prp>(a)) == true;
+	out.template get<0>(p) = functor::check(vd.template get<prp>(p)) == true;
 }
 
 template<typename out_type, typename ids_type>
@@ -435,20 +449,20 @@ __global__ void fill_indexes(out_type scan, ids_type ids)
  * \param vd distributed vector
  *
  */
-template<typename functor, typename vector_type, typename ids_type>
-void get_indexes_sorted(vector_type & vd, ids_type & ids, mgpu::ofp_context_t & context)
+template<unsigned int prp, typename functor, typename vector_type, typename ids_type>
+void get_indexes_by_type(vector_type & vd, ids_type & ids, mgpu::ofp_context_t & context)
 {
 	// first we do a scan of the property
 	openfpm::vector_gpu<aggregate<unsigned int>> scan;
 
 	scan.setMemory(mem_tmp);
-	scan.resize(vd.size_local_with_ghost()+1);
+	scan.resize(vd.size()+1);
 
 	auto ite = scan.getGPUIterator();
 
-	CUDA_LAUNCH(mark_indexes,ite,vd.toKernel(),scan.toKernel());
+	CUDA_LAUNCH((mark_indexes<prp,functor>),ite,vd.toKernel(),scan.toKernel());
 
-	openfpm::scan(scan.template getDeviceBuffer<0>(),scan.size(),scan.template getDeviceBuffer<0>(),context);
+	openfpm::scan((unsigned int *)scan.template getDeviceBuffer<0>(),scan.size(),(unsigned int *)scan.template getDeviceBuffer<0>(),context);
 
 	// get the number of marked particles
 	scan.template deviceToHost<0>(scan.size()-1,scan.size()-1);
diff --git a/src/Vector/cuda/vector_dist_gpu_unit_tests.cu b/src/Vector/cuda/vector_dist_gpu_unit_tests.cu
index daa90b634..56994bb9a 100644
--- a/src/Vector/cuda/vector_dist_gpu_unit_tests.cu
+++ b/src/Vector/cuda/vector_dist_gpu_unit_tests.cu
@@ -1488,6 +1488,14 @@ BOOST_AUTO_TEST_CASE(vector_dist_keep_prop_on_cuda)
 	}
 }
 
+struct type_is_one
+{
+	__device__ static bool check(int c)
+	{
+		return c == 1;
+	}
+};
+
 BOOST_AUTO_TEST_CASE(vector_dist_get_index_set)
 {
 	Box<3,double> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
@@ -1497,6 +1505,8 @@ BOOST_AUTO_TEST_CASE(vector_dist_get_index_set)
 	if (create_vcluster().size() >= 16)
 	{return;}
 
+	Vcluster<> & v_cl = create_vcluster();
+
 	vector_dist_gpu<3,double,aggregate<int,double>> vdg(10000,domain,bc,g,DEC_GRAN(128));
 
 	auto it = vdg.getDomainIterator();
@@ -1521,70 +1531,30 @@ BOOST_AUTO_TEST_CASE(vector_dist_get_index_set)
 	vdg.hostToDeviceProp<0,1>();
 	vdg.hostToDevicePos();
 
-/*	bool test = vdg.compareHostAndDevicePos(0.00001,0.00000001);
-	BOOST_REQUIRE_EQUAL(test,true);
-
-	vdg.getPos(100)[0] = 0.99999999;
-
-	test = vdg.compareHostAndDevicePos(0.00001,0.00000001);
-	BOOST_REQUIRE_EQUAL(test,false);
-
-	vdg.hostToDevicePos();
-	vdg.getPos(100)[0] = 0.99999999;
-
-	test = vdg.compareHostAndDevicePos(0.00001,0.00000001);
-	BOOST_REQUIRE_EQUAL(test,true);
-
-	////////////////////////////////////////////////// PROP VECTOR
+	auto cl = vdg.getCellListGPU(0.1);
 
-	test = vdg.compareHostAndDeviceProp<1>(0.00001,0.00000001);
-	BOOST_REQUIRE_EQUAL(test,true);
-
-	vdg.getProp<1>(103)[0] = 0.99999999;
+	// than we get a cell-list to force reorder
 
-	test = vdg.compareHostAndDeviceProp<1>(0.00001,0.00000001);
-	BOOST_REQUIRE_EQUAL(test,false);
+	openfpm::vector_gpu<aggregate<unsigned int>> ids;
 
-	vdg.hostToDeviceProp<1>();
-	vdg.getProp<1>(103)[0] = 0.99999999;
+	get_indexes_by_type<0,type_is_one>(vdg.getPropVectorSort(),ids,v_cl.getmgpuContext());
 
-	test = vdg.compareHostAndDeviceProp<1>(0.00001,0.00000001);
-	BOOST_REQUIRE_EQUAL(test,true);
+	// test
 
-	////////////////////////////////////////////////// PROP scalar
+	ids.template deviceToHost<0>();
 
+	auto & vs = vdg.getPropVectorSort();
+	vs.template deviceToHost<0>();
 
-	test = vdg.compareHostAndDeviceProp<0>(0.00001,0.00000001);
-	BOOST_REQUIRE_EQUAL(test,true);
-
-	vdg.getProp<0>(105) = 0.99999999;
-
-	test = vdg.compareHostAndDeviceProp<0>(0.00001,0.00000001);
-	BOOST_REQUIRE_EQUAL(test,false);
-
-	vdg.hostToDeviceProp<0>();
-	vdg.getProp<0>(105) = 0.99999999;
-
-	test = vdg.compareHostAndDeviceProp<0>(0.00001,0.00000001);
-	BOOST_REQUIRE_EQUAL(test,true);
-
-
-	////////////////////////////////////////////////// PROP scalar
-
-
-	test = vdg.compareHostAndDeviceProp<2>(0.00001,0.00000001);
-	BOOST_REQUIRE_EQUAL(test,true);
-
-	vdg.getProp<2>(108)[1][2] = 0.99999999;
-
-	test = vdg.compareHostAndDeviceProp<2>(0.00001,0.00000001);
-	BOOST_REQUIRE_EQUAL(test,false);
+	bool match = true;
 
-	vdg.hostToDeviceProp<2>();
-	vdg.getProp<2>(108)[1][2] = 0.99999999;
+	for (int i = 0 ; i < ids.size() ; i++)
+	{
+		if (vs.template get<0>(ids.template get<0>(i)) != 1)
+		{match = false;}
+	}
 
-	test = vdg.compareHostAndDeviceProp<2>(0.00001,0.00000001);
-	BOOST_REQUIRE_EQUAL(test,true);*/
+	BOOST_REQUIRE_EQUAL(match,true);
 }
 
 BOOST_AUTO_TEST_CASE(vector_dist_compare_host_device)
diff --git a/src/Vector/vector_dist.hpp b/src/Vector/vector_dist.hpp
index 73c6c4f79..d69adf2a8 100644
--- a/src/Vector/vector_dist.hpp
+++ b/src/Vector/vector_dist.hpp
@@ -74,7 +74,7 @@ constexpr int GCL_NON_SYMMETRIC = 0;
 constexpr int GCL_SYMMETRIC = 1;
 constexpr int GCL_HILBERT = 2;
 
-template<bool is_gpu_celllist>
+template<bool is_gpu_celllist, unsigned int ... prp>
 struct gcl_standard_no_symmetric_impl
 {
 	template<unsigned int dim, typename St, typename CellL, typename Vector, unsigned int impl>
@@ -84,18 +84,18 @@ struct gcl_standard_no_symmetric_impl
 	}
 };
 
-template<>
-struct gcl_standard_no_symmetric_impl<true>
+template<unsigned int ... prp>
+struct gcl_standard_no_symmetric_impl<true,prp...>
 {
 	template<unsigned int dim, typename St, typename CellL, typename Vector, unsigned int impl>
 	static inline CellL get(Vector & vd, const St & r_cut, const Ghost<dim,St> & g)
 	{
-		return vd.template getCellListGPU<CellL>(r_cut);
+		return vd.template getCellListGPU<CellL,prp...>(r_cut);
 	}
 };
 
 //! General function t get a cell-list
-template<unsigned int dim, typename St, typename CellL, typename Vector, unsigned int impl>
+template<unsigned int dim, typename St, typename CellL, typename Vector, unsigned int impl, unsigned int ... prp>
 struct gcl
 {
 	/*! \brief Get the Cell list based on the type
@@ -109,7 +109,7 @@ struct gcl
 	 */
 	static inline CellL get(Vector & vd, const St & r_cut, const Ghost<dim,St> & g)
 	{
-		return gcl_standard_no_symmetric_impl<is_gpu_celllist<CellL>::value>::template get<dim,St,CellL,Vector,impl>(vd,r_cut,g);
+		return gcl_standard_no_symmetric_impl<is_gpu_celllist<CellL>::value,prp ...>::template get<dim,St,CellL,Vector,impl>(vd,r_cut,g);
 	}
 };
 
@@ -1209,7 +1209,7 @@ public:
 	 * \return the Cell list
 	 *
 	 */
-	template<typename CellType = CellList_gpu<dim,St,CudaMemory,shift_only<dim, St>>>
+	template<typename CellType = CellList_gpu<dim,St,CudaMemory,shift_only<dim, St>>,unsigned int ... prp>
 	CellType getCellListGPU(St r_cut, bool no_se3 = false)
 	{
 #ifdef SE_CLASS3
@@ -1243,7 +1243,7 @@ public:
 	 * \return the CellList
 	 *
 	 */
-	template<typename CellType = CellList_gpu<dim,St,CudaMemory,shift_only<dim, St>>>
+	template<typename CellType = CellList_gpu<dim,St,CudaMemory,shift_only<dim, St>>, unsigned int ... prp>
 	CellType getCellListGPU(St r_cut, const Ghost<dim, St> & enlarge, bool no_se3 = false)
 	{
 #ifdef SE_CLASS3
@@ -1265,7 +1265,7 @@ public:
 		v_prp_out.resize(v_pos.size());
 		v_pos_out.resize(v_pos.size());
 
-		cell_list.template construct<decltype(v_pos),decltype(v_prp)>(v_pos,v_pos_out,v_prp,v_prp_out,v_cl.getmgpuContext(),g_m);
+		cell_list.template construct<decltype(v_pos),decltype(v_prp),prp ...>(v_pos,v_pos_out,v_prp,v_prp_out,v_cl.getmgpuContext(),g_m);
 
 		cell_list.set_ndec(getDecomposition().get_ndec());
 		cell_list.set_gm(g_m);
@@ -1349,7 +1349,8 @@ public:
 	 * \param no_se3 avoid se class 3 checking
 	 *
 	 */
-	template<typename CellL> void updateCellList(CellL & cell_list, bool no_se3 = false, cl_construct_opt opt = cl_construct_opt::Full)
+	template<unsigned int ... prp,typename CellL>
+	void updateCellList(CellL & cell_list, bool no_se3 = false, cl_construct_opt opt = cl_construct_opt::Full)
 	{
 #ifdef SE_CLASS3
 		if (no_se3 == false)
@@ -1366,7 +1367,7 @@ public:
 
 		if (to_reconstruct == false)
 		{
-			populate_cell_list(v_pos,v_pos_out,v_prp,v_prp_out,cell_list,v_cl.getmgpuContext(false),g_m,CL_NON_SYMMETRIC,opt);
+			populate_cell_list<dim,St,prop,Memory,layout_base,CellL,prp ...>(v_pos,v_pos_out,v_prp,v_prp_out,cell_list,v_cl.getmgpuContext(false),g_m,CL_NON_SYMMETRIC,opt);
 
 			cell_list.set_gm(g_m);
 		}
@@ -2989,6 +2990,19 @@ public:
 			v_prp.template deviceToHost<prp ...>();
 		}
 
+		/*! \brief Move the memory from the device to host memory
+		 *
+		 * \tparam property to move use POS_PROP for position property
+		 *
+		 * \param start point
+		 * \param stop point (included)
+		 *
+		 */
+		template<unsigned int ... prp> void deviceToHostProp(size_t start, size_t stop)
+		{
+			v_prp.template deviceToHost<prp ...>(start,stop);
+		}
+
 		/*! \brief Move the memory from the device to host memory
 		 *
 		 * \tparam property to move use POS_PROP for position property
diff --git a/src/Vector/vector_dist_kernel.hpp b/src/Vector/vector_dist_kernel.hpp
index 7c99a50da..4e825223c 100644
--- a/src/Vector/vector_dist_kernel.hpp
+++ b/src/Vector/vector_dist_kernel.hpp
@@ -16,6 +16,10 @@
 #define GET_PARTICLE_SORT(p,NN) if (blockDim.x*blockIdx.x + threadIdx.x >= NN.get_g_m()) {return;}\
 							  else{p = NN.getDomainSortIds().template get<0>(blockDim.x*blockIdx.x + threadIdx.x);}
 
+
+#define GET_PARTICLE_BY_ID(p,ids) if (blockDim.x*blockIdx.x + threadIdx.x >= ids.size()) {return;}\
+							  else{p = ids.template get<0>(blockDim.x*blockIdx.x + threadIdx.x);}
+
 /*! \brief this class is a functor for "for_each" algorithm
  *
  * This class is a functor for "for_each" algorithm. For each
-- 
GitLab