From ee814927210b04bdc469209f876a701bd301aea9 Mon Sep 17 00:00:00 2001
From: Pietro Incardona <incardon@mpi-cbg.de>
Date: Sat, 26 Oct 2019 22:39:58 +0200
Subject: [PATCH] ghost get for sparsegridgpu moving on + adding small function
 for particle optimizations

---
 example/Vector/7_SPH_dlb_gpu_opt/main.cu      |  2 +-
 openfpm_data                                  |  2 +-
 src/CMakeLists.txt                            |  4 +-
 .../tests/sgrid_dist_id_gpu_unit_tests.cu     |  2 +-
 src/Vector/cuda/vector_dist_cuda_funcs.cuh    | 57 +++++++++++
 src/Vector/cuda/vector_dist_gpu_unit_tests.cu | 98 +++++++++++++++++++
 src/Vector/vector_dist.hpp                    | 14 +++
 7 files changed, 175 insertions(+), 4 deletions(-)

diff --git a/example/Vector/7_SPH_dlb_gpu_opt/main.cu b/example/Vector/7_SPH_dlb_gpu_opt/main.cu
index 6c93494f0..fa6c0ea2a 100644
--- a/example/Vector/7_SPH_dlb_gpu_opt/main.cu
+++ b/example/Vector/7_SPH_dlb_gpu_opt/main.cu
@@ -482,7 +482,7 @@ __global__ void verlet_int_gpu(vector_dist_type vd, real_number dt, real_number
 		return;
 	}
 
-	//-Calculate displacement and update position / Calcula desplazamiento y actualiza posicion.
+	//-Calculate displacement and update position
 	real_number dx = vd.template getProp<velocity>(a)[0]*dt + vd.template getProp<force>(a)[0]*dt205;
     real_number dy = vd.template getProp<velocity>(a)[1]*dt + vd.template getProp<force>(a)[1]*dt205;
     real_number dz = vd.template getProp<velocity>(a)[2]*dt + vd.template getProp<force>(a)[2]*dt205;
diff --git a/openfpm_data b/openfpm_data
index 02ef67f83..f8ea1b875 160000
--- a/openfpm_data
+++ b/openfpm_data
@@ -1 +1 @@
-Subproject commit 02ef67f834e1f2b36f781f380e436f821c9a7945
+Subproject commit f8ea1b875c24392c1a6991f4faa000e2981e96d1
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 1b1f005cd..909a6a881 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -127,7 +127,8 @@ if (NOT APPLE)
     target_link_libraries(pdata rt)
 endif ()
 
-install(FILES Decomposition/CartDecomposition.hpp 
+install(FILES Decomposition/CartDecomposition.hpp
+       	      Decomposition/Domain_icells_cart.hpp	
 	      Decomposition/shift_vect_converter.hpp 
 	      Decomposition/CartDecomposition_ext.hpp  
 	      Decomposition/common.hpp 
@@ -185,6 +186,7 @@ install(FILES Vector/util/vector_dist_funcs.hpp
 
 install(FILES Vector/cuda/vector_dist_comm_util_funcs.cuh
 	      Vector/cuda/vector_dist_cuda_funcs.cuh
+	      Vector/cuda/vector_dist_operators_list_ker.hpp
 	DESTINATION openfpm_pdata/include/Vector/cuda )
 
 install(FILES Graph/ids.hpp Graph/dist_map_graph.hpp 
diff --git a/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu b/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
index 8f0a5ced7..3ef536e5b 100644
--- a/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
+++ b/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
@@ -222,7 +222,7 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_ghost_get )
 	gdist.template deviceToHost<0>();
 //	gdist.write("broken");
 
-	gdist.template ghost_get<0>(RUN_ON_DEVICE);
+//	gdist.template ghost_get<0>(RUN_ON_DEVICE);
 }
 
 
diff --git a/src/Vector/cuda/vector_dist_cuda_funcs.cuh b/src/Vector/cuda/vector_dist_cuda_funcs.cuh
index ea6b20f1b..e986746e5 100644
--- a/src/Vector/cuda/vector_dist_cuda_funcs.cuh
+++ b/src/Vector/cuda/vector_dist_cuda_funcs.cuh
@@ -14,6 +14,7 @@
 #include "Decomposition/common.hpp"
 #include "lib/pdata.hpp"
 #include "util/cuda/kernels.cuh"
+#include "util/cuda/scan_ofp.cuh"
 
 template<unsigned int dim, typename St, typename decomposition_type, typename vector_type, typename start_type, typename output_type>
 __global__ void proc_label_id_ghost(decomposition_type dec,vector_type vd, start_type starts, output_type out)
@@ -401,4 +402,60 @@ void remove_marked(vector_type & vd)
 	vd.getPropVector().swap(vd_prp_new);
 }
 
+template<unsigned int prp, typename functor, typename particles_type, typename out_type>
+__global__ void mark_indexes(particles_type vd, out_type out)
+{
+	auto a = GET_PARTICLE(vd);
+
+	out.template getProp<0>(a) = functor::check(vd.template getProp<prp>(a)) == true;
+}
+
+template<typename out_type, typename ids_type>
+__global__ void fill_indexes(out_type scan, ids_type ids)
+{
+	unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;
+
+	if (p >= scan.size()-1)	{return;}
+
+	auto sp = scan.template get<0>(p);
+	auto spp = scan.template get<0>(p+1);
+
+	if (sp != spp)
+	ids.template get<0>(scan.template get<0>(p)) = p;
+}
+
+/*! \brief get the particle index that satify the functor condition
+ *
+ * This function can be used to collect the indexes of the particles of a particular type.
+ * Write a functor that return true when a particle of a particular type is identified
+ * and ids will contain the indexes for which the functor return true.
+ *
+ * \tparam prp property to pass to the functor
+ *
+ * \param vd distributed vector
+ *
+ */
+template<typename functor, typename vector_type, typename ids_type>
+void get_indexes_sorted(vector_type & vd, ids_type & ids, mgpu::ofp_context_t & context)
+{
+	// first we do a scan of the property
+	openfpm::vector_gpu<aggregate<unsigned int>> scan;
+
+	scan.setMemory(mem_tmp);
+	scan.resize(vd.size_local_with_ghost()+1);
+
+	auto ite = scan.getGPUIterator();
+
+	CUDA_LAUNCH(mark_indexes,ite,vd.toKernel(),scan.toKernel());
+
+	openfpm::scan(scan.template getDeviceBuffer<0>(),scan.size(),scan.template getDeviceBuffer<0>(),context);
+
+	// get the number of marked particles
+	scan.template deviceToHost<0>(scan.size()-1,scan.size()-1);
+	size_t nf = scan.template get<0>(scan.size()-1);
+	ids.resize(nf);
+
+	CUDA_LAUNCH(fill_indexes,ite,scan.toKernel(),ids.toKernel());
+}
+
 #endif /* VECTOR_DIST_CUDA_FUNCS_CUH_ */
diff --git a/src/Vector/cuda/vector_dist_gpu_unit_tests.cu b/src/Vector/cuda/vector_dist_gpu_unit_tests.cu
index 475bbbd44..daa90b634 100644
--- a/src/Vector/cuda/vector_dist_gpu_unit_tests.cu
+++ b/src/Vector/cuda/vector_dist_gpu_unit_tests.cu
@@ -1488,6 +1488,104 @@ BOOST_AUTO_TEST_CASE(vector_dist_keep_prop_on_cuda)
 	}
 }
 
+BOOST_AUTO_TEST_CASE(vector_dist_get_index_set)
+{
+	Box<3,double> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
+	Ghost<3,double> g(0.1);
+	size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};
+
+	if (create_vcluster().size() >= 16)
+	{return;}
+
+	vector_dist_gpu<3,double,aggregate<int,double>> vdg(10000,domain,bc,g,DEC_GRAN(128));
+
+	auto it = vdg.getDomainIterator();
+
+	while (it.isNext())
+	{
+		auto p = it.get();
+
+		vdg.getPos(p)[0] = (double)rand() / RAND_MAX;
+		vdg.getPos(p)[1] = (double)rand() / RAND_MAX;
+		vdg.getPos(p)[2] = (double)rand() / RAND_MAX;
+
+		vdg.template getProp<0>(p) = (int)((double)rand() / RAND_MAX / 0.5);
+
+		vdg.template getProp<1>(p) = (double)rand() / RAND_MAX;
+
+		++it;
+	}
+
+	vdg.map();
+
+	vdg.hostToDeviceProp<0,1>();
+	vdg.hostToDevicePos();
+
+/*	bool test = vdg.compareHostAndDevicePos(0.00001,0.00000001);
+	BOOST_REQUIRE_EQUAL(test,true);
+
+	vdg.getPos(100)[0] = 0.99999999;
+
+	test = vdg.compareHostAndDevicePos(0.00001,0.00000001);
+	BOOST_REQUIRE_EQUAL(test,false);
+
+	vdg.hostToDevicePos();
+	vdg.getPos(100)[0] = 0.99999999;
+
+	test = vdg.compareHostAndDevicePos(0.00001,0.00000001);
+	BOOST_REQUIRE_EQUAL(test,true);
+
+	////////////////////////////////////////////////// PROP VECTOR
+
+	test = vdg.compareHostAndDeviceProp<1>(0.00001,0.00000001);
+	BOOST_REQUIRE_EQUAL(test,true);
+
+	vdg.getProp<1>(103)[0] = 0.99999999;
+
+	test = vdg.compareHostAndDeviceProp<1>(0.00001,0.00000001);
+	BOOST_REQUIRE_EQUAL(test,false);
+
+	vdg.hostToDeviceProp<1>();
+	vdg.getProp<1>(103)[0] = 0.99999999;
+
+	test = vdg.compareHostAndDeviceProp<1>(0.00001,0.00000001);
+	BOOST_REQUIRE_EQUAL(test,true);
+
+	////////////////////////////////////////////////// PROP scalar
+
+
+	test = vdg.compareHostAndDeviceProp<0>(0.00001,0.00000001);
+	BOOST_REQUIRE_EQUAL(test,true);
+
+	vdg.getProp<0>(105) = 0.99999999;
+
+	test = vdg.compareHostAndDeviceProp<0>(0.00001,0.00000001);
+	BOOST_REQUIRE_EQUAL(test,false);
+
+	vdg.hostToDeviceProp<0>();
+	vdg.getProp<0>(105) = 0.99999999;
+
+	test = vdg.compareHostAndDeviceProp<0>(0.00001,0.00000001);
+	BOOST_REQUIRE_EQUAL(test,true);
+
+
+	////////////////////////////////////////////////// PROP scalar
+
+
+	test = vdg.compareHostAndDeviceProp<2>(0.00001,0.00000001);
+	BOOST_REQUIRE_EQUAL(test,true);
+
+	vdg.getProp<2>(108)[1][2] = 0.99999999;
+
+	test = vdg.compareHostAndDeviceProp<2>(0.00001,0.00000001);
+	BOOST_REQUIRE_EQUAL(test,false);
+
+	vdg.hostToDeviceProp<2>();
+	vdg.getProp<2>(108)[1][2] = 0.99999999;
+
+	test = vdg.compareHostAndDeviceProp<2>(0.00001,0.00000001);
+	BOOST_REQUIRE_EQUAL(test,true);*/
+}
 
 BOOST_AUTO_TEST_CASE(vector_dist_compare_host_device)
 {
diff --git a/src/Vector/vector_dist.hpp b/src/Vector/vector_dist.hpp
index 730a9c752..73c6c4f79 100644
--- a/src/Vector/vector_dist.hpp
+++ b/src/Vector/vector_dist.hpp
@@ -2017,6 +2017,20 @@ public:
 		return v_pos.getGPUIteratorTo(g_m,n_thr);
 	}
 
+	/*! \brief Get an iterator that traverse the particles in the domain
+	 *
+	 * \return an iterator
+	 *
+	 */
+	ite_gpu<1> getDomainAndGhostIteratorGPU(size_t n_thr = 1024) const
+	{
+#ifdef SE_CLASS3
+		se3.getIterator();
+#endif
+
+		return v_pos.getGPUIteratorTo(size_local(),n_thr);
+	}
+
 	/*! \brief Merge the properties calculated on the sorted vector on the original vector
 	 *
 	 * \parameter Cell-list from which has been constructed the sorted vector
-- 
GitLab