From e1b863f589732c8d07c9ea5cd98c6d46b54c4e6f Mon Sep 17 00:00:00 2001
From: Pietro Incardona <incardon@mpi-cbg.de>
Date: Fri, 11 Dec 2020 12:47:26 +0100
Subject: [PATCH] Fixing ghost put on GPU

---
 src/CMakeLists.txt                            | 36 +++++++++++++++-
 src/Vector/cuda/vector_dist_cuda_funcs.cuh    |  1 +
 src/Vector/cuda/vector_dist_gpu_unit_tests.cu | 41 +++++++++++++------
 src/Vector/util/vector_dist_funcs.hpp         |  1 +
 4 files changed, 64 insertions(+), 15 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 038eb87bc..49811b035 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -43,6 +43,14 @@ add_executable(pdata ${OPENFPM_INIT_FILE} ${CUDA_SOURCES} main.cpp
 														  ../openfpm_vcluster/src/VCluster/VCluster.cpp 
 														  )
 
+add_executable(isolation_pdata ${OPENFPM_INIT_FILE} isolation.cu
+														  lib/pdata.cpp  
+														  ../openfpm_devices/src/memory/HeapMemory.cpp 
+														  ../openfpm_devices/src/memory/CudaMemory.cu
+														  ../openfpm_devices/src/memory/PtrMemory.cpp 
+														  ../openfpm_vcluster/src/VCluster/VCluster.cpp 
+														  )
+
 if ( CMAKE_COMPILER_IS_GNUCC )
     target_compile_options(pdata PRIVATE "-Wno-deprecated-declarations")
     if (TEST_COVERAGE)
@@ -65,6 +73,7 @@ add_test(NAME pdata_4_proc COMMAND mpirun -np 4 ./pdata)
 
 if (CUDA_FOUND)
 	target_compile_options(pdata PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} >)
+	target_compile_options(isolation_pdata PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} >)
 	target_include_directories (pdata PUBLIC ${MPI_C_INCLUDE_DIRS})
         if (TEST_COVERAGE)
 		target_compile_options(pdata PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: -Xcompiler "-fprofile-arcs -ftest-coverage" >)
@@ -89,10 +98,25 @@ target_include_directories (pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/config)
 target_include_directories (pdata PUBLIC ${PETSC_INCLUDES})
 target_include_directories (pdata PUBLIC ${HDF5_ROOT}/include)
 target_include_directories (pdata PUBLIC ${LIBHILBERT_INCLUDE_DIRS})
-
-target_include_directories(pdata PUBLIC ${Vc_INCLUDE_DIR})
+target_include_directories (pdata PUBLIC ${Vc_INCLUDE_DIR})
 target_include_directories (pdata PUBLIC ${Boost_INCLUDE_DIRS})
 
+target_include_directories (isolation_pdata PUBLIC ${PARMETIS_ROOT}/include)
+target_include_directories (isolation_pdata PUBLIC ${METIS_ROOT}/include)
+target_include_directories (isolation_pdata PUBLIC ${CUDA_INCLUDE_DIRS})
+target_include_directories (isolation_pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_include_directories (isolation_pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_devices/src/)
+target_include_directories (isolation_pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_vcluster/src/)
+target_include_directories (isolation_pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_data/src/)
+target_include_directories (isolation_pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_io/src/)
+target_include_directories (isolation_pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/config)
+target_include_directories (isolation_pdata PUBLIC ${PETSC_INCLUDES})
+target_include_directories (isolation_pdata PUBLIC ${HDF5_ROOT}/include)
+target_include_directories (isolation_pdata PUBLIC ${LIBHILBERT_INCLUDE_DIRS})
+target_include_directories (isolation_pdata PUBLIC ${Vc_INCLUDE_DIR})
+target_include_directories (isolation_pdata PUBLIC ${Boost_INCLUDE_DIRS})
+
+
 target_link_libraries(pdata ${Boost_LIBRARIES})
 target_link_libraries(pdata ${PARMETIS_LIBRARIES})
 target_link_libraries(pdata -L${METIS_ROOT}/lib metis)
@@ -101,6 +125,14 @@ target_link_libraries(pdata -L${LIBHILBERT_LIBRARY_DIRS} ${LIBHILBERT_LIBRARIES}
 target_link_libraries(pdata ${PETSC_LIBRARIES})
 target_link_libraries(pdata ${Vc_LIBRARIES})
 
+target_link_libraries(isolation_pdata ${Boost_LIBRARIES})
+target_link_libraries(isolation_pdata ${PARMETIS_LIBRARIES})
+target_link_libraries(isolation_pdata -L${METIS_ROOT}/lib metis)
+target_link_libraries(isolation_pdata ${HDF5_LIBRARIES})
+target_link_libraries(isolation_pdata -L${LIBHILBERT_LIBRARY_DIRS} ${LIBHILBERT_LIBRARIES})
+target_link_libraries(isolation_pdata ${PETSC_LIBRARIES})
+target_link_libraries(isolation_pdata ${Vc_LIBRARIES})
+
 if (TEST_PERFORMANCE)
 	target_link_libraries(pdata  ${Boost_FILESYSTEM_LIBRARY})
         target_link_libraries(pdata ${Boost_SYSTEM_LIBRARY})
diff --git a/src/Vector/cuda/vector_dist_cuda_funcs.cuh b/src/Vector/cuda/vector_dist_cuda_funcs.cuh
index c5a19ccc3..5ec0008e6 100644
--- a/src/Vector/cuda/vector_dist_cuda_funcs.cuh
+++ b/src/Vector/cuda/vector_dist_cuda_funcs.cuh
@@ -130,6 +130,7 @@ __global__  void process_ghost_particles_prp(vector_g_opart_type g_opart, vector
     process_ghost_device_particle_prp<vector_g_opart_type,vector_prp_type_out,vector_prp_type_in,prp...>(i,offset,g_opart,m_prp,v_prp);
 }
 
+
 template<typename vector_prp_type_out, typename vector_prp_type_in, unsigned int ... prp>
 __global__  void process_ghost_particles_prp_put(vector_prp_type_out m_prp,
 		     	 	 	 	 	 	   	     vector_prp_type_in  v_prp, unsigned int offset)
diff --git a/src/Vector/cuda/vector_dist_gpu_unit_tests.cu b/src/Vector/cuda/vector_dist_gpu_unit_tests.cu
index b70bd00f9..fa3e9223d 100644
--- a/src/Vector/cuda/vector_dist_gpu_unit_tests.cu
+++ b/src/Vector/cuda/vector_dist_gpu_unit_tests.cu
@@ -1847,6 +1847,7 @@ BOOST_AUTO_TEST_CASE(vector_dist_overflow_se_class1)
 }
 
 
+
 BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu )
 {
 	Vcluster<> & v_cl = create_vcluster();
@@ -1878,7 +1879,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu )
 		// ghost
 		Ghost<3,float> ghost(r_g);
 
-		typedef  aggregate<float> part_prop;
+		typedef  aggregate<float,float,float> part_prop;
 
 		// Distributed vector
 		vector_dist_gpu<3,float, part_prop > vd(0,box,bc,ghost);
@@ -1899,16 +1900,18 @@ BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu )
 
 			vd.getLastPropWrite<0>() = 0.0;
 
+			vd.getLastPropWrite<2>() = 0.0;
+
 			++it;
 		}
 
 		vd.map();
 
 		vd.hostToDevicePos();
-		vd.template hostToDeviceProp<0>();
+		vd.template hostToDeviceProp<0,2>();
 		// sync the ghost
-		vd.ghost_get<0>(RUN_ON_DEVICE);
-		vd.template deviceToHostProp<0>();
+		vd.ghost_get<0,2>(RUN_ON_DEVICE);
+		vd.template deviceToHostProp<0,2>();
 		vd.deviceToHostPos();
 
 		{
@@ -1937,7 +1940,10 @@ BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu )
 					float dist = xp.distance(xq);
 
 					if (dist < r_cut)
+					{
 						vd.getPropWrite<0>(q) += a*(-dist*dist+r_cut*r_cut);
+						vd.getPropWrite<2>(q) += a*(-dist*dist+r_cut*r_cut) / 2;
+					}
 
 					++Np;
 				}
@@ -1946,25 +1952,27 @@ BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu )
 			}
 
 			vd.hostToDevicePos();
-			vd.template hostToDeviceProp<0>();
-			vd.template ghost_put<add_atomic_,0>(RUN_ON_DEVICE);
-			vd.template deviceToHostProp<0>();
+			vd.template hostToDeviceProp<0,2>();
+			vd.template ghost_put<add_atomic_,0,2>(RUN_ON_DEVICE);
+			vd.template deviceToHostProp<0,2>();
 			vd.deviceToHostPos();
 
 			bool ret = true;
 			auto it3 = vd.getDomainIterator();
 
 			float constant = vd.getProp<0>(it3.get());
+			float constanta = vd.getProp<2>(it3.get());
 			float eps = 0.001;
 
 			while (it3.isNext())
 			{
 				float constant2 = vd.getProp<0>(it3.get());
-				if (fabs(constant - constant2)/constant > eps)
+				float constant3 = vd.getProp<2>(it3.get());
+				if (fabs(constant - constant2)/constant > eps || fabs(constanta - constant3)/constanta > eps)
 				{
 					Point<3,float> p = vd.getPosRead(it3.get());
 
-					std::cout << p.toString() << "    " <<  constant2 << "/" << constant << "    " << v_cl.getProcessUnitID() << std::endl;
+					std::cout << p.toString() << "    " <<  constant2 << "/" << constant << "/" << constant3 << "    " << v_cl.getProcessUnitID() << std::endl;
 					ret = false;
 					break;
 				}
@@ -1980,6 +1988,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu )
 			auto key = itp.get();
 
 			vd.getPropWrite<0>(key) = 0.0;
+			vd.getPropWrite<2>(key) = 0.0;
 
 			++itp;
 		}
@@ -2010,7 +2019,10 @@ BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu )
 					float dist = xp.distance(xq);
 
 					if (dist < r_cut)
+					{
 						vd.getPropWrite<0>(q) += a*(-dist*dist+r_cut*r_cut);
+						vd.getPropWrite<2>(q) += a*(-dist*dist+r_cut*r_cut);
+					}
 
 					++Np;
 				}
@@ -2019,25 +2031,28 @@ BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu )
 			}
 
 			vd.hostToDevicePos();
-			vd.template hostToDeviceProp<0>();
+			vd.template hostToDeviceProp<0,2>();
 			vd.template ghost_put<add_atomic_,0>(RUN_ON_DEVICE);
-			vd.template deviceToHostProp<0>();
+			vd.template ghost_put<add_atomic_,2>(RUN_ON_DEVICE);
+			vd.template deviceToHostProp<0,2>();
 			vd.deviceToHostPos();
 
 			bool ret = true;
 			auto it3 = vd.getDomainIterator();
 
 			float constant = vd.getPropRead<0>(it3.get());
+			float constanta = vd.getPropRead<2>(it3.get());
 			float eps = 0.001;
 
 			while (it3.isNext())
 			{
 				float constant2 = vd.getPropRead<0>(it3.get());
-				if (fabs(constant - constant2)/constant > eps)
+				float constant3 = vd.getPropRead<0>(it3.get());
+				if (fabs(constant - constant2)/constant > eps || fabs(constanta - constant3)/constanta > eps)
 				{
 					Point<3,float> p = vd.getPosRead(it3.get());
 
-					std::cout << p.toString() << "    " <<  constant2 << "/" << constant << "    " << v_cl.getProcessUnitID() << std::endl;
+					std::cout << p.toString() << "    " <<  constant2 << "/" << constant << "/" << constant3 << "    " << v_cl.getProcessUnitID() << std::endl;
 					ret = false;
 					break;
 				}
diff --git a/src/Vector/util/vector_dist_funcs.hpp b/src/Vector/util/vector_dist_funcs.hpp
index 35f001aeb..6d86d4477 100644
--- a/src/Vector/util/vector_dist_funcs.hpp
+++ b/src/Vector/util/vector_dist_funcs.hpp
@@ -116,6 +116,7 @@ __device__ inline void process_map_device_particle(unsigned int i, unsigned int
 	proc_class::proc(i,id,v_prp,m_prp);
 }
 
+
 //! It process one particle
 template<typename Top, typename T2, typename T4, unsigned int ... prp>
 __device__ inline void process_ghost_device_particle_prp(unsigned int i, unsigned int offset, Top & g_opart, T2 & m_prp, T4 & v_prp)
-- 
GitLab