From 8c2a0b50809bd9e3ced6415a5137c4b30828f56d Mon Sep 17 00:00:00 2001
From: Pietro Incardona <incardon@mpi-cbg.de>
Date: Tue, 18 Dec 2018 17:44:08 +0100
Subject: [PATCH] Latest update

---
 build.sh                                      |  9 ++++---
 example/Vector/0_simple/main.cpp              |  2 ++
 example/Vector/7_SPH_dlb_gpu/main.cu          | 26 ++++++++++++++++---
 example/Vector/7_SPH_dlb_gpu_opt/main.cu      |  8 +++---
 example/Vector/7_SPH_dlb_opt/main_dbg.cpp     | 19 --------------
 openfpm_io                                    |  2 +-
 src/Grid/grid_dist_id.hpp                     |  2 +-
 src/Grid/tests/grid_dist_id_unit_test.cpp     |  4 ---
 src/Vector/cuda/vector_dist_gpu_unit_tests.cu | 13 ++++++++++
 src/Vector/vector_dist.hpp                    | 19 ++++++++++----
 10 files changed, 64 insertions(+), 40 deletions(-)

diff --git a/build.sh b/build.sh
index 34810b769..bf6365ff7 100755
--- a/build.sh
+++ b/build.sh
@@ -17,18 +17,19 @@ echo "Branch name: $branch"
 rm -rf $HOME/openfpm_dependencies/openfpm_pdata/0
 
 if [ x"$hostname" == x"cifarm-centos-node.mpi-cbg.de"  ]; then
-	./install_MPI_mpich.sh $HOME/openfpm_dependencies/openfpm_pdata/$branch/ 4
+	./install_MPI_mpich.sh $HOME/openfpm_dependencies/openfpm_io/$branch/ 4
 	echo 4 > $HOME/openfpm_dependencies/openfpm_pdata/$branch/MPI/version
-	rm -rf $HOME/openfpm_dependencies/openfpm_pdata/full
 fi
 
 if [ x"$hostname" == x"cifarm-ubuntu-node.mpi-cbg.de"  ]; then
-	rm -rf $HOME/openfpm_dependencies/openfpm_pdata/full
+        ./install_MPI_mpich.sh $HOME/openfpm_dependencies/openfpm_io/$branch/ 4
+        echo 4 > $HOME/openfpm_dependencies/openfpm_pdata/$branch/MPI/version
 fi
 
 if [ x"$hostname" == x"cifarm-mac-node.mpi-cbg.de"  ]; then
 	export PATH="/usr/local/bin:$PATH"
-	rm -rf $HOME/openfpm_dependencies/openfpm_pdata/full
+        ./install_MPI_mpich.sh $HOME/openfpm_dependencies/openfpm_io/$branch/ 4
+        echo 4 > $HOME/openfpm_dependencies/openfpm_io/$branch/MPI/version
 fi
 
 
diff --git a/example/Vector/0_simple/main.cpp b/example/Vector/0_simple/main.cpp
index a93f5302e..b056ed10e 100644
--- a/example/Vector/0_simple/main.cpp
+++ b/example/Vector/0_simple/main.cpp
@@ -17,8 +17,10 @@
  * \subpage Vector_6_complex_usage
  * \subpage Vector_7_sph_dlb
  * \subpage Vector_7_sph_dlb_opt
+ * \subpage Vector_7_sph_dlb_gpu
  * \subpage Vector_7_sph_dlb_gpu_opt
  * \subpage Vector_8_DEM
+ * \subpage Vector_9_gpu_cuda_interop
  *
  */
 
diff --git a/example/Vector/7_SPH_dlb_gpu/main.cu b/example/Vector/7_SPH_dlb_gpu/main.cu
index 4d0a58f75..32c330aa3 100644
--- a/example/Vector/7_SPH_dlb_gpu/main.cu
+++ b/example/Vector/7_SPH_dlb_gpu/main.cu
@@ -8,7 +8,7 @@
  *
  *
  * This example show the classical SPH Dam break simulation with load balancing and dynamic load balancing. The main difference with
- * \ref{SPH_dlb} is that here we use GPU and 1.2 Millions particles.
+ * \ref SPH_dlb is that here we use GPUs and 1.2 Millions particles.
  *
  * \htmlonly
  * <a href="#" onclick="hide_show('vector-video-3')" >Simulation video 1</a><br>
@@ -25,8 +25,21 @@
  * </div>
  * \endhtmlonly
  *
+ * This example uses all the features explained in example \ref e3_md_gpu. Additionally, this example shows how to remove particles
+ * on GPU using a bulk remove function
  *
- * \snippet Vector/7_SPH_dlb_gpu_opt/main.cpp inclusion
+ * ## Bulk remove
+ *
+ * In SPH we have the necessity to remove particles that go out of bounds. OpenFPM provides the function \b remove_marked \b .
+ *
+ * \snippet Vector/7_SPH_dlb_gpu/main.cu remove_marked_part
+ *
+ * where vd is the vector_dist_gpu and red is the property that marks which particles must be removed. We mark the particles to be removed in the kernel function.
+ * We check if the particles go out of the region of interest or if their density goes critically far from the rest density
+ *
+ * \snippet Vector/7_SPH_dlb_gpu/main.cu mark_to_remove_kernel
+ *
+ * \include Vector/7_SPH_dlb_gpu/main.cu
  *
  */
 
@@ -523,7 +536,9 @@ __global__ void verlet_int_gpu(vector_dist_type vd, real_number dt, real_number
 	vd.template getProp<velocity>(a)[2] = vd.template getProp<velocity_prev>(a)[2] + vd.template getProp<force>(a)[2]*dt2;
 	vd.template getProp<rho>(a) = vd.template getProp<rho_prev>(a) + dt2*vd.template getProp<drho>(a);
 
-    // Check if the particle go out of range in space and in density
+	//! \cond [mark_to_remove_kernel] \endcond
+
+    // Check if the particles go out of range in space or in density; if they do, mark them to be removed later
     if (vd.getPos(a)[0] <  0.000263878 || vd.getPos(a)[1] < 0.000263878 || vd.getPos(a)[2] < 0.000263878 ||
         vd.getPos(a)[0] >  0.000263878+1.59947 || vd.getPos(a)[1] > 0.000263878+0.672972 || vd.getPos(a)[2] > 0.50 ||
 		vd.template getProp<rho>(a) < RhoMin || vd.template getProp<rho>(a) > RhoMax)
@@ -531,6 +546,7 @@ __global__ void verlet_int_gpu(vector_dist_type vd, real_number dt, real_number
     else
     {vd.template getProp<red>(a) = 0;}
 
+    //! \cond [mark_to_remove_kernel] \endcond
 
     vd.template getProp<velocity_prev>(a)[0] = velX;
     vd.template getProp<velocity_prev>(a)[1] = velY;
@@ -550,9 +566,13 @@ void verlet_int(particles & vd, real_number dt)
 
 	verlet_int_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),dt,dt2,dt205);
 
+	//! \cond [remove_marked_part] \endcond
+
 	// remove the particles marked
 	remove_marked<red>(vd);
 
+	//! \cond [remove_marked_part] \endcond
+
 	// increment the iteration counter
 	cnt++;
 }
diff --git a/example/Vector/7_SPH_dlb_gpu_opt/main.cu b/example/Vector/7_SPH_dlb_gpu_opt/main.cu
index ba92a1a6c..111ce72fb 100644
--- a/example/Vector/7_SPH_dlb_gpu_opt/main.cu
+++ b/example/Vector/7_SPH_dlb_gpu_opt/main.cu
@@ -4,7 +4,7 @@
  * [TOC]
  *
  *
- * # SPH with Dynamic load Balancing on GPU # {#SPH_dlb_gpu_opt}
+ * # SPH with Dynamic load Balancing on GPU (Optimized) # {#SPH_dlb_gpu_opt}
  *
  *
  * This example show the classical SPH Dam break simulation with load balancing and dynamic load balancing. The main difference with
@@ -29,9 +29,11 @@
  *
  * ## GPU ## {#e7_sph_inclusion}
  *
- * This example is the port on GPU of the following example \ref{SPH_dlb}
+ * This example is an optimization of the example \ref SPH_dlb_gpu. All the optimizations applied to this example have been explained
+ * here: \ref e3_md_gpu_opt, so we will not go into the details
+ *
+ * We report the full code here
  *
- * \snippet Vector/7_SPH_dlb_gpu_opt/main.cpp inclusion
  *
  */
 
diff --git a/example/Vector/7_SPH_dlb_opt/main_dbg.cpp b/example/Vector/7_SPH_dlb_opt/main_dbg.cpp
index d49c96ad6..be6ec8ec7 100644
--- a/example/Vector/7_SPH_dlb_opt/main_dbg.cpp
+++ b/example/Vector/7_SPH_dlb_opt/main_dbg.cpp
@@ -1,22 +1,3 @@
-/*!
- * \page Vector_7_sph_dlb_dbg Vector 7 SPH Dam break simulation (Debugging video)
- *
- *
- * [TOC]
- *
- *
- * # SPH with Dynamic load Balancing (Debugging video) # {#SPH_dlb}
- *
- * \htmlonly
- * <img src="http://ppmcore.mpi-cbg.de/web/images/examples/7_SPH_dlb/dam_break_all.jpg"/>
- * \endhtmlonly
- *
- * ## Inclusion ## {#e7_sph_inclusion}
- *
- *
- * \snippet Vector/7_SPH_dlb/main.cpp inclusion
- *
- */
 
 //#define SE_CLASS1
 //#define STOP_ON_ERROR
diff --git a/openfpm_io b/openfpm_io
index 4b2e3c09b..59be4882c 160000
--- a/openfpm_io
+++ b/openfpm_io
@@ -1 +1 @@
-Subproject commit 4b2e3c09bc57921cf0e1bb2f1056ec93cbb9b090
+Subproject commit 59be4882c3060f71b10057707b3b2699a01a2643
diff --git a/src/Grid/grid_dist_id.hpp b/src/Grid/grid_dist_id.hpp
index 7aaacaed4..e053a9707 100644
--- a/src/Grid/grid_dist_id.hpp
+++ b/src/Grid/grid_dist_id.hpp
@@ -60,7 +60,7 @@ struct Box_fix
  * ### Synchronize a distributed grid for complex structures
  * \snippet grid_dist_id_unit_test.cpp Synchronized distributed grid complex
  * ### Usage of a grid dist iterator sub
- * \snippet grid_dist_id_unit_test.cpp Usage of a sub_grid iterator
+ * \snippet grid_dist_id_iterators_unit_tests.hpp Usage of a sub_grid iterator
  * ### Construct two grid with the same decomposition
  * \snippet grid_dist_id_unit_test.cpp Construct two grid with the same decomposition
  *
diff --git a/src/Grid/tests/grid_dist_id_unit_test.cpp b/src/Grid/tests/grid_dist_id_unit_test.cpp
index 6c077f7b9..7b8b480c3 100644
--- a/src/Grid/tests/grid_dist_id_unit_test.cpp
+++ b/src/Grid/tests/grid_dist_id_unit_test.cpp
@@ -290,8 +290,6 @@ void Test1D(const Box<1,float> & domain, long int k)
 	{
 		BOOST_TEST_CHECKPOINT( "Testing 1D grid k=" << k );
 
-		//! [Create and access a distributed grid]
-
 		// grid size
 		size_t sz[1];
 		sz[0] = k;
@@ -329,8 +327,6 @@ void Test1D(const Box<1,float> & domain, long int k)
 			++dom;
 		}
 
-		//! [Create and access a distributed grid]
-
 		// Get the virtual cluster machine
 		Vcluster<> & vcl = g_dist.getVC();
 
diff --git a/src/Vector/cuda/vector_dist_gpu_unit_tests.cu b/src/Vector/cuda/vector_dist_gpu_unit_tests.cu
index 8bb80a1e3..9f8ff1faa 100644
--- a/src/Vector/cuda/vector_dist_gpu_unit_tests.cu
+++ b/src/Vector/cuda/vector_dist_gpu_unit_tests.cu
@@ -430,8 +430,14 @@ void vdist_calc_gpu_test()
 	// Boundary conditions
 	size_t bc[3]={PERIODIC,PERIODIC,PERIODIC};
 
+	//! [Create a gpu vector]
+
 	vector_dist_gpu<3,St,aggregate<St,St[3],St[3]>> vd(1000,domain,bc,g);
 
+	//! [Create a gpu vector]
+
+	//! [Fill gpu vector and move to GPU]
+
 	srand(v_cl.rank()*10000);
 	auto it = vd.getDomainIterator();
 
@@ -463,6 +469,8 @@ void vdist_calc_gpu_test()
 	// Ok we redistribute the particles (GPU based)
 	vd.map(RUN_ON_DEVICE);
 
+	//! [Fill gpu vector and move to GPU]
+
 	vd.deviceToHostPos();
 	vd.template deviceToHostProp<0,1,2>();
 
@@ -577,8 +585,13 @@ void vdist_calc_gpu_test()
 		}
 
 		vd_cpu.template ghost_get<0,1,2>();
+
+		//! [Fill the ghost on GPU]
+
 		vd.template ghost_get<0,1,2>(RUN_ON_DEVICE);
 
+		//! [Fill the ghost on GPU]
+
 		vd.deviceToHostPos();
 		vd.template deviceToHostProp<0,1,2>();
 
diff --git a/src/Vector/vector_dist.hpp b/src/Vector/vector_dist.hpp
index 325e3c3f4..5e924f579 100644
--- a/src/Vector/vector_dist.hpp
+++ b/src/Vector/vector_dist.hpp
@@ -191,20 +191,29 @@ enum reorder_opt
 
 /*! \brief Distributed vector
  *
- * This class reppresent a distributed vector, the distribution of the structure
+ * This class represent a distributed vector, the distribution of the structure
  * is based on the positional information of the elements the vector store
  *
  * ## Create a vector of random elements on each processor 2D
- * \snippet vector_dist_unit_test.hpp Create a vector of random elements on each processor 2D
+ * \snippet Vector/tests/vector_dist_unit_test.cpp Create a vector of random elements on each processor 2D
  *
  * ## Create a vector of random elements on each processor 3D
- * \snippet vector_dist_unit_test.hpp Create a vector of random elements on each processor 3D
+ * \snippet Vector/tests/vector_dist_unit_test.cpp Create a vector of random elements on each processor 3D
  *
  * ## Create a vector of elements distributed on a grid like way
- * \snippet vector_dist_unit_test.hpp Create a vector of elements distributed on a grid like way
+ * \snippet Vector/tests/vector_dist_unit_test.cpp Create a vector of elements distributed on a grid like way
  *
  * ## Redistribute the particles and sync the ghost properties
- * \snippet vector_dist_unit_test.hpp Redistribute the particles and sync the ghost properties
+ * \snippet Vector/tests/vector_dist_unit_test.cpp Redistribute the particles and sync the ghost properties
+ *
+ * ## Create a gpu distributed vector [St = float or double]
+ * \snippet Vector/cuda/vector_dist_gpu_unit_tests.cu Create a gpu vector
+ *
+ * ## Fill a GPU vector_dist on CPU and move the information to GPU and redistribute [St = float or double]
+ * \snippet Vector/cuda/vector_dist_gpu_unit_tests.cu Fill gpu vector and move to GPU
+ *
+ * ## Fill the ghost on GPU
+ * \snippet Vector/cuda/vector_dist_gpu_unit_tests.cu Fill the ghost on GPU
  *
  * \tparam dim Dimensionality of the space where the elements lives
  * \tparam St type of space float, double ...
-- 
GitLab