From 926a696f9e4017832a8838fa4e753f4dcbd96c78 Mon Sep 17 00:00:00 2001
From: Incardona Pietro <incardon@mpi-cbg.de>
Date: Mon, 15 Mar 2021 11:50:24 +0100
Subject: [PATCH] Fixing documentation for Sparse grid

---
 .../1_gray_scott_3d_sparse/main.cpp           |  23 +-
 .../1_gray_scott_3d_sparse_gpu/main.cu        |   4 +
 .../Makefile                                  |   0
 .../config.cfg                                |   0
 .../main.cu                                   |  59 +-
 .../Makefile                                  |   0
 .../config.cfg                                |   0
 .../main.cpp                                  |   8 +-
 .../Makefile                                  |   0
 .../config.cfg                                |   0
 .../main.cpp                                  |   6 +-
 .../Makefile                                  |   0
 .../config.cfg                                |   0
 .../main.cu                                   |   0
 .../Makefile                                  |  39 ++
 .../config.cfg                                |   0
 .../3_gray_scott_3d_sparse_gpu_cs_opt/main.cu | 544 ++++++++++++++++++
 .../Makefile                                  |  26 +
 .../config.cfg                                |   2 +
 .../main.cpp                                  | 426 ++++++++++++++
 .../5_gray_scott_3d_surface_cs_opt/Makefile   |  26 +
 .../5_gray_scott_3d_surface_cs_opt/config.cfg |   2 +
 .../5_gray_scott_3d_surface_cs_opt/main.cpp   | 472 +++++++++++++++
 .../Makefile                                  |   0
 .../config.cfg                                |   2 +
 .../main.cu                                   |   0
 .../Makefile                                  |  39 ++
 .../config.cfg                                |   2 +
 .../main.cu                                   | 284 +++++++++
 .../SparseGrid/8_filling_benchmark/Makefile   |  26 +
 .../SparseGrid/8_filling_benchmark/config.cfg |   2 +
 .../SparseGrid/8_filling_benchmark/main.cpp   | 226 ++++++++
 .../8_filling_benchmark_gpu/Makefile          |  39 ++
 .../8_filling_benchmark_gpu/config.cfg        |   2 +
 .../8_filling_benchmark_gpu/main.cu           | 221 +++++++
 example/Vector/1_gpu_first_step/Makefile      |   7 +-
 example/Vector/1_gpu_first_step/main.cu       |  21 +-
 example/Vector/7_SPH_dlb_gpu/Makefile         |   7 +-
 example/Vector/7_SPH_dlb_gpu/main.cu          |  77 ++-
 39 files changed, 2498 insertions(+), 94 deletions(-)
 rename example/SparseGrid/{1_gray_scott_3d_sparse_gpu_opt => 2_gray_scott_3d_sparse_gpu_opt}/Makefile (100%)
 rename example/SparseGrid/{1_gray_scott_3d_sparse_gpu_cs => 2_gray_scott_3d_sparse_gpu_opt}/config.cfg (100%)
 rename example/SparseGrid/{1_gray_scott_3d_sparse_gpu_opt => 2_gray_scott_3d_sparse_gpu_opt}/main.cu (75%)
 rename example/SparseGrid/{1_gray_scott_3d_sparse_opt => 2_gray_scott_3d_sparse_opt}/Makefile (100%)
 rename example/SparseGrid/{1_gray_scott_3d_sparse_cs => 2_gray_scott_3d_sparse_opt}/config.cfg (100%)
 rename example/SparseGrid/{1_gray_scott_3d_sparse_opt => 2_gray_scott_3d_sparse_opt}/main.cpp (96%)
 rename example/SparseGrid/{1_gray_scott_3d_sparse_cs => 3_gray_scott_3d_sparse_cs}/Makefile (100%)
 rename example/SparseGrid/{1_gray_scott_3d_sparse_opt => 3_gray_scott_3d_sparse_cs}/config.cfg (100%)
 rename example/SparseGrid/{1_gray_scott_3d_sparse_cs => 3_gray_scott_3d_sparse_cs}/main.cpp (98%)
 rename example/SparseGrid/{1_gray_scott_3d_sparse_gpu_cs => 3_gray_scott_3d_sparse_gpu_cs}/Makefile (100%)
 rename example/SparseGrid/{1_gray_scott_3d_sparse_gpu_opt => 3_gray_scott_3d_sparse_gpu_cs}/config.cfg (100%)
 rename example/SparseGrid/{1_gray_scott_3d_sparse_gpu_cs => 3_gray_scott_3d_sparse_gpu_cs}/main.cu (100%)
 create mode 100644 example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/Makefile
 rename example/SparseGrid/{1_gray_scott_3d_sparse_gpu_opt_weak_scal => 3_gray_scott_3d_sparse_gpu_cs_opt}/config.cfg (100%)
 create mode 100644 example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/main.cu
 create mode 100644 example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/Makefile
 create mode 100644 example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/config.cfg
 create mode 100644 example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/main.cpp
 create mode 100644 example/SparseGrid/5_gray_scott_3d_surface_cs_opt/Makefile
 create mode 100644 example/SparseGrid/5_gray_scott_3d_surface_cs_opt/config.cfg
 create mode 100644 example/SparseGrid/5_gray_scott_3d_surface_cs_opt/main.cpp
 rename example/SparseGrid/{1_gray_scott_3d_sparse_gpu_opt_weak_scal => 6_gray_scott_3d_sparse_gpu_opt_weak_scal}/Makefile (100%)
 create mode 100644 example/SparseGrid/6_gray_scott_3d_sparse_gpu_opt_weak_scal/config.cfg
 rename example/SparseGrid/{1_gray_scott_3d_sparse_gpu_opt_weak_scal => 6_gray_scott_3d_sparse_gpu_opt_weak_scal}/main.cu (100%)
 create mode 100644 example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/Makefile
 create mode 100644 example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/config.cfg
 create mode 100644 example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/main.cu
 create mode 100644 example/SparseGrid/8_filling_benchmark/Makefile
 create mode 100644 example/SparseGrid/8_filling_benchmark/config.cfg
 create mode 100644 example/SparseGrid/8_filling_benchmark/main.cpp
 create mode 100644 example/SparseGrid/8_filling_benchmark_gpu/Makefile
 create mode 100644 example/SparseGrid/8_filling_benchmark_gpu/config.cfg
 create mode 100644 example/SparseGrid/8_filling_benchmark_gpu/main.cu

diff --git a/example/SparseGrid/1_gray_scott_3d_sparse/main.cpp b/example/SparseGrid/1_gray_scott_3d_sparse/main.cpp
index 872edcb83..f26892a96 100644
--- a/example/SparseGrid/1_gray_scott_3d_sparse/main.cpp
+++ b/example/SparseGrid/1_gray_scott_3d_sparse/main.cpp
@@ -1,9 +1,10 @@
  /*! \page SparseGrid SparseGrid
  *
  * \subpage Grid_3_gs_3D_sparse
- * \subpage Grid_3_gs_3D_sparse_cs
- * \subpage Grid_3_gs_3D_sparse_opt
  * \subpage Grid_3_gs_3D_sparse_gpu
+ * \subpage Grid_3_gs_3D_sparse_opt
+ * \subpage Grid_3_gs_3D_sparse_gpu_opt
+ * \subpage Grid_3_gs_3D_sparse_cs
  * \subpage Grid_3_gs_3D_sparse_gpu_cs
  *
  */
@@ -14,14 +15,13 @@
 
 /*!
  *
- * \page Grid_3_gs_3D_sparse Gray Scott in 3D using sparse grids
+ * \page Grid_3_gs_3D_sparse Gray Scott in 3D using sparse grids
  *
  * [TOC]
  *
- * # Solving a gray scott-system in 3D using Sparse grids# {#e3_gs_gray_scott_sparse}
+ * # Solving a gray scott-system in 3D using Sparse grids # {#e3_gs_gray_scott_sparse}
  *
- * This example show how to solve a Gray-Scott system in 3D using sparse grids in this case we well use a more
- * complex geometry
+ * This example shows how to solve a Gray-Scott system in 3D using sparse grids. The problem is the same as \ref Grid_3_gs_3D
  *
  * In figure is the final solution of the problem
  *
@@ -34,16 +34,7 @@
  * \see \ref Grid_3_gs_3D
  *
  *
- * We recall here the main differences between sparse and dense.
- *
- * * **get** function return now constant values, so cannot be used to get values, a get in write is an insert
- *   a get on a point position that has not been inserted return the background value
- *
- * * **insert** function create/overwrite the points value
- *
- * * **getDomainIterator** return an iterator on the existing points
- *
- * * **getGridIterator** return an iterator on the dense version of the grid
+ * 
  *
  *
  * 
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu b/example/SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu
index cd0eb3077..794f5bc08 100644
--- a/example/SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu
+++ b/example/SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu
@@ -72,8 +72,12 @@ constexpr int x = 0;
 constexpr int y = 1;
 constexpr int z = 2;
 
+//! \cond [grid definition] \endcond
+
 typedef sgrid_dist_id_gpu<3,float,aggregate<float,float,float,float> > SparseGridType;
 
+//! \cond [grid definition] \endcond
+
 void init(SparseGridType & grid, Box<3,float> & domain)
 {
 	//! \cond [create points] \endcond
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_gpu_opt/Makefile b/example/SparseGrid/2_gray_scott_3d_sparse_gpu_opt/Makefile
similarity index 100%
rename from example/SparseGrid/1_gray_scott_3d_sparse_gpu_opt/Makefile
rename to example/SparseGrid/2_gray_scott_3d_sparse_gpu_opt/Makefile
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_gpu_cs/config.cfg b/example/SparseGrid/2_gray_scott_3d_sparse_gpu_opt/config.cfg
similarity index 100%
rename from example/SparseGrid/1_gray_scott_3d_sparse_gpu_cs/config.cfg
rename to example/SparseGrid/2_gray_scott_3d_sparse_gpu_opt/config.cfg
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_gpu_opt/main.cu b/example/SparseGrid/2_gray_scott_3d_sparse_gpu_opt/main.cu
similarity index 75%
rename from example/SparseGrid/1_gray_scott_3d_sparse_gpu_opt/main.cu
rename to example/SparseGrid/2_gray_scott_3d_sparse_gpu_opt/main.cu
index 31f24b12f..7a6f2e643 100644
--- a/example/SparseGrid/1_gray_scott_3d_sparse_gpu_opt/main.cu
+++ b/example/SparseGrid/2_gray_scott_3d_sparse_gpu_opt/main.cu
@@ -1,3 +1,8 @@
+//#define VCLUSTER_PERF_REPORT <------ Activate telemetry for the VCluster data-structure
+//#define SYNC_BEFORE_TAKE_TIME <------ Force synchronization of the kernels every time we take the time with the structure timer.
+//                                      Use this option for telemetry on GPU, otherwise the results are unreliable
+//#define ENABLE_GRID_DIST_ID_PERF_STATS  <------ Activate telemetry for the grid data-structure
+
 #include "Decomposition/Distribution/BoxDistribution.hpp"
 #include "Grid/grid_dist_id.hpp"
 #include "data_type/aggregate.hpp"
@@ -5,11 +10,11 @@
 
 /*!
  *
- * \page Grid_3_gs_3D_sparse_gpu Gray Scott in 3D using sparse grids on GPU
+ * \page Grid_3_gs_3D_sparse_gpu_opt Gray Scott in 3D using sparse grids on GPU (Optimized)
  *
  * [TOC]
  *
- * # Solving a gray scott-system in 3D using Sparse grids on gpu # {#e3_gs_gray_scott_gpu}
+ * # Solving a gray scott-system in 3D using Sparse grids on gpu (Optimized) # {#e3_gs_gray_scott_gpu}
  *
  * This example show how to solve a Gray-Scott system in 3D using sparse grids on gpu
  *
@@ -23,41 +28,15 @@
  *
  * \see \ref Grid_3_gs_3D
  *
- * # Initializetion
- *
- * On gpu we can add points using the function addPoints this function take 2 lamda functions the first take 3 arguments (in 3D)
- * i,j,k these are the global coordinates for a point. We can return either true either false. In case of true the point is
- * created in case of false the point is not inserted. The second lamda is instead used to initialize the point inserted.
- * The arguments of the second lambda are the data argument we use to initialize the point and the global coordinates i,j,k
- *
- * After we add the points we have to flush the added points. This us achieved using the function flush the template parameters
- * indicate how we have to act on the points. Consider infact we are adding points already exist ... do we have to add it using the max
- * or the min. **FLUSH_ON_DEVICE** say instead that the operation is performed using the GPU
- *
- * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu create points
- *
- * The function can also called with a specified range
- *
- * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu create points sub
- *
- * # Update
+ * # Optimizations
  *
- * to calculate the right-hand-side we use the function **conv2** this function can be used to do a convolution that involve
- * two properties
+ * Instead of using the default decomposition algorithm based on parmetis we use BoxDistribution. This decomposition divides the space equally
+ * across processors. To use a different algorithm for decomposing the sparse grid, change the type of the Sparse grid
+ * 
+ * \snippet SparseGrid/2_gray_scott_3d_sparse_gpu_opt/main.cu grid definition
  *
- * The function accept a lambda function where the first 2 arguments are the output of the same type of the two property choosen.
- *
- * The arguments 3 and 4 contain the properties of two selected properties. while i,j,k are the coordinates we have to calculate the
- * convolution. The call **conv2** also accept template parameters the first two indicate the source porperties, the other two are the destination properties. While the
- * last is the extension of the stencil. In this case we use 1.
- *
- * The lambda function is defined as
- *
- * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu lambda
- *
- * and used in the body loop
- *
- * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu body
+ * Because the geometry is fixed we also use the option SKIP_LABELLING. With this option active, after a normal ghost_get we are able to
+ * activate certain optimization patterns in the construction of the sending buffers and in merging data.
  *
  */
 
@@ -73,10 +52,14 @@ constexpr int x = 0;
 constexpr int y = 1;
 constexpr int z = 2;
 
+//! \cond [grid definition] \endcond
+
 typedef CartDecomposition<3,float, CudaMemory, memory_traits_inte, BoxDistribution<3,float> > Dec;
 
 typedef sgrid_dist_id_gpu<3,float,aggregate<float,float,float,float>,CudaMemory, Dec> SparseGridType;
 
+//! \cond [grid definition] \endcond
+
 void init(SparseGridType & grid, Box<3,float> & domain)
 {
 	//! \cond [create points] \endcond
@@ -210,13 +193,13 @@ int main(int argc, char* argv[])
 
 				u_out = uc + uFactor *(u(i-1,j,k) + u(i+1,j,k) +
                                                        u(i,j-1,k) + u(i,j+1,k) +
-                                                       u(i,j,k-1) + u(i,j,k+1) - 6.0*uc) - deltaT * uc*vc*vc
-                                                       - deltaT * F * (uc - 1.0);
+                                                       u(i,j,k-1) + u(i,j,k+1) - 6.0f*uc) - deltaT * uc*vc*vc
+                                                       - deltaT * F * (uc - 1.0f);
 
 
 				v_out = vc + vFactor *(v(i-1,j,k) + v(i+1,j,k) +
                                                        v(i,j+1,k) + v(i,j-1,k) +
-                                                       v(i,j,k-1) + v(i,j,k+1) - 6.0*vc) + deltaT * uc*vc*vc
+                                                       v(i,j,k-1) + v(i,j,k+1) - 6.0f*vc) + deltaT * uc*vc*vc
 					               - deltaT * (F+K) * vc;
 				};
 
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_opt/Makefile b/example/SparseGrid/2_gray_scott_3d_sparse_opt/Makefile
similarity index 100%
rename from example/SparseGrid/1_gray_scott_3d_sparse_opt/Makefile
rename to example/SparseGrid/2_gray_scott_3d_sparse_opt/Makefile
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_cs/config.cfg b/example/SparseGrid/2_gray_scott_3d_sparse_opt/config.cfg
similarity index 100%
rename from example/SparseGrid/1_gray_scott_3d_sparse_cs/config.cfg
rename to example/SparseGrid/2_gray_scott_3d_sparse_opt/config.cfg
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp b/example/SparseGrid/2_gray_scott_3d_sparse_opt/main.cpp
similarity index 96%
rename from example/SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp
rename to example/SparseGrid/2_gray_scott_3d_sparse_opt/main.cpp
index ef1df5864..ac02c2ca2 100644
--- a/example/SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp
+++ b/example/SparseGrid/2_gray_scott_3d_sparse_opt/main.cpp
@@ -26,7 +26,7 @@
  *
  * Two optimization has been done. The first is to change the layout to struct of arrays defining the grid with
  *
- * \snippet SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp grid definition
+ * \snippet SparseGrid/2_gray_scott_3d_sparse_opt/main.cpp grid definition
  *
  * The second is using the function **conv_cross2** to calculate the right-hand-side
  * this function can be used to do a convolution that involve points in a cross stencil like in figure that involve
@@ -60,11 +60,11 @@
  *
  * The lambda function is defined as
  *
- * \snippet SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp lambda
+ * \snippet SparseGrid/2_gray_scott_3d_sparse_opt/main.cpp lambda
  *
  * and used in the body loop
  *
- * \snippet SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp body
+ * \snippet SparseGrid/2_gray_scott_3d_sparse_opt/main.cpp body
  *
  * To note that instead of copy we split the properties where we are acting at every iteration
  *
@@ -263,7 +263,7 @@ int main(int argc, char* argv[])
 	 *
 	 * Deinitialize the library
 	 *
-	 * \snippet SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp finalize
+	 * \snippet SparseGrid/2_gray_scott_3d_sparse_opt/main.cpp finalize
 	 *
 	 */
 
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_cs/Makefile b/example/SparseGrid/3_gray_scott_3d_sparse_cs/Makefile
similarity index 100%
rename from example/SparseGrid/1_gray_scott_3d_sparse_cs/Makefile
rename to example/SparseGrid/3_gray_scott_3d_sparse_cs/Makefile
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_opt/config.cfg b/example/SparseGrid/3_gray_scott_3d_sparse_cs/config.cfg
similarity index 100%
rename from example/SparseGrid/1_gray_scott_3d_sparse_opt/config.cfg
rename to example/SparseGrid/3_gray_scott_3d_sparse_cs/config.cfg
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_cs/main.cpp b/example/SparseGrid/3_gray_scott_3d_sparse_cs/main.cpp
similarity index 98%
rename from example/SparseGrid/1_gray_scott_3d_sparse_cs/main.cpp
rename to example/SparseGrid/3_gray_scott_3d_sparse_cs/main.cpp
index 9b468421d..98444f92f 100644
--- a/example/SparseGrid/1_gray_scott_3d_sparse_cs/main.cpp
+++ b/example/SparseGrid/3_gray_scott_3d_sparse_cs/main.cpp
@@ -51,18 +51,18 @@
  * The initialization involve the creation of 3 sphere and one cylinder channel connecting them in order to do it we
  * create an iterator over the grid (inserted and not inserted) point with **getGridIterator**
  *
- * \snippet SparseGrid/1_gray_scott_3d_sparse_cs/main.cpp init sphere channel
+ * \snippet SparseGrid/3_gray_scott_3d_sparse_cs/main.cpp init sphere channel
  *
  * After creating the domain we make a perturbation in the up sphere
  *
- * \snippet SparseGrid/1_gray_scott_3d_sparse_cs/main.cpp perturbation
+ * \snippet SparseGrid/3_gray_scott_3d_sparse_cs/main.cpp perturbation
  *
  * # Boundary conditions
  *
  * For this example we use mirror on direction X Y Z If the point is missing. If the point is missing in both direction than
  * the second derivative is considered zero
  *
- * \snippet SparseGrid/1_gray_scott_3d_sparse_cs/main.cpp boundary condition
+ * \snippet SparseGrid/3_gray_scott_3d_sparse_cs/main.cpp boundary condition
  *
  */
 
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_gpu_cs/Makefile b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs/Makefile
similarity index 100%
rename from example/SparseGrid/1_gray_scott_3d_sparse_gpu_cs/Makefile
rename to example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs/Makefile
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_gpu_opt/config.cfg b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs/config.cfg
similarity index 100%
rename from example/SparseGrid/1_gray_scott_3d_sparse_gpu_opt/config.cfg
rename to example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs/config.cfg
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_gpu_cs/main.cu b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs/main.cu
similarity index 100%
rename from example/SparseGrid/1_gray_scott_3d_sparse_gpu_cs/main.cu
rename to example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs/main.cu
diff --git a/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/Makefile b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/Makefile
new file mode 100644
index 000000000..50373464f
--- /dev/null
+++ b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/Makefile
@@ -0,0 +1,39 @@
+include ../../example.mk
+
+### internally the example disables its code via the preprocessor if not compiled with nvcc
+CUDA_CC=
+CUDA_CC_LINK=
+ifeq (, $(shell which nvcc))
+        CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+        INCLUDE_PATH_NVCC=
+        CUDA_CC_LINK=mpic++
+else
+        CUDA_CC=nvcc -ccbin=mpic++
+        CUDA_CC_LINK=nvcc -ccbin=mpic++
+endif
+
+gray_scott_sparse_gpu_test: OPT += -DTEST_RUN
+gray_scott_sparse_gpu_test: gray_scott_sparse_gpu
+
+CC=mpic++
+
+LDIR =
+
+OBJ = main.o
+
+%.o: %.cu
+	$(CUDA_CC) $(OPT) -O3 -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)
+
+gray_scott_sparse_gpu: $(OBJ)
+	$(CUDA_CC_LINK) -o $@ $^ $(LIBS_PATH) $(LIBS)
+
+all: gray_scott_sparse_gpu
+
+run: gray_scott_sparse_gpu_test
+	mpirun -np 4 ./gray_scott_sparse_gpu
+
+.PHONY: clean all run
+
+clean:
+	rm -f *.o *~ core gray_scott_sparse_gpu
+
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_gpu_opt_weak_scal/config.cfg b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/config.cfg
similarity index 100%
rename from example/SparseGrid/1_gray_scott_3d_sparse_gpu_opt_weak_scal/config.cfg
rename to example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/config.cfg
diff --git a/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/main.cu b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/main.cu
new file mode 100644
index 000000000..71e6019c7
--- /dev/null
+++ b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/main.cu
@@ -0,0 +1,544 @@
+//#define VCLUSTER_PERF_REPORT
+//#define SYNC_BEFORE_TAKE_TIME
+//#define ENABLE_GRID_DIST_ID_PERF_STATS
+#include "Grid/grid_dist_id.hpp"
+#include "data_type/aggregate.hpp"
+#include "timer.hpp"
+
+/*!
+ *
+ * \page Grid_3_gs_3D_sparse_gpu_cs_opt Gray Scott in 3D using sparse grids on gpu in complex geometry
+ *
+ * [TOC]
+ *
+ * # Solving a gray scott-system in 3D using Sparse grids on gpu in complex geometry # {#e3_gs_gray_scott_gpu_cs_opt}
+ *
+ * This example shows how to solve a Gray-Scott system in 3D using sparse grids on gpu with complex geometry
+ *
+ * In figure is the final solution of the problem
+ *
+ * \htmlonly
+<table border="1" bgcolor="black">
+  <tr>
+    <td>
+      <img src="http://ppmcore.mpi-cbg.de/web/images/examples/1_gray_scott_3d_sparse_cs/gs_3d_sparse_cs_section.png" style="width: 500px;" />
+    </td>
+    <td>
+      <img src="http://ppmcore.mpi-cbg.de/web/images/examples/1_gray_scott_3d_sparse_cs/gs_3d_sparse_cs.png" style="width: 500px;" />
+    </td>
+  </tr>
+</table>
+\endhtmlonly
+ *
+ * More or less this example is the same as \ref e3_gs_gray_scott_cs on gpu, using what we learned in \ref e3_gs_gray_scott_gpu
+ *
+ *
+ */
+
+#ifdef __NVCC__
+
+constexpr int U = 0;
+constexpr int V = 1;
+constexpr int U_next = 2;
+constexpr int V_next = 3;
+
+typedef sgrid_dist_id_gpu<3,double,aggregate<double,double,double,double> > sgrid_type;
+
+void init(sgrid_type & grid, Box<3,double> & domain)
+{
+	auto it = grid.getGridIterator();
+	Point<3,double> p[8]= {{0.35,0.35,0.35},
+	                       {0.35,2.0,2.0},
+	                       {2.0,0.35,2.0},
+	                       {2.0,2.0,0.35},
+	                       {0.35,0.35,2.0},
+	                       {0.35,2.0,0.35},
+			       {2.0,0.35,0.35},
+	                       {2.0,2.0,2.0}};
+
+	
+//	Point<3,double> u({1.0,0.0,0.0});
+//	Box<3,double> channel_box(p3,p1);
+
+	double spacing_x = grid.spacing(0);
+	double spacing_y = grid.spacing(1);
+	double spacing_z = grid.spacing(2);
+
+	typedef typename GetAddBlockType<sgrid_type>::type InsertBlockT;
+
+	// Draw spheres
+	for (int i = 0 ; i < 8 ; i++)
+	{
+		Sphere<3,double> sph(p[i],0.3);
+
+		Box<3,size_t> bx;
+
+		for (int i = 0 ; i < 3 ; i++)
+		{
+			bx.setLow(i,(size_t)((sph.center(i) - 0.31)/grid.spacing(i)));
+			bx.setHigh(i,(size_t)((sph.center(i) + 0.31)/grid.spacing(i)));
+		}
+
+		grid.addPoints(bx.getKP1(),bx.getKP2(),[spacing_x,spacing_y,spacing_z,sph] __device__ (int i, int j, int k)
+                                {
+                                                Point<3,double> pc({i*spacing_x,j*spacing_y,k*spacing_z});
+
+						// Check if the point is in the domain
+                                		if (sph.isInside(pc) )
+                                		{return true;}
+
+                                                return false;
+                                },
+                                [] __device__ (InsertBlockT & data, int i, int j, int k)
+                                {
+                                        data.template get<U>() = 1.0;
+                                        data.template get<V>() = 0.0;
+                                }
+                                );
+
+		grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE);
+		grid.removeUnusedBuffers();
+	}
+
+	//channels
+
+	Box<3,double> b({0.25,0.25,0.25},{2.1,2.1,2.1});
+
+	for (int k = 0 ; k < 3 ; k++)
+	{
+		for (int s = 0 ; s < 2 ; s++)
+		{
+			for (int i = 0 ; i < 2 ; i++)
+        		{
+				Point<3,double> u({1.0*(((s+i)%2) == 0 && k != 2),1.0*(((s+i+1)%2) == 0 && k != 2),(k == 2)*1.0});
+				Point<3,double> c({(i == 0)?0.35:2.0,(s == 0)?0.35:2.0,(k == 0)?0.35:2.0});
+
+                		Box<3,size_t> bx;
+
+                		for (int i = 0 ; i < 3 ; i++)
+                		{
+					if (c[i] == 2.0)
+					{
+						if (u[i] == 1.0)
+						{
+                                                	bx.setLow(i,(size_t)(0.34/grid.spacing(i)));
+                                                	bx.setHigh(i,(size_t)(2.01/grid.spacing(i)));
+						}
+						else
+						{
+                                                        bx.setLow(i,(size_t)((c[i] - 0.11)/grid.spacing(i)));
+                                                        bx.setHigh(i,(size_t)((c[i] + 0.11)/grid.spacing(i)));
+						}
+					}
+					else
+					{
+						if (u[i] == 1.0)
+						{
+                        				bx.setLow(i,(size_t)(0.34/grid.spacing(i)));
+                        				bx.setHigh(i,(size_t)(2.01/grid.spacing(i)));
+						}
+						else
+						{
+                                                        bx.setLow(i,(size_t)((c[i] - 0.11)/grid.spacing(i)));
+                                                        bx.setHigh(i,(size_t)((c[i] + 0.11)/grid.spacing(i)));
+						}
+					}
+                		}
+
+				grid.addPoints(bx.getKP1(),bx.getKP2(),[spacing_x,spacing_y,spacing_z,u,c,b] __device__ (int i, int j, int k)
+                                	{
+						Point<3,double> pc({i*spacing_x,j*spacing_y,k*spacing_z});
+                                                Point<3,double> pcs({i*spacing_x,j*spacing_y,k*spacing_z});
+                                                Point<3,double> vp;
+
+						// shift
+						pc -= c; 
+
+                                		// calculate the distance from the diagonal
+                                		vp.get(0) = pc.get(1)*u.get(2) - pc.get(2)*u.get(1);
+                                		vp.get(1) = pc.get(2)*u.get(0) - pc.get(0)*u.get(2);
+                                		vp.get(2) = pc.get(0)*u.get(1) - pc.get(1)*u.get(0);
+
+						double distance = vp.norm();
+
+                                                // Check if the point is in the domain
+                                                if (distance < 0.1 && b.isInside(pcs) == true )
+                                                {return true;}
+
+                                                return false;
+                                	},
+                                	[] __device__ (InsertBlockT & data, int i, int j, int k)
+                                	{
+                                        	data.template get<U>() = 1.0;
+                                        	data.template get<V>() = 0.0;
+                                	}
+                                );
+
+                		grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE);
+				grid.removeUnusedBuffers();
+			}
+		}
+	}
+
+	// cross channel
+	
+	int s = 0;
+	for (int s = 0 ; s < 2 ; s++)
+        {
+        	for (int i = 0 ; i < 2 ; i++)
+                {	
+			Point<3,double> c({(i == 0)?0.35:2.0,(s == 0)?0.35:2.0,0.35});
+			Point<3,double> u({(i == 0)?1.0:-1.0,(s == 0)?1.0:-1.0,1.0});
+
+			Box<3,size_t> bx;
+
+			for (int k = 0 ; k < 16; k++)
+			{
+				for (int s = 0 ; s < 3 ; s++)
+				{
+					if (u[s] > 0.0)
+					{
+						bx.setLow(s,(c[s] + k*(u[s]/9.0))/grid.spacing(s) );
+						bx.setHigh(s,(c[s] + (k+3)*(u[s]/9.0) )/ grid.spacing(s) );
+					}
+					else
+					{
+						bx.setLow(s,(c[s] + (k+3)*(u[s]/9.0) )/grid.spacing(s) );
+                                                bx.setHigh(s,(c[s] + k*(u[s]/9.0))/ grid.spacing(s) );
+					}
+				}
+
+				grid.addPoints(bx.getKP1(),bx.getKP2(),[spacing_x,spacing_y,spacing_z,u,c,b] __device__ (int i, int j, int k)
+        			{
+                			Point<3,double> pc({i*spacing_x,j*spacing_y,k*spacing_z});
+                        		Point<3,double> pcs({i*spacing_x,j*spacing_y,k*spacing_z});
+                        		Point<3,double> vp;
+
+                        		// shift
+                        		pc -= c;
+
+                        		// calculate the distance from the diagonal
+                        		vp.get(0) = pc.get(1)*u.get(2) - pc.get(2)*u.get(1);
+                        		vp.get(1) = pc.get(2)*u.get(0) - pc.get(0)*u.get(2);
+                        		vp.get(2) = pc.get(0)*u.get(1) - pc.get(1)*u.get(0);
+
+                        		double distance = vp.norm() / sqrt(3.0);
+
+                        		// Check if the point is in the domain
+                        		if (distance < 0.1 && b.isInside(pcs) == true )
+                        		{return true;}
+
+                        		return false;
+                  		},
+                  		[] __device__ (InsertBlockT & data, int i, int j, int k)
+                  		{
+                  			data.template get<U>() = 1.0;
+                        		data.template get<V>() = 0.0;
+                  		}
+        			);
+
+				grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE);
+				grid.removeUnusedBuffers();
+			}
+		}
+
+	}
+
+	long int x_start = grid.size(0)*1.95f/domain.getHigh(0);
+	long int y_start = grid.size(1)*1.95f/domain.getHigh(1);
+	long int z_start = grid.size(1)*1.95f/domain.getHigh(2);
+
+	long int x_stop = grid.size(0)*2.05f/domain.getHigh(0);
+	long int y_stop = grid.size(1)*2.05f/domain.getHigh(1);
+	long int z_stop = grid.size(1)*2.05f/domain.getHigh(2);
+
+	grid_key_dx<3> start({x_start,y_start,z_start});
+	grid_key_dx<3> stop ({x_stop,y_stop,z_stop});
+
+        grid.addPoints(start,stop,[] __device__ (int i, int j, int k)
+                                {
+                                                return true;
+                                },
+                                [] __device__ (InsertBlockT & data, int i, int j, int k)
+                                {
+                                        data.template get<U>() = 0.5;
+                                        data.template get<V>() = 0.24;
+                                }
+                                );
+
+	grid.template flush<smin_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE);
+
+	grid.removeUnusedBuffers();
+}
+
+
+int main(int argc, char* argv[])
+{
+	openfpm_init(&argc,&argv);
+
+	// domain
+	Box<3,double> domain({0.0,0.0,0.0},{2.5,2.5,2.5});
+	
+	// grid size
+        size_t sz[3] = {384,384,384};
+
+	// Define periodicity of the grid
+	periodicity<3> bc = {NON_PERIODIC,NON_PERIODIC,NON_PERIODIC};
+	
+	// Ghost in grid unit
+	Ghost<3,long int> g(1);
+	
+	// deltaT
+	double deltaT = 0.2;
+
+	// Diffusion constant for specie U
+	double du = 2*1e-5;
+
+	// Diffusion constant for specie V
+	double dv = 1*1e-5;
+
+#ifdef TEST_RUN
+        // Number of timesteps
+        size_t timeSteps = 300;
+#else
+	// Number of timesteps
+        size_t timeSteps = 50000;
+#endif
+
+	// K and F (Physical constant in the equation)
+        double K = 0.053;
+        double F = 0.014;
+
+	sgrid_type grid(sz,domain,g,bc);
+
+	grid.template setBackgroundValue<0>(-0.5);
+	grid.template setBackgroundValue<1>(-0.5);
+	grid.template setBackgroundValue<2>(-0.5);
+	grid.template setBackgroundValue<3>(-0.5);
+	
+	// spacing of the grid on x and y
+	double spacing[3] = {grid.spacing(0),grid.spacing(1),grid.spacing(2)};
+
+	init(grid,domain);
+
+	// sync the ghost
+	grid.template ghost_get<U,V>(RUN_ON_DEVICE);
+
+	// because we assume that spacing[x] == spacing[y] we use formula 2
+	// and we calculate the prefactor of Eq 2
+	double uFactor = deltaT * du/(spacing[0]*spacing[0]);
+	double vFactor = deltaT * dv/(spacing[0]*spacing[0]);
+
+	grid.template deviceToHost<U,V>();
+
+	timer tot_sim;
+	tot_sim.start();
+
+	for (size_t i = 0; i < timeSteps; ++i)
+	{
+		//! \cond [stencil get and use] \endcond
+
+        		typedef typename GetCpBlockType<decltype(grid),0,1>::type CpBlockType;
+
+        		auto func = [uFactor,vFactor,deltaT,F,K] __device__ (double & u_out, double & v_out,
+        				                                   CpBlockType & u, CpBlockType & v,
+        				                                   int i, int j, int k){
+
+        				double uc = u(i,j,k);
+        				double vc = v(i,j,k);
+
+        				double u_px = u(i+1,j,k);
+        				double u_mx = u(i-1,j,k);
+
+        				double u_py = u(i,j+1,k);
+        				double u_my = u(i,j-1,k);
+
+        				double u_pz = u(i,j,k+1);
+        				double u_mz = u(i,j,k-1);
+
+        				double v_px = v(i+1,j,k);
+        				double v_mx = v(i-1,j,k);
+
+        				double v_py = v(i,j+1,k);
+        				double v_my = v(i,j-1,k);
+
+        				double v_pz = v(i,j,k+1);
+        				double v_mz = v(i,j,k-1);
+
+        				// U fix
+
+        				if (u_mx < -0.1 && u_px < -0.1)
+        				{
+        					u_mx = uc;
+        					u_px = uc;
+        				}
+
+        				if (u_mx < -0.1)
+        				{u_mx = u_px;}
+
+        				if (u_px < -0.1)
+        				{u_px = u_mx;}
+
+        				if (u_my < -0.1 && u_py < -0.1)
+        				{
+        					u_my = uc;
+        					u_py = uc;
+        				}
+
+        				if (u_my < -0.1)
+        				{u_my = u_py;}
+
+        				if (u_py < -0.1)
+        				{u_py = u_my;}
+
+        				if (u_mz < -0.1 && u_pz < -0.1)
+        				{
+        					u_mz = uc;
+        					u_pz = uc;
+        				}
+
+        				if (u_mz < -0.1)
+        				{u_mz = u_pz;}
+
+        				if (u_pz < -0.1)
+        				{u_pz = u_mz;}
+
+        				// V fix: missing (background) neighbors are replaced with the V center value vc
+
+        				if (v_mx < -0.1 && v_px < -0.1)
+        				{
+        					v_mx = vc;
+        					v_px = vc;
+        				}
+
+        				if (v_mx < -0.1)
+        				{v_mx = v_px;}
+
+        				if (v_px < -0.1)
+        				{v_px = v_mx;}
+
+        				if (v_my < -0.1 && v_py < -0.1)
+        				{
+        					v_my = vc;
+        					v_py = vc;
+        				}
+
+        				if (v_my < -0.1)
+        				{v_my = v_py;}
+
+        				if (v_py < -0.1)
+        				{v_py = v_my;}
+
+        				if (v_mz < -0.1 && v_pz < -0.1)
+        				{
+        					v_mz = vc;
+        					v_pz = vc;
+        				}
+
+        				if (v_mz < -0.1)
+        				{v_mz = v_pz;}
+
+        				if (v_pz < -0.1)
+        				{v_pz = v_mz;}
+
+        				u_out = uc + uFactor *(u_mx + u_px +
+                                                               u_my + u_py +
+                                                               u_mz + u_pz - 6.0*uc) - deltaT * uc*vc*vc
+                                                               - deltaT * F * (uc - 1.0);
+
+
+        				v_out = vc + vFactor *(v_mx + v_px +
+                                                               v_py + v_my +
+                                                               v_mz + v_pz - 6.0*vc) + deltaT * uc*vc*vc
+        					               - deltaT * (F+K) * vc;
+
+        				};
+
+        		if (i % 2 == 0)
+        		{
+        			grid.conv2<U,V,U_next,V_next,1>({0,0,0},{(long int)sz[0]-1,(long int)sz[1]-1,(long int)sz[2]-1},func);
+
+				cudaDeviceSynchronize();
+
+        			// After copy we synchronize again the ghost part U and V
+
+        			grid.ghost_get<U_next,V_next>(RUN_ON_DEVICE | SKIP_LABELLING);
+        		}
+        		else
+        		{
+        			grid.conv2<U_next,V_next,U,V,1>({0,0,0},{(long int)sz[0]-1,(long int)sz[1]-1,(long int)sz[2]-1},func);
+
+				cudaDeviceSynchronize();
+
+        			// After copy we synchronize again the ghost part U and V
+        			grid.ghost_get<U,V>(RUN_ON_DEVICE | SKIP_LABELLING);
+        		}
+
+		//! \cond [stencil get and use] \endcond
+
+		// After copy we synchronize again the ghost part U and V
+
+		// Every 500 time step we output the configuration for
+		// visualization
+/*		if (i % 500 == 0)
+		{
+			grid.save("output_" + std::to_string(count));
+			count++;
+		}*/
+
+                std::cout << "STEP: " << i  << std::endl;
+/*                if (i % 300 == 0)
+                {
+                	grid.template deviceToHost<U,V>();
+                        grid.write_frame("out",i);
+                }*/
+	}
+	
+	tot_sim.stop();
+	std::cout << "Total simulation: " << tot_sim.getwct() << std::endl;
+
+	grid.print_stats();
+
+	create_vcluster().print_stats();
+
+	grid.template deviceToHost<U,V>();
+	grid.write("Final");
+
+	//! \cond [time stepping] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse_gpu_cs Gray Scott in 3D using sparse grids on gpu in complex geometry
+	 *
+	 * ## Finalize ##
+	 *
+	 * Deinitialize the library
+	 *
+	 * \snippet  SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/main.cu finalize
+	 *
+	 */
+
+	//! \cond [finalize] \endcond
+
+	openfpm_finalize();
+
+	//! \cond [finalize] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse_gpu_cs Gray Scott in 3D using sparse grids on gpu in complex geometry
+	 *
+	 * # Full code # {#code}
+	 *
+	 * \include SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/main.cu
+	 *
+	 */
+}
+
+#else
+
+int main(int argc, char* argv[])
+{
+        return 0;
+}
+
+#endif
+
diff --git a/example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/Makefile b/example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/Makefile
new file mode 100644
index 000000000..54f2679d3
--- /dev/null
+++ b/example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/Makefile
@@ -0,0 +1,26 @@
+include ../../example.mk
+
+CC=mpic++
+
+LDIR =
+
+OBJ = main.o
+gray_scott_sparse_surface_cs_test: OPT += -DTEST_RUN
+gray_scott_sparse_surface_cs_test: gray_scott_sparse_surface_cs
+
+%.o: %.cpp
+	$(CC) -O3 -g $(OPT) -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+
+gray_scott_sparse_surface_cs: $(OBJ)
+	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
+
+all: gray_scott_sparse_surface_cs
+
+run: gray_scott_sparse_surface_cs_test
+	mpirun -np 4 ./gray_scott_sparse_surface_cs
+
+.PHONY: clean all run
+
+clean:
+	rm -f *.o *~ core gray_scott_sparse_surface_cs
+
diff --git a/example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/config.cfg b/example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/config.cfg
new file mode 100644
index 000000000..1eecbac35
--- /dev/null
+++ b/example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/config.cfg
@@ -0,0 +1,2 @@
+[pack]
+files = main.cpp Makefile
diff --git a/example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/main.cpp b/example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/main.cpp
new file mode 100644
index 000000000..33eea4a72
--- /dev/null
+++ b/example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/main.cpp
@@ -0,0 +1,426 @@
+#include "Grid/grid_dist_id.hpp"
+#include "data_type/aggregate.hpp"
+#include "timer.hpp"
+
+/*!
+ *
+ *
+ */
+
+constexpr int U = 0;
+constexpr int V = 1;
+constexpr int phi = 2;
+constexpr int normal = 3;
+constexpr int tgrad_u = 4;
+constexpr int tgrad_v = 5;
+constexpr int U_next = 6;
+constexpr int V_next = 7;
+
+constexpr int x = 0;
+constexpr int y = 1;
+constexpr int z = 2;
+
+typedef sgrid_dist_id<3,double,aggregate<double,double,double,double[3],double[3],double[3],double,double> > sgrid_type;
+
+void init(sgrid_type & grid, Box<3,double> & domain)
+{
+	//! \cond [init sphere channel] \endcond
+
+	auto it = grid.getGridIterator();
+	Point<3,double> p1({0.5,0.5,0.5});
+
+	double sx = grid.spacing(0);
+
+	Sphere<3,double> sph1(p1,0.3);
+	Sphere<3,double> sph2(p1,0.3 - sx*10);
+	Sphere<3,double> sph_zero(p1,0.3 - sx*5);
+
+	while (it.isNext())
+	{
+		// Get the local grid key
+		auto key = it.get_dist();
+		auto keyg = it.get();
+
+		Point<3,double> pc;
+		Point<3,double> vp;
+
+		for (int i = 0 ; i < 3 ; i++)
+        {pc.get(i) = keyg.get(i) * it.getSpacing(i);}
+
+		// Check if the point is in the first sphere
+		if (sph1.isInside(pc) == true && sph2.isInside(pc) == false)
+		{
+			Point<3,double> pn = pc - p1;
+			pn /= pn.norm();
+			double theta = acos(pn * Point<3,double>({0.0,0.0,1.0}));
+			Point<3,double> pn_ = pn;
+			pn_[2] = 0.0;
+			pn_ /= pn_.norm();
+			double aphi = acos(pn_ * Point<3,double>({1.0,0.0,0.0}));
+
+			// Create a perturbation in the solid angle
+			if (theta > 0.6 && theta < 0.8 && aphi > 0.0 && aphi < 0.2)
+			{
+				grid.template insert<U>(key) = 0.5;
+				grid.template insert<V>(key) = 0.25;
+			}
+			else
+			{
+				grid.template insert<U>(key) = 1.0;
+				grid.template insert<V>(key) = 0.0;
+			}
+			grid.template insert<phi>(key) = sph_zero.distance(pc);
+			grid.template insert<normal>(key)[0] = pn[0];
+			grid.template insert<normal>(key)[1] = pn[1];
+			grid.template insert<normal>(key)[2] = pn[2];
+
+			// Old values U and V
+			grid.template insert<U_next>(key) = 0.0;
+			grid.template insert<V_next>(key) = 0.0;
+		}
+
+		++it;
+	}
+
+	//! \cond [init sphere channel] \endcond
+}
+
+template<unsigned int U_src,unsigned int V_src,unsigned int U_dst, unsigned int V_dst>
+void extend(sgrid_type & grid)
+{
+	double delta = 1e-10;
+	double max = 0.0;
+	auto it = grid.getDomainIterator();
+
+	while (it.isNext())
+	{
+		// center point
+		auto Cp = it.get();
+
+		// plus,minus X,Y,Z
+		auto mx = Cp.move(0,-1);
+		auto px = Cp.move(0,+1);
+		auto my = Cp.move(1,-1);
+		auto py = Cp.move(1,1);
+		auto mz = Cp.move(2,-1);
+		auto pz = Cp.move(2,1);
+
+		double s = grid.get<phi>(Cp) / sqrt(fabs(grid.get<phi>(Cp)) + delta);
+
+		double Uext = 0.0;
+		double Vext = 0.0;
+
+		double dir = s*grid.get<normal>(Cp)[x];
+
+		if (dir > 0)
+		{
+			Uext += dir * (grid.get<U_src>(Cp) - grid.get<U_src>(mx));
+			Vext += dir * (grid.get<V_src>(Cp) - grid.get<V_src>(mx));
+		}
+		else if (dir < 0)
+		{
+			Uext += dir * (grid.get<U_src>(px) - grid.get<U_src>(Cp));
+			Vext += dir * (grid.get<V_src>(px) - grid.get<V_src>(Cp));
+		}
+
+
+		dir = s*grid.get<normal>(Cp)[y];
+		if (dir > 0)
+		{
+			Uext += dir * (grid.get<U_src>(Cp) - grid.get<U_src>(my));
+			Vext += dir * (grid.get<V_src>(Cp) - grid.get<V_src>(my));
+		}
+		else if (dir < 0)
+		{
+			Uext += dir * (grid.get<U_src>(py) - grid.get<U_src>(Cp));
+			Vext += dir * (grid.get<V_src>(py) - grid.get<V_src>(Cp));
+		}
+
+		dir = s*grid.get<normal>(Cp)[z];
+		if (dir > 0)
+		{
+			Uext += dir * (grid.get<U_src>(Cp) - grid.get<U_src>(mz));
+			Vext += dir * (grid.get<V_src>(Cp) - grid.get<V_src>(mz));
+		}
+		else if (dir < 0)
+		{
+			Uext += dir * (grid.get<U_src>(pz) - grid.get<U_src>(Cp));
+			Vext += dir * (grid.get<V_src>(pz) - grid.get<V_src>(Cp));
+		}
+
+		if (Uext >= max)
+		{
+			max = Uext;
+		}
+
+		grid.insert<U_dst>(Cp) = grid.get<U_src>(Cp) - 1.0*Uext;
+		grid.insert<V_dst>(Cp) = grid.get<V_src>(Cp) - 1.0*Vext;
+
+		// Next point in the grid
+		++it;
+	}
+
+	std::cout << "UEX max: " << max << std::endl;
+}
+
+int main(int argc, char* argv[])
+{
+	openfpm_init(&argc,&argv);
+
+	// domain
+	Box<3,double> domain({0.0,0.0,0.0},{2.5,2.5,2.5});
+	
+	// grid size
+    size_t sz[3] = {512,512,512};
+
+	// Define periodicity of the grid
+	periodicity<3> bc = {NON_PERIODIC,NON_PERIODIC,NON_PERIODIC};
+	
+	// Ghost in grid unit
+	Ghost<3,long int> g(1);
+	
+	// deltaT
+	double deltaT = 0.3;
+
+	// Diffusion constant for specie U
+	double du = 1*1e-5;
+
+	// Diffusion constant for specie V
+	double dv = 0.5*1e-5;
+
+//#ifdef TEST_RUN
+        // Number of timesteps
+//        size_t timeSteps = 200;
+//#else
+	// Number of timesteps
+        size_t timeSteps = 150000;
+//#endif
+
+	// K and F (Physical constant in the equation)
+        double K = 0.053;
+        double F = 0.014;
+
+	sgrid_type grid(sz,domain,g,bc);
+
+	
+	// spacing of the grid on x and y
+	double spacing[3] = {grid.spacing(0),grid.spacing(1),grid.spacing(2)};
+
+	init(grid,domain);
+
+	// sync the ghost
+	size_t count = 0;
+	grid.template ghost_get<U,V>();
+
+	// because we assume that spacing[x] == spacing[y] we use formula 2
+	// and we calculate the prefactor of Eq 2
+	double uFactor = deltaT * du;
+	double vFactor = deltaT * dv;
+
+	auto & v_cl = create_vcluster();
+
+	timer tot_sim;
+	tot_sim.start();
+
+	for (size_t i = 0; i < timeSteps ; ++i)
+	{
+		{
+		auto it = grid.getDomainIterator();
+
+		while (it.isNext())
+		{
+			// center point
+			auto Cp = it.get();
+
+			// plus,minus X,Y,Z
+			auto mx = Cp.move(0,-1);
+			auto px = Cp.move(0,+1);
+			auto my = Cp.move(1,-1);
+			auto py = Cp.move(1,1);
+			auto mz = Cp.move(2,-1);
+			auto pz = Cp.move(2,1);
+
+			grid.insert<tgrad_u>(Cp)[0] = 0.0;
+			grid.insert<tgrad_u>(Cp)[1] = 0.0;
+			grid.insert<tgrad_u>(Cp)[2] = 0.0;
+			grid.insert<tgrad_v>(Cp)[0] = 0.0;
+			grid.insert<tgrad_v>(Cp)[1] = 0.0;
+			grid.insert<tgrad_v>(Cp)[2] = 0.0;
+
+			//! \cond [boundary condition] \endcond
+
+			if (grid.existPoint(mz) == true && grid.existPoint(pz) == true &&
+				grid.existPoint(my) == true && grid.existPoint(py) == true &&
+				grid.existPoint(mx) == true && grid.existPoint(px) == true )
+			{
+				Point<3,double> gradU;
+				gradU[x] = (grid.get<U>(Cp) - grid.get<U>(mx)) / grid.spacing(0);
+				gradU[y] = (grid.get<U>(Cp) - grid.get<U>(my)) / grid.spacing(1);
+				gradU[z] = (grid.get<U>(Cp) - grid.get<U>(mz)) / grid.spacing(2);
+
+				Point<3,double> gradV;
+				gradV[x] = (grid.get<V>(Cp) - grid.get<V>(mx)) / grid.spacing(0);
+				gradV[y] = (grid.get<V>(Cp) - grid.get<V>(my)) / grid.spacing(1);
+				gradV[z] = (grid.get<V>(Cp) - grid.get<V>(mz)) / grid.spacing(2);
+
+				Point<3,double> PgradU;
+				Point<3,double> PgradV;
+
+				PgradU.zero();
+				PgradV.zero();
+
+				for (int i = 0 ; i < 3 ; i++)
+				{
+					for (int j = 0 ; j < 3 ; j++)
+					{
+						grid.insert<tgrad_u>(Cp)[i] += (((i == j)?1.0:0.0) - grid.get<normal>(Cp)[i]*grid.get<normal>(Cp)[j])*gradU[j];
+						grid.insert<tgrad_v>(Cp)[i] += (((i == j)?1.0:0.0) - grid.get<normal>(Cp)[i]*grid.get<normal>(Cp)[j])*gradV[j];
+					}
+				}
+			}
+			++it;
+		}
+		}
+
+//		Old.write_frame("Init_condition",i);
+
+		{
+		auto it = grid.getDomainIterator();
+
+		while (it.isNext())
+		{
+			// center point
+			auto Cp = it.get();
+
+			// plus,minus X,Y,Z
+			auto mx = Cp.move(0,-1);
+			auto px = Cp.move(0,+1);
+			auto my = Cp.move(1,-1);
+			auto py = Cp.move(1,1);
+			auto mz = Cp.move(2,-1);
+			auto pz = Cp.move(2,1);
+
+			//! \cond [boundary condition] \endcond
+
+			// Mirror z
+
+			if (grid.existPoint(mz) == true && grid.existPoint(pz) == true &&
+				grid.existPoint(my) == true && grid.existPoint(py) == true &&
+				grid.existPoint(mx) == true && grid.existPoint(px) == true )
+			{
+				double lapU = 0;
+				double lapV = 0;
+
+				//Div
+				lapU += (grid.get<tgrad_u>(px)[0] - grid.get<tgrad_u>(Cp)[0]) / grid.spacing(0);
+				lapV += (grid.get<tgrad_v>(px)[0] - grid.get<tgrad_v>(Cp)[0]) / grid.spacing(0);
+				lapU += (grid.get<tgrad_u>(py)[1] - grid.get<tgrad_u>(Cp)[1]) / grid.spacing(1);
+				lapV += (grid.get<tgrad_v>(py)[1] - grid.get<tgrad_v>(Cp)[1]) / grid.spacing(1);
+				lapU += (grid.get<tgrad_u>(pz)[2] - grid.get<tgrad_u>(Cp)[2]) / grid.spacing(2);
+				lapV += (grid.get<tgrad_v>(pz)[2] - grid.get<tgrad_v>(Cp)[2]) / grid.spacing(2);
+
+				// update based on Eq 2
+				grid.insert<U_next>(Cp) = grid.get<U>(Cp) + uFactor * lapU +
+											- deltaT * grid.get<U>(Cp) * grid.get<V>(Cp) * grid.get<V>(Cp) +
+											- deltaT * F * (grid.get<U>(Cp) - 1.0);
+
+
+				// update based on Eq 2
+				grid.insert<V_next>(Cp) = grid.get<V>(Cp) + vFactor * lapV +
+											deltaT * grid.get<U>(Cp) * grid.get<V>(Cp) * grid.get<V>(Cp) +
+											- deltaT * (F+K) * grid.get<V>(Cp);
+			}
+
+			// Next point in the grid
+			++it;
+		}
+		}
+
+//		New.write_frame("update",i);
+
+		// Extend
+
+		if (i % 5 == 0)
+		{
+		for (int j = 0 ; j < 2 ; j++)
+		{
+			if (j % 2 == 0)
+			{extend<U_next,V_next,U,V>(grid);}
+			else
+			{extend<U,V,U_next,V_next>(grid);}
+
+			// Here we copy New into the old grid in preparation of the new step
+			// It would be better to alternate, but using this we can show the usage
+			// of the function copy. To note that copy work only on two grid of the same
+			// decomposition. If you want to copy also the decomposition, or force to be
+			// exactly the same, use Old = New
+			//New.copy_sparse(Old);
+		}
+		}
+
+/*		auto it = grid.getDomainIterator();
+
+		while (it.isNext())
+		{
+			// center point
+			auto Cp = it.get();
+
+			// update based on Eq 2
+			grid.insert<U>(Cp) = grid.get<U_next>(Cp);
+			grid.insert<V>(Cp) = grid.get<V_next>(Cp);
+
+			++it;
+		}*/
+
+		//! \cond [stencil get and use] \endcond
+
+		// After copy we synchronize again the ghost part U and V
+		grid.ghost_get<U,V>();
+
+		// Every 500 time step we output the configuration for
+		// visualization
+		if (i % 500 == 0)
+		{
+			grid.save("output_" + std::to_string(count));
+			count++;
+		}
+
+		if (v_cl.rank() == 0)
+		{std::cout << "STEP: " << i  << "   " << std::endl;}
+		if (i % 100 == 0)
+		{
+			grid.write_frame("out",i);
+		}
+	}
+
+	tot_sim.stop();
+	std::cout << "Total simulation: " << tot_sim.getwct() << std::endl;
+
+	//! \cond [time stepping] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse Gray Scott in 3D
+	 *
+	 * ## Finalize ##
+	 *
+	 * Deinitialize the library
+	 *
+	 * \snippet SparseGrid/4_gray_scott_3d_sparse_surface_cs/main.cpp finalize
+	 *
+	 */
+
+	//! \cond [finalize] \endcond
+
+	openfpm_finalize();
+
+	//! \cond [finalize] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse Gray Scott in 3D
+	 *
+	 * # Full code # {#code}
+	 *
+	 * \include SparseGrid/4_gray_scott_3d_sparse_surface_cs/main.cpp
+	 *
+	 */
+}
diff --git a/example/SparseGrid/5_gray_scott_3d_surface_cs_opt/Makefile b/example/SparseGrid/5_gray_scott_3d_surface_cs_opt/Makefile
new file mode 100644
index 000000000..d6018ae5d
--- /dev/null
+++ b/example/SparseGrid/5_gray_scott_3d_surface_cs_opt/Makefile
@@ -0,0 +1,26 @@
+include ../../example.mk
+
+CC=mpic++
+
+LDIR =
+
+OBJ = main.o
+gray_scott_sparse_surface_cs_test: OPT += -DTEST_RUN
+gray_scott_sparse_surface_cs_test: gray_scott_sparse_surface_cs
+
+%.o: %.cpp
+	$(CC) -O3 -g $(OPT) -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
+
+gray_scott_sparse_surface_cs: $(OBJ)
+	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
+
+all: gray_scott_sparse_surface_cs
+
+run: gray_scott_sparse_surface_cs_test
+	mpirun -np 4 ./gray_scott_sparse_surface_cs
+
+.PHONY: clean all run
+
+clean:
+	rm -f *.o *~ core gray_scott_sparse_surface_cs
+
diff --git a/example/SparseGrid/5_gray_scott_3d_surface_cs_opt/config.cfg b/example/SparseGrid/5_gray_scott_3d_surface_cs_opt/config.cfg
new file mode 100644
index 000000000..1eecbac35
--- /dev/null
+++ b/example/SparseGrid/5_gray_scott_3d_surface_cs_opt/config.cfg
@@ -0,0 +1,2 @@
+[pack]
+files = main.cpp Makefile
diff --git a/example/SparseGrid/5_gray_scott_3d_surface_cs_opt/main.cpp b/example/SparseGrid/5_gray_scott_3d_surface_cs_opt/main.cpp
new file mode 100644
index 000000000..8dba1fcdf
--- /dev/null
+++ b/example/SparseGrid/5_gray_scott_3d_surface_cs_opt/main.cpp
@@ -0,0 +1,472 @@
+#include "Grid/grid_dist_id.hpp"
+#include "data_type/aggregate.hpp"
+#include "timer.hpp"
+
+/*!
+ *
+ */
+
+constexpr int U = 0;
+constexpr int V = 1;
+constexpr int phi = 2;
+constexpr int normal = 3;
+constexpr int tgrad_u = 4;
+constexpr int tgrad_v = 5;
+constexpr int U_next = 6;
+constexpr int V_next = 7;
+
+constexpr int x = 0;
+constexpr int y = 1;
+constexpr int z = 2;
+
+typedef sgrid_dist_soa<3,double,aggregate<double,double,double,double[3],double[3],double[3],double,double> > sgrid_type;
+
+void init(sgrid_type & grid, Box<3,double> & domain)
+{
+	//! \cond [init sphere channel] \endcond
+
+	auto it = grid.getGridIterator();
+	Point<3,double> p1({0.5,0.5,0.5});
+
+	double sx = grid.spacing(0);
+
+	Sphere<3,double> sph1(p1,0.3);
+	Sphere<3,double> sph2(p1,0.3 - sx*10);
+	Sphere<3,double> sph_zero(p1,0.3 - sx*5);
+
+	while (it.isNext())
+	{
+		// Get the local grid key
+		auto key = it.get_dist();
+		auto keyg = it.get();
+
+		Point<3,double> pc;
+		Point<3,double> vp;
+
+		for (int i = 0 ; i < 3 ; i++)
+        {pc.get(i) = keyg.get(i) * it.getSpacing(i);}
+
+		// Check if the point is in the first sphere
+		if (sph1.isInside(pc) == true && sph2.isInside(pc) == false)
+		{
+			Point<3,double> pn = pc - p1;
+			pn /= pn.norm();
+			double theta = acos(pn * Point<3,double>({0.0,0.0,1.0}));
+			Point<3,double> pn_ = pn;
+			pn_[2] = 0.0;
+			pn_ /= pn_.norm();
+			double aphi = acos(pn_ * Point<3,double>({1.0,0.0,0.0}));
+
+			// Create a perturbation in the solid angle
+			if (theta > 0.6 && theta < 0.8 && aphi > 0.0 && aphi < 0.2)
+			{
+				grid.template insert<U>(key) = 0.5;
+				grid.template insert<V>(key) = 0.25;
+			}
+			else
+			{
+				grid.template insert<U>(key) = 1.0;
+				grid.template insert<V>(key) = 0.0;
+			}
+			grid.template insert<phi>(key) = sph_zero.distance(pc);
+			grid.template insert<normal>(key)[0] = pn[0];
+			grid.template insert<normal>(key)[1] = pn[1];
+			grid.template insert<normal>(key)[2] = pn[2];
+
+			// Old values U and V
+			grid.template insert<U_next>(key) = 0.0;
+			grid.template insert<V_next>(key) = 0.0;
+		}
+
+		++it;
+	}
+
+	//! \cond [init sphere channel] \endcond
+}
+
+template<unsigned int U_src,unsigned int V_src,unsigned int U_dst, unsigned int V_dst>
+void extend(sgrid_type & grid, size_t (& sz)[3],double (& spacing)[3])
+{
+	double delta = 1e-10;
+	double max = 0.0;
+
+	auto func_extend = [&grid,delta,&spacing](auto & grid, auto & ids,
+	                                 unsigned char * mask_sum)
+									 {
+										Vc::double_v phi_c;
+										Vc::double_v s;
+
+										Vc::double_v Uext = 0.0;
+										Vc::double_v Vext = 0.0;
+
+										Vc::double_v n[3];
+										Vc::double_v dir;
+
+										Vc::double_v Uc;
+										Vc::double_v Vc;
+										Vc::double_v Uc_xm;
+										Vc::double_v Vc_xm;
+										Vc::double_v Uc_ym;
+										Vc::double_v Vc_ym;
+										Vc::double_v Uc_zm;
+										Vc::double_v Vc_zm;
+
+										Vc::double_v Uc_xp;
+										Vc::double_v Vc_xp;
+										Vc::double_v Uc_yp;
+										Vc::double_v Vc_yp;
+										Vc::double_v Uc_zp;
+										Vc::double_v Vc_zp;
+
+										load_crs<x,0,phi>(phi_c,grid,ids);
+										load_crs_v<x,0,x,normal>(n[x],grid,ids);
+										load_crs_v<x,0,y,normal>(n[y],grid,ids);
+										load_crs_v<x,0,z,normal>(n[z],grid,ids);
+
+										load_crs<x,0,U_src>(Uc,grid,ids);
+										load_crs<x,0,V_src>(Vc,grid,ids);
+										load_crs<x,-1,U_src>(Uc_xm,grid,ids);
+										load_crs<x,-1,V_src>(Vc_xm,grid,ids);
+										load_crs<y,-1,U_src>(Uc_ym,grid,ids);
+										load_crs<y,-1,V_src>(Vc_ym,grid,ids);
+										load_crs<z,-1,U_src>(Uc_zm,grid,ids);
+										load_crs<z,-1,V_src>(Vc_zm,grid,ids);
+										load_crs<x,1,U_src>(Uc_xp,grid,ids);
+										load_crs<x,1,V_src>(Vc_xp,grid,ids);
+										load_crs<y,1,U_src>(Uc_yp,grid,ids);
+										load_crs<y,1,V_src>(Vc_yp,grid,ids);
+										load_crs<z,1,U_src>(Uc_zp,grid,ids);
+										load_crs<z,1,V_src>(Vc_zp,grid,ids);
+
+										s = phi_c / sqrt(phi_c*phi_c + delta*delta);
+
+										dir = s*n[0];
+										auto dir_pos = dir > 0;
+										auto dir_neg = dir < 0;
+
+										Uext += Vc::iif(dir_pos,dir * (Uc - Uc_xm)/spacing[0],Vc::double_v(0.0));
+										Vext += Vc::iif(dir_pos,dir * (Vc - Vc_xm)/spacing[0],Vc::double_v(0.0));
+										Uext += Vc::iif(dir_neg,dir * (Uc_xp - Uc)/spacing[0],Vc::double_v(0.0));
+										Vext += Vc::iif(dir_neg,dir * (Vc_xp - Vc)/spacing[0],Vc::double_v(0.0));
+
+										dir = s*n[1];
+										dir_pos = dir > 0;
+										dir_neg = dir < 0;
+
+										Uext += Vc::iif(dir_pos,dir * (Uc - Uc_ym)/spacing[1],Vc::double_v(0.0));
+										Vext += Vc::iif(dir_pos,dir * (Vc - Vc_ym)/spacing[1],Vc::double_v(0.0));
+										Uext += Vc::iif(dir_neg,dir * (Uc_yp - Uc)/spacing[1],Vc::double_v(0.0));
+										Vext += Vc::iif(dir_neg,dir * (Vc_yp - Vc)/spacing[1],Vc::double_v(0.0));
+
+										dir = s*n[2];
+										dir_pos = dir > 0;
+										dir_neg = dir < 0;
+
+										Uext += Vc::iif(dir_pos,dir * (Uc - Uc_zm)/spacing[2],Vc::double_v(0.0));
+										Vext += Vc::iif(dir_pos,dir * (Vc - Vc_zm)/spacing[2],Vc::double_v(0.0));
+										Uext += Vc::iif(dir_neg,dir * (Uc_zp - Uc)/spacing[2],Vc::double_v(0.0));
+										Vext += Vc::iif(dir_neg,dir * (Vc_zp - Vc)/spacing[2],Vc::double_v(0.0));
+
+										Uext = Uc - 0.0003*Uext;
+										Vext = Vc - 0.0003*Vext;
+
+										store_crs<U_dst>(grid,Uext,ids);
+										store_crs<V_dst>(grid,Vext,ids);
+									 };
+
+		grid.template conv_cross_ids<1,double>({0,0,0},{sz[0] - 1, sz[1] - 1, sz[2] - 1},func_extend);
+}
+
+int main(int argc, char* argv[])
+{
+	openfpm_init(&argc,&argv);
+
+	// domain
+	Box<3,double> domain({0.0,0.0,0.0},{2.5,2.5,2.5});
+	
+	// grid size
+    size_t sz[3] = {512,512,512};
+
+	// Define periodicity of the grid
+	periodicity<3> bc = {NON_PERIODIC,NON_PERIODIC,NON_PERIODIC};
+	
+	// Ghost in grid unit
+	Ghost<3,long int> g(1);
+	
+	// deltaT
+	double deltaT = 0.3;
+
+	// Diffusion constant for specie U
+	double du = 1*1e-5;
+
+	// Diffusion constant for specie V
+	double dv = 0.5*1e-5;
+
+//#ifdef TEST_RUN
+        // Number of timesteps
+//        size_t timeSteps = 200;
+//#else
+	// Number of timesteps
+        size_t timeSteps = 100000;
+//#endif
+
+	// K and F (Physical constant in the equation)
+        double K = 0.053;
+        double F = 0.014;
+
+	sgrid_type grid(sz,domain,g,bc);
+
+	
+	// spacing of the grid on x and y
+	double spacing[3] = {grid.spacing(0),grid.spacing(1),grid.spacing(2)};
+
+	init(grid,domain);
+
+	// sync the ghost
+	size_t count = 0;
+	grid.template ghost_get<U,V>();
+
+	// because we assume that spacing[x] == spacing[y] we use formula 2
+	// and we calculate the prefactor of Eq 2
+	double uFactor = deltaT * du;
+	double vFactor = deltaT * dv;
+
+	auto & v_cl = create_vcluster();
+
+	timer tot_sim;
+	tot_sim.start();
+
+	for (size_t i = 0; i < timeSteps ; ++i)
+	{
+		auto func_grad = [&grid,&spacing](auto & grid, auto & ids,
+                                 unsigned char * mask_sum){
+
+												Vc::double_v n[3];
+
+												Vc::double_v Uc;
+												Vc::double_v xmU;
+												Vc::double_v ymU;
+												Vc::double_v zmU;
+
+												Vc::double_v Vc;
+												Vc::double_v xmV;
+												Vc::double_v ymV;
+												Vc::double_v zmV;
+
+												Vc::double_v u_out[3];
+												Vc::double_v v_out[3];
+
+												load_crs<x,-1,U>(xmU,grid,ids);
+												load_crs<y,-1,U>(ymU,grid,ids);
+												load_crs<z,-1,U>(zmU,grid,ids);
+												load_crs<x,0,U>(Uc,grid,ids);
+
+												load_crs<x,-1,V>(xmV,grid,ids);
+												load_crs<y,-1,V>(ymV,grid,ids);
+												load_crs<z,-1,V>(zmV,grid,ids);
+												load_crs<x,0,V>(Vc,grid,ids);
+
+												load_crs_v<x,0,x,normal>(n[x],grid,ids);
+												load_crs_v<x,0,y,normal>(n[y],grid,ids);
+												load_crs_v<x,0,z,normal>(n[z],grid,ids);
+
+												u_out[0] = (1.0-n[0]*n[0])*(Uc - xmU)/spacing[0]  + (-n[1]*n[1])*(Uc - ymU)/spacing[1]    + (-n[2]*n[2])*(Uc - zmU)/spacing[2];
+												u_out[1] = (-n[0]*n[0])*(Uc - xmU)/spacing[0]    + (1.0-n[1]*n[1])*(Uc - ymU)/spacing[1] + (-n[2]*n[2])*(Uc - zmU)/spacing[2];
+												u_out[2] = (-n[0]*n[0])*(Uc - xmU)/spacing[0]    + (-n[1]*n[1])*(Uc - ymU)/spacing[1]    + (1.0-n[2]*n[2])*(Uc - zmU)/spacing[2];
+
+												v_out[0] = (1.0-n[0]*n[0])*(Vc - xmV)/spacing[0] + (-n[1]*n[1])*(Vc - ymV)/spacing[1]    + (-n[2]*n[2])*(Vc - zmV)/spacing[2];
+												v_out[1] = (-n[0]*n[0])*(Vc - xmV)/spacing[0]    + (1.0-n[1]*n[1])*(Vc - ymV)/spacing[1] + (-n[2]*n[2])*(Vc - zmV)/spacing[2];
+												v_out[2] = (-n[0]*n[0])*(Vc - xmV)/spacing[0]    + (-n[1]*n[1])*(Vc - ymV)/spacing[1]    + (1.0-n[2]*n[2])*(Vc - zmV)/spacing[2];
+
+												Vc::Mask<double> surround;
+
+												for (int i = 0 ; i < Vc::double_v::Size ; i++)
+												{surround[i] = (mask_sum[i] == 6);}
+
+												u_out[0] = Vc::iif(surround,u_out[0],Vc::double_v(0.0));
+												u_out[1] = Vc::iif(surround,u_out[1],Vc::double_v(0.0));
+												u_out[2] = Vc::iif(surround,u_out[2],Vc::double_v(0.0));
+
+												v_out[0] = Vc::iif(surround,v_out[0],Vc::double_v(0.0));
+												v_out[1] = Vc::iif(surround,v_out[1],Vc::double_v(0.0));
+												v_out[2] = Vc::iif(surround,v_out[2],Vc::double_v(0.0));
+
+												store_crs_v<tgrad_u,x>(grid,u_out[0],ids);
+												store_crs_v<tgrad_u,y>(grid,u_out[1],ids);
+												store_crs_v<tgrad_u,z>(grid,u_out[2],ids);
+
+												store_crs_v<tgrad_v,x>(grid,v_out[0],ids);
+												store_crs_v<tgrad_v,y>(grid,v_out[1],ids);
+												store_crs_v<tgrad_v,z>(grid,v_out[2],ids);
+											};
+
+		grid.template conv_cross_ids<1,double>({0,0,0},{sz[0]-1,sz[1] - 1,sz[2] - 1},func_grad);
+
+		auto func_lap = [&grid,&spacing,uFactor,vFactor,deltaT,K,F](auto & grid, auto & ids,
+                                 unsigned char * mask_sum){
+
+												Vc::double_v gradU_px;
+												Vc::double_v gradU_py;
+												Vc::double_v gradU_pz;
+
+												Vc::double_v gradU_x;
+												Vc::double_v gradU_y;
+												Vc::double_v gradU_z;
+
+												Vc::double_v gradV_px;
+												Vc::double_v gradV_py;
+												Vc::double_v gradV_pz;
+
+												Vc::double_v gradV_x;
+												Vc::double_v gradV_y;
+												Vc::double_v gradV_z;
+
+												Vc::double_v lapU = 0.0; // must be zero-initialized: Vc vectors are uninitialized by default and lapU/lapV are accumulated with += below
+												Vc::double_v lapV = 0.0;
+
+												Vc::double_v Uc;
+												Vc::double_v Vc;
+
+												Vc::double_v outU;
+												Vc::double_v outV;
+
+												load_crs_v<x,1,x,tgrad_u>(gradU_px,grid,ids);
+												load_crs_v<y,1,y,tgrad_u>(gradU_py,grid,ids);
+												load_crs_v<z,1,z,tgrad_u>(gradU_pz,grid,ids);
+
+												load_crs_v<x,0,x,tgrad_u>(gradU_x,grid,ids);
+												load_crs_v<x,0,y,tgrad_u>(gradU_y,grid,ids);
+												load_crs_v<x,0,z,tgrad_u>(gradU_z,grid,ids);
+
+												load_crs_v<x,1,x,tgrad_v>(gradV_px,grid,ids);
+												load_crs_v<y,1,y,tgrad_v>(gradV_py,grid,ids);
+												load_crs_v<z,1,z,tgrad_v>(gradV_pz,grid,ids);
+
+												load_crs_v<x,0,x,tgrad_v>(gradV_x,grid,ids);
+												load_crs_v<x,0,y,tgrad_v>(gradV_y,grid,ids);
+												load_crs_v<x,0,z,tgrad_v>(gradV_z,grid,ids);
+
+												load_crs<x,0,U>(Uc,grid,ids);
+												load_crs<x,0,V>(Vc,grid,ids);
+
+												lapU += (gradU_px - gradU_x) / spacing[0];
+												lapV += (gradV_px - gradV_x) / spacing[0];
+												lapU += (gradU_py - gradU_y) / spacing[1];
+												lapV += (gradV_py - gradV_y) / spacing[1];
+												lapU += (gradU_pz - gradU_z) / spacing[2];
+												lapV += (gradV_pz - gradV_z) / spacing[2];
+
+												// update based on Eq 2
+												outU = Uc + uFactor * lapU +
+																			- deltaT * Uc * Vc * Vc +
+																			- deltaT * F * (Uc - 1.0);
+
+
+												// update based on Eq 2
+												outV = Vc + vFactor * lapV +
+																			deltaT * Uc * Vc * Vc +
+																			- deltaT * (F+K) * Vc;
+
+												Vc::Mask<double> surround;
+
+												for (int i = 0 ; i < Vc::double_v::Size ; i++)
+												{surround[i] = (mask_sum[i] == 6);}
+
+
+												outU = Vc::iif(surround,outU,Uc);
+												outV = Vc::iif(surround,outV,Vc);
+
+												store_crs<U_next>(grid,outU,ids);
+												store_crs<V_next>(grid,outV,ids);
+											};
+
+		grid.template conv_cross_ids<1,double>({0,0,0},{sz[0]-1,sz[1] - 1,sz[2] - 1},func_lap);
+
+//		New.write_frame("update",i);
+
+		// Extend
+
+		if (i % 5 == 0)
+		{
+		for (int j = 0 ; j < 2 ; j++)
+		{
+			if (j % 2 == 0)
+			{extend<U_next,V_next,U,V>(grid,sz,spacing);}
+			else
+			{extend<U,V,U_next,V_next>(grid,sz,spacing);}
+
+			// Here we copy New into the old grid in preparation of the new step
+			// It would be better to alternate, but using this we can show the usage
+			// of the function copy. To note that copy work only on two grid of the same
+			// decomposition. If you want to copy also the decomposition, or force to be
+			// exactly the same, use Old = New
+			//New.copy_sparse(Old);
+		}
+		}
+
+/*		auto it = grid.getDomainIterator();
+
+		while (it.isNext())
+		{
+			// center point
+			auto Cp = it.get();
+
+			// update based on Eq 2
+			grid.insert<U>(Cp) = grid.get<U_next>(Cp);
+			grid.insert<V>(Cp) = grid.get<V_next>(Cp);
+
+			++it;
+		}*/
+
+		//! \cond [stencil get and use] \endcond
+
+		// After copy we synchronize again the ghost part U and V
+		grid.ghost_get<U,V>();
+
+		// Every 500 time step we output the configuration for
+		// visualization
+		if (i % 500 == 0)
+		{
+//			grid.save("output_" + std::to_string(count));
+			count++;
+		}
+
+		if (v_cl.rank() == 0)
+		{std::cout << "STEP: " << i  << "   " << std::endl;}
+		if (i % 1000 == 0)
+		{
+			grid.write_frame("out",i);
+		}
+	}
+
+	tot_sim.stop();
+	std::cout << "Total simulation: " << tot_sim.getwct() << std::endl;
+
+	//! \cond [time stepping] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse Gray Scott in 3D
+	 *
+	 * ## Finalize ##
+	 *
+	 * Deinitialize the library
+	 *
+	 * \snippet SparseGrid/5_gray_scott_3d_surface_cs_opt/main.cpp finalize
+	 *
+	 */
+
+	//! \cond [finalize] \endcond
+
+	openfpm_finalize();
+
+	//! \cond [finalize] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse Gray Scott in 3D
+	 *
+	 * # Full code # {#code}
+	 *
+	 * \include SparseGrid/5_gray_scott_3d_surface_cs_opt/main.cpp
+	 *
+	 */
+}
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_gpu_opt_weak_scal/Makefile b/example/SparseGrid/6_gray_scott_3d_sparse_gpu_opt_weak_scal/Makefile
similarity index 100%
rename from example/SparseGrid/1_gray_scott_3d_sparse_gpu_opt_weak_scal/Makefile
rename to example/SparseGrid/6_gray_scott_3d_sparse_gpu_opt_weak_scal/Makefile
diff --git a/example/SparseGrid/6_gray_scott_3d_sparse_gpu_opt_weak_scal/config.cfg b/example/SparseGrid/6_gray_scott_3d_sparse_gpu_opt_weak_scal/config.cfg
new file mode 100644
index 000000000..699be429e
--- /dev/null
+++ b/example/SparseGrid/6_gray_scott_3d_sparse_gpu_opt_weak_scal/config.cfg
@@ -0,0 +1,2 @@
+[pack]
+files = main.cu Makefile
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_gpu_opt_weak_scal/main.cu b/example/SparseGrid/6_gray_scott_3d_sparse_gpu_opt_weak_scal/main.cu
similarity index 100%
rename from example/SparseGrid/1_gray_scott_3d_sparse_gpu_opt_weak_scal/main.cu
rename to example/SparseGrid/6_gray_scott_3d_sparse_gpu_opt_weak_scal/main.cu
diff --git a/example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/Makefile b/example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/Makefile
new file mode 100644
index 000000000..50373464f
--- /dev/null
+++ b/example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/Makefile
@@ -0,0 +1,39 @@
+include ../../example.mk
+
+### internally the example disable with the preprocessor its code if not compiled with nvcc 
+CUDA_CC=
+CUDA_CC_LINK=
+ifeq (, $(shell which nvcc))
+        CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+        INCLUDE_PATH_NVCC=
+        CUDA_CC_LINK=mpic++
+else
+        CUDA_CC=nvcc -ccbin=mpic++
+        CUDA_CC_LINK=nvcc -ccbin=mpic++
+endif
+
+gray_scott_sparse_gpu_test: OPT += -DTEST_RUN
+gray_scott_sparse_gpu_test: gray_scott_sparse_gpu
+
+CC=mpic++
+
+LDIR =
+
+OBJ = main.o
+
+%.o: %.cu
+	$(CUDA_CC) $(OPT) -O3 -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)
+
+gray_scott_sparse_gpu: $(OBJ)
+	$(CUDA_CC_LINK) -o $@ $^ $(LIBS_PATH) $(LIBS)
+
+all: gray_scott_sparse_gpu
+
+run: gray_scott_sparse_gpu_test
+	mpirun -np 4 ./gray_scott_sparse_gpu
+
+.PHONY: clean all run
+
+clean:
+	rm -f *.o *~ core gray_scott_sparse_gpu
+
diff --git a/example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/config.cfg b/example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/config.cfg
new file mode 100644
index 000000000..699be429e
--- /dev/null
+++ b/example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/config.cfg
@@ -0,0 +1,2 @@
+[pack]
+files = main.cu Makefile
diff --git a/example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/main.cu b/example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/main.cu
new file mode 100644
index 000000000..04904f393
--- /dev/null
+++ b/example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/main.cu
@@ -0,0 +1,284 @@
+#define SYNC_BEFORE_TAKE_TIME
+#include "Decomposition/Distribution/BoxDistribution.hpp"
+#include "Grid/grid_dist_id.hpp"
+#include "data_type/aggregate.hpp"
+#include "timer.hpp"
+
+/*!
+ *
+ * \page Grid_3_gs_3D_sparse_gpu_cs Gray Scott in 3D using sparse grids on gpu in complex geometry
+ *
+ * [TOC]
+ *
+ * # Solving a Gray-Scott system in 3D using sparse grids # {#e3_gs_gray_scott}
+ *
+ * This example shows how to solve a Gray-Scott system in 3D using sparse grids on GPU with complex geometry
+ *
+ * In figure is the final solution of the problem
+ *
+ * \htmlonly
+<table border="1" bgcolor="black">
+  <tr>
+    <td>
+      <img src="http://ppmcore.mpi-cbg.de/web/images/examples/1_gray_scott_3d_sparse_cs/gs_3d_sparse_cs_section.png" style="width: 500px;" />
+    </td>
+    <td>
+      <img src="http://ppmcore.mpi-cbg.de/web/images/examples/1_gray_scott_3d_sparse_cs/gs_3d_sparse_cs.png" style="width: 500px;" />
+    </td>
+  </tr>
+</table>
+\endhtmlonly
+ *
+ * More or less this example is the same of \ref e3_gs_gray_scott_cs on gpu using what we learned in \ref e3_gs_gray_scott_gpu
+ *
+ *
+ */
+
+#ifdef __NVCC__
+
+constexpr int U = 0;
+constexpr int V = 1;
+constexpr int U_next = 2;
+constexpr int V_next = 3;
+
+typedef CartDecomposition<3,double, CudaMemory, memory_traits_inte, BoxDistribution<3,double> > Dec;
+
+typedef sgrid_dist_id_gpu<3,double,aggregate<double,double,double,double>, CudaMemory,Dec > sgrid_type;
+
+void draw_oscillation_shock(sgrid_type & grid, Box<3,double> & domain)
+{
+	auto it = grid.getGridIterator();
+	Point<3,double> p({1.25,1.25,1.25});
+
+	
+//	Point<3,double> u({1.0,0.0,0.0});
+//	Box<3,double> channel_box(p3,p1);
+
+	double spacing_x = grid.spacing(0);
+	double spacing_y = grid.spacing(1);
+	double spacing_z = grid.spacing(2);
+
+	typedef typename GetAddBlockType<sgrid_type>::type InsertBlockT;
+
+	// Draw a shock expanding from 0.4 to 0.8 and then contracting from 0.8 to 0.4
+	for (int i = 0 ; i < 100 ; i++)
+	{
+		Sphere<3,double> sph(p,0.2 + (double)i/160.0);
+		Sphere<3,double> sph2(p,0.4 + (double)i/160.0);
+
+		Box<3,size_t> bx;
+
+		for (int j = 0 ; j < 3 ; j++)
+		{
+			bx.setLow(j,(size_t)((sph.center(j) - 0.4 - (double)i/160.0)/grid.spacing(j)));
+			bx.setHigh(j,(size_t)((sph.center(j) + 0.4 + (double)i/160.0)/grid.spacing(j)));
+		}
+
+		timer t_add;
+		t_add.start();
+
+		grid.addPoints(bx.getKP1(),bx.getKP2(),[spacing_x,spacing_y,spacing_z,sph,sph2] __device__ (int i, int j, int k)
+                                {
+                                                Point<3,double> pc({i*spacing_x,j*spacing_y,k*spacing_z});
+
+						// Insert only points inside sph2 but outside sph (a spherical shell)
+                                		if (sph2.isInside(pc) )
+                                		{
+							if (sph.isInside(pc) == false)
+							{return true;}
+						}
+
+                                                return false;
+                                },
+                                [] __device__ (InsertBlockT & data, int i, int j, int k)
+                                {
+                                        data.template get<U>() = 1.0;
+                                        data.template get<V>() = 0.0;
+                                }
+                                );
+
+		t_add.stop();
+
+		timer t_flush;
+                t_flush.start();
+                grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE);
+                t_flush.stop();
+
+                timer t_ghost;
+                t_ghost.start();
+                grid.template ghost_get<U,V>(RUN_ON_DEVICE);
+                t_ghost.stop();
+                timer t_ghost2;
+                t_ghost2.start();
+                grid.template ghost_get<U,V>(RUN_ON_DEVICE | SKIP_LABELLING);
+                t_ghost2.stop();
+                std::cout << t_ghost.getwct() << std::endl;
+
+                std::cout << "TIME ghost1: " << t_ghost.getwct() << "  ghost2: " << t_ghost2.getwct()  << " flush: " <<  t_flush.getwct() << " " << std::endl;
+
+
+		grid.removeUnusedBuffers();
+
+	}
+
+	std::cout << "Second Pass" <<std::endl;
+
+	for (int i = 0 ; i < 100 ; i++)
+	{
+		Sphere<3,double> sph(p,0.2 + (double)i/160.0);
+		Sphere<3,double> sph2(p,0.4 + (double)i/160.0);
+
+		Box<3,size_t> bx;
+
+		for (int j = 0 ; j < 3 ; j++)
+		{
+			bx.setLow(j,(size_t)((sph.center(j) - 0.4 - (double)i/160.0)/grid.spacing(j)));
+			bx.setHigh(j,(size_t)((sph.center(j) + 0.4 + (double)i/160.0)/grid.spacing(j)));
+		}
+
+		timer t_add;
+		t_add.start();
+
+		grid.addPoints(bx.getKP1(),bx.getKP2(),[spacing_x,spacing_y,spacing_z,sph,sph2] __device__ (int i, int j, int k)
+                                {
+                                                Point<3,double> pc({i*spacing_x,j*spacing_y,k*spacing_z});
+
+						// Insert only points inside sph2 but outside sph (a spherical shell)
+                                		if (sph2.isInside(pc) )
+                                		{
+							if (sph.isInside(pc) == false)
+							{return true;}
+						}
+
+                                                return false;
+                                },
+                                [] __device__ (InsertBlockT & data, int i, int j, int k)
+                                {
+                                        data.template get<U>() = 1.0;
+                                        data.template get<V>() = 0.0;
+                                }
+                                );
+
+		t_add.stop();
+
+
+		timer t_flush;
+                t_flush.start();
+		grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE);
+		t_flush.stop();
+//		grid.removeUnusedBuffers();
+
+
+		timer t_ghost;
+		t_ghost.start();
+		grid.template ghost_get<U,V>(RUN_ON_DEVICE);
+		t_ghost.stop();
+		timer t_ghost2;
+                t_ghost2.start();
+                grid.template ghost_get<U,V>(RUN_ON_DEVICE | SKIP_LABELLING);
+                t_ghost2.stop();
+
+		std::cout << "TIME ghost1: " << t_ghost.getwct() << "  ghost2: " << t_ghost2.getwct()  << " flush: " <<  t_flush.getwct() << " " << std::endl;
+
+//		if (i % 10 == 0)
+//		{
+//			grid.template deviceToHost<U,V>();
+//        		grid.write_frame("Final",i);
+//		}
+	}
+
+}
+
+
+int main(int argc, char* argv[])
+{
+	openfpm_init(&argc,&argv);
+
+	// domain
+	Box<3,double> domain({0.0,0.0,0.0},{2.5,2.5,2.5});
+	
+	// grid size
+        size_t sz[3] = {384,384,384};
+
+	// Define periodicity of the grid
+	periodicity<3> bc = {NON_PERIODIC,NON_PERIODIC,NON_PERIODIC};
+	
+	// Ghost in grid unit
+	Ghost<3,long int> g(1);
+	
+	// deltaT
+	double deltaT = 0.025;
+
+	// Diffusion constant for specie U
+	double du = 2*1e-5;
+
+	// Diffusion constant for specie V
+	double dv = 1*1e-5;
+
+#ifdef TEST_RUN
+        // Number of timesteps
+        size_t timeSteps = 300;
+#else
+	// Number of timesteps
+        size_t timeSteps = 50000;
+#endif
+
+	// K and F (Physical constant in the equation)
+        double K = 0.053;
+        double F = 0.014;
+
+	grid_sm<3,void> gv({3,1,1});
+
+	sgrid_type grid(sz,domain,g,bc,0,gv);
+
+	grid.template setBackgroundValue<0>(-0.5);
+	grid.template setBackgroundValue<1>(-0.5);
+	grid.template setBackgroundValue<2>(-0.5);
+	grid.template setBackgroundValue<3>(-0.5);
+	
+	// spacing of the grid on x and y
+	double spacing[3] = {grid.spacing(0),grid.spacing(1),grid.spacing(2)};
+
+	draw_oscillation_shock(grid,domain);
+
+	grid.template deviceToHost<U,V>();
+	grid.write("Final");
+
+	//! \cond [time stepping] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse_gpu_cs Gray Scott in 3D using sparse grids on gpu in complex geometry
+	 *
+	 * ## Finalize ##
+	 *
+	 * Deinitialize the library
+	 *
+	 * \snippet SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/main.cu finalize
+	 *
+	 */
+
+	//! \cond [finalize] \endcond
+
+	openfpm_finalize();
+
+	//! \cond [finalize] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse_gpu_cs Gray Scott in 3D using sparse grids on gpu in complex geometry
+	 *
+	 * # Full code # {#code}
+	 *
+	 * \include SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/main.cu
+	 *
+	 */
+}
+
+#else
+
+int main(int argc, char* argv[])
+{
+        return 0;
+}
+
+#endif
+
diff --git a/example/SparseGrid/8_filling_benchmark/Makefile b/example/SparseGrid/8_filling_benchmark/Makefile
new file mode 100644
index 000000000..5bcf5675e
--- /dev/null
+++ b/example/SparseGrid/8_filling_benchmark/Makefile
@@ -0,0 +1,26 @@
+include ../../example.mk
+
+CC=mpic++
+
+LDIR =
+
+OBJ = main.o
+gray_scott_sparse_opt_test: OPT += -DTEST_RUN
+gray_scott_sparse_opt_test: gray_scott_sparse_opt
+
+%.o: %.cpp
+	$(CC) -mavx -O3 -g $(OPT) -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+
+gray_scott_sparse_opt: $(OBJ)
+	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
+
+all: gray_scott_sparse_opt
+
+run: gray_scott_sparse_opt_test
+	mpirun -np 4 ./gray_scott_sparse_opt
+
+.PHONY: clean all run
+
+clean:
+	rm -f *.o *~ core gray_scott_sparse_opt
+
diff --git a/example/SparseGrid/8_filling_benchmark/config.cfg b/example/SparseGrid/8_filling_benchmark/config.cfg
new file mode 100644
index 000000000..1eecbac35
--- /dev/null
+++ b/example/SparseGrid/8_filling_benchmark/config.cfg
@@ -0,0 +1,2 @@
+[pack]
+files = main.cpp Makefile
diff --git a/example/SparseGrid/8_filling_benchmark/main.cpp b/example/SparseGrid/8_filling_benchmark/main.cpp
new file mode 100644
index 000000000..ba0e077f0
--- /dev/null
+++ b/example/SparseGrid/8_filling_benchmark/main.cpp
@@ -0,0 +1,226 @@
+
+#include "util/cuda/cuda_launch.hpp"
+#include "Grid/grid_dist_id.hpp"
+#include "data_type/aggregate.hpp"
+#include "timer.hpp"
+
+/*!
+ *
+ * \page Grid_3_gs_3D_sparse_opt Gray Scott in 3D using sparse grids optimized on CPU
+ *
+ * [TOC]
+ *
+ * # Solving a Gray-Scott system in 3D using sparse grids optimized on CPU # {#e3_gs_gray_scott_opt}
+ *
+ * This example shows how to solve a Gray-Scott system in 3D using sparse grids in an optimized way
+ *
+ * In figure is the final solution of the problem
+ *
+ * \htmlonly
+ * <img src="http://ppmcore.mpi-cbg.de/web/images/examples/gray_scott_3d/gs_alpha.png"/>
+ * \endhtmlonly
+ *
+ * More or less this example is the adaptation of the dense example in 3D
+ *
+ * \see \ref Grid_3_gs_3D
+ *
+ * This example is the same as \ref e3_gs_gray_scott_sparse the difference is optimizing for speed.
+ *
+ * Two optimizations have been done. The first is to change the layout to struct of arrays, defining the grid with
+ *
+ * \snippet SparseGrid/8_filling_benchmark/main.cpp grid definition
+ *
+ * The second is using the function **conv_cross2** to calculate the right-hand side.
+ * This function can be used to do a convolution that involves points in a cross stencil, like in the figure, and that involves
+ * two properties
+ *
+\verbatim
+
+     *
+     *
+ * * x * *
+     *
+     *
+
+\endverbatim
+ *
+ * The function accepts a lambda function where the first 2 arguments are the output in form of Vc::double_v. If we use float
+ * we have to use Vc::float_v, or Vc::int_v in case the property is an integer. Vc variables come from the Vc library that is
+ * now integrated in openfpm.
+ *
+ *\htmlonly
+ * <a href="https://github.com/VcDevel/Vc" >Vc Library</a>
+ *\endhtmlonly
+ *
+ * Vc::double_v in general pack 1,2,4 doubles dependently from the fact we choose to activate no-SSE,SSE or AVX at compiler level.
+ * The arguments 3 and 4 contain the values of the two selected properties in the cross pattern given by xm xp ym yp zm zp.
+ * The last argument is instead the mask. The mask can be accessed to check the number of existing points. For example if
+ * we have a cross stencil in 3D with stencil size = 1 then we expect 6 points. Note that the mask is an array because if Vc::double_v
+ * contains 4 doubles then the mask has 4 elements accessed with the array operator []. The call **cross_conv2** also accepts
+ * template parameters: the first two indicate the source properties, the other two are the destination properties, while the
+ * last is the extension of the stencil. In this case we use 1.
+ *
+ * The lambda function is defined as
+ *
+ * \snippet SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp lambda
+ *
+ * and used in the body loop
+ *
+ * \snippet SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp body
+ *
+ * To note that instead of copy we split the properties where we are acting at every iteration
+ *
+ */
+
+constexpr int U = 0;
+constexpr int V = 1;
+
+constexpr int U_next = 2;
+constexpr int V_next = 3;
+
+constexpr int x = 0;
+constexpr int y = 1;
+constexpr int z = 2;
+
+void init(sgrid_dist_soa<3,double,aggregate<double,double,double,double> > & grid, Box<3,double> & domain)
+{
+	for (int i = 0 ; i < 10 ; i++)
+	{
+		timer t;
+		t.start();
+
+		auto it = grid.getGridIterator();
+
+		while (it.isNext())
+		{
+			// Get the local grid key
+			auto key = it.get_dist();
+
+			// Old values U and V
+			grid.template insert<U>(key) = 1.0;
+
+			++it;
+		}
+
+		t.stop();
+		std::cout << "Time populate: " << t.getwct() << std::endl;
+
+		grid.clear();
+
+                timer t2;
+                t2.start();
+
+                auto it2 = grid.getGridIterator();
+
+                while (it2.isNext())
+                {
+                        // Get the local grid key
+                        auto key = it2.get_dist();
+
+                        // Old values U and V
+                        grid.template insert<U>(key) = 5.0;
+         
+                        ++it2;
+                }
+
+		t2.stop();
+                std::cout << "Time populate: " << t2.getwct() << std::endl;
+
+	}
+}
+
+
+int main(int argc, char* argv[])
+{
+	openfpm_init(&argc,&argv);
+
+	// domain
+	Box<3,double> domain({0.0,0.0,0.0},{2.5,2.5,2.5});
+	
+	// grid size
+        size_t sz[3] = {256,256,256};
+
+	// Define periodicity of the grid
+	periodicity<3> bc = {PERIODIC,PERIODIC,PERIODIC};
+	
+	// Ghost in grid unit
+	Ghost<3,long int> g(1);
+	
+	// deltaT
+	double deltaT = 0.25;
+
+	// Diffusion constant for specie U
+	double du = 2*1e-5;
+
+	// Diffusion constant for specie V
+	double dv = 1*1e-5;
+
+	// Number of timesteps
+#ifdef TEST_RUN
+	size_t timeSteps = 200;
+#else
+        size_t timeSteps = 5000;
+#endif
+
+	// K and F (Physical constant in the equation)
+    double K = 0.053;
+    double F = 0.014;
+
+    //! \cond [grid definition] \endcond
+
+	sgrid_dist_soa<3, double, aggregate<double,double,double,double>> grid(sz,domain,g,bc);
+
+    //! \cond [grid definition] \endcond
+
+	// spacing of the grid on x and y
+	double spacing[3] = {grid.spacing(0),grid.spacing(1),grid.spacing(2)};
+
+	init(grid,domain);
+
+	// sync the ghost
+	size_t count = 0;
+	grid.template ghost_get<U,V>();
+
+	// because we assume that spacing[x] == spacing[y] we use formula 2
+	// and we calculate the prefactor of Eq 2
+	double uFactor = deltaT * du/(spacing[x]*spacing[x]);
+	double vFactor = deltaT * dv/(spacing[x]*spacing[x]);
+
+	timer tot_sim;
+	tot_sim.start();
+
+	auto & v_cl = create_vcluster();
+	
+	tot_sim.stop();
+	std::cout << "Total simulation: " << tot_sim.getwct() << std::endl;
+
+	grid.write("final");
+
+	//! \cond [time stepping] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse_opt Gray Scott in 3D using sparse grids optimized on CPU
+	 *
+	 * ## Finalize ##
+	 *
+	 * Deinitialize the library
+	 *
+	 * \snippet SparseGrid/8_filling_benchmark/main.cpp finalize
+	 *
+	 */
+
+	//! \cond [finalize] \endcond
+
+	openfpm_finalize();
+
+	//! \cond [finalize] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse_opt Gray Scott in 3D using sparse grids optimized on CPU
+	 *
+	 * # Full code # {#code}
+	 *
+	 * \include SparseGrid/8_filling_benchmark/main.cpp
+	 *
+	 */
+}
diff --git a/example/SparseGrid/8_filling_benchmark_gpu/Makefile b/example/SparseGrid/8_filling_benchmark_gpu/Makefile
new file mode 100644
index 000000000..450a40d1c
--- /dev/null
+++ b/example/SparseGrid/8_filling_benchmark_gpu/Makefile
@@ -0,0 +1,39 @@
+include ../../example.mk
+
+### internally the example disable with the preprocessor its code if not compiled with nvcc 
+CUDA_CC=
+CUDA_CC_LINK=
+ifeq (, $(shell which nvcc))
+        CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+        INCLUDE_PATH_NVCC=
+        CUDA_CC_LINK=mpic++
+else
+        CUDA_CC=nvcc -ccbin=mpic++
+        CUDA_CC_LINK=nvcc -ccbin=mpic++
+endif
+
+gray_scott_sparse_gpu_test: OPT += -DTEST_RUN
+gray_scott_sparse_gpu_test: gray_scott_sparse_gpu
+
+CC=mpic++
+
+LDIR =
+
+OBJ = main.o
+
+%.o: %.cu
+	$(CUDA_CC) -use_fast_math  -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)
+
+gray_scott_sparse_gpu: $(OBJ)
+	$(CUDA_CC_LINK) -o $@ $^ $(LIBS_PATH) $(LIBS)
+
+all: gray_scott_sparse_gpu
+
+run: gray_scott_sparse_gpu_test
+	mpirun -np 4 ./gray_scott_sparse_gpu
+
+.PHONY: clean all run
+
+clean:
+	rm -f *.o *~ core gray_scott_sparse_gpu
+
diff --git a/example/SparseGrid/8_filling_benchmark_gpu/config.cfg b/example/SparseGrid/8_filling_benchmark_gpu/config.cfg
new file mode 100644
index 000000000..699be429e
--- /dev/null
+++ b/example/SparseGrid/8_filling_benchmark_gpu/config.cfg
@@ -0,0 +1,2 @@
+[pack]
+files = main.cu Makefile
diff --git a/example/SparseGrid/8_filling_benchmark_gpu/main.cu b/example/SparseGrid/8_filling_benchmark_gpu/main.cu
new file mode 100644
index 000000000..524b5790f
--- /dev/null
+++ b/example/SparseGrid/8_filling_benchmark_gpu/main.cu
@@ -0,0 +1,221 @@
+#define VCLUSTER_PERF_REPORT
+#define SYNC_BEFORE_TAKE_TIME
+#define ENABLE_GRID_DIST_ID_PERF_STATS
+#include "Decomposition/Distribution/BoxDistribution.hpp"
+#include "util/cuda/cuda_launch.hpp"
+#include "Grid/grid_dist_id.hpp"
+#include "data_type/aggregate.hpp"
+#include "timer.hpp"
+
+/*!
+ *
+ * \page Grid_3_gs_3D_sparse_gpu Gray Scott in 3D using sparse grids on GPU
+ *
+ * [TOC]
+ *
+ * # Solving a Gray-Scott system in 3D using sparse grids on GPU # {#e3_gs_gray_scott_gpu}
+ *
+ * This example shows how to solve a Gray-Scott system in 3D using sparse grids on GPU
+ *
+ * In figure is the final solution of the problem
+ *
+ * \htmlonly
+ * <img src="http://ppmcore.mpi-cbg.de/web/images/examples/gray_scott_3d/gs_alpha.png"/>
+ * \endhtmlonly
+ *
+ * More or less this example is the adaptation of the dense example in 3D
+ *
+ * \see \ref Grid_3_gs_3D
+ *
+ * # Initialization
+ *
+ * On GPU we can add points using the function addPoints. This function takes 2 lambda functions: the first takes 3 arguments (in 3D)
+ * i,j,k; these are the global coordinates of a point. We can return either true or false. In case of true the point is
+ * created; in case of false the point is not inserted. The second lambda is instead used to initialize the inserted point.
+ * The arguments of the second lambda are the data argument we use to initialize the point and the global coordinates i,j,k
+ *
+ * After we add the points we have to flush them. This is achieved using the function flush; the template parameters
+ * indicate how we have to act on the points. Consider in fact that we may be adding points that already exist ... do we have to combine them using the max
+ * or the min? **FLUSH_ON_DEVICE** says instead that the operation is performed using the GPU
+ *
+ * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu create points
+ *
+ * The function can also be called with a specified range
+ *
+ * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu create points sub
+ *
+ * # Update
+ *
+ * To calculate the right-hand side we use the function **conv2**. This function can be used to do a convolution that involves
+ * two properties
+ *
+ * The function accepts a lambda function where the first 2 arguments are the outputs, of the same type as the two properties chosen.
+ *
+ * The arguments 3 and 4 contain the values of the two selected properties, while i,j,k are the coordinates where we have to calculate the
+ * convolution. The call **conv2** also accepts template parameters: the first two indicate the source properties, the other two are the destination properties, while the
+ * last is the extension of the stencil. In this case we use 1.
+ *
+ * The lambda function is defined as
+ *
+ * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu lambda
+ *
+ * and used in the body loop
+ *
+ * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu body
+ *
+ */
+
+#ifdef __NVCC__
+
+constexpr int U = 0;
+constexpr int V = 1;
+
+constexpr int U_next = 2;
+constexpr int V_next = 3;
+
+constexpr int x = 0;
+constexpr int y = 1;
+constexpr int z = 2;
+
+typedef CartDecomposition<3,float, CudaMemory, memory_traits_inte, BoxDistribution<3,float> > Dec;
+
+typedef sgrid_dist_id_gpu<3,float,aggregate<float>,CudaMemory, Dec> SparseGridType;
+
+void init(SparseGridType & grid, Box<3,float> & domain)
+{
+	//! \cond [create points] \endcond
+
+	typedef typename GetAddBlockType<SparseGridType>::type InsertBlockT;
+
+	for (int i = 0 ; i < 10 ; i++)
+	{
+		timer t;
+		t.start();
+
+		grid.addPoints([] __device__ (int i, int j, int k)
+			        {
+						return true;
+			        },
+			        [] __device__ (InsertBlockT & data, int i, int j, int k)
+			        {
+			        	data.template get<U>() = 1.0;
+			        }
+			        );
+
+
+		grid.template flush<smax_<U>>(flush_type::FLUSH_ON_DEVICE);
+
+		t.stop();
+
+		std::cout << "Time populate: " << t.getwct()  << std::endl;
+
+        	timer t2;
+		cudaDeviceSynchronize();
+        	t2.start();
+
+        	grid.addPoints([] __device__ (int i, int j, int k)
+                                {
+                                                return true;
+                                },
+                                [] __device__ (InsertBlockT & data, int i, int j, int k)
+                                {
+                                        data.template get<U>() = 5.0;
+                                }
+                                );
+
+
+        	grid.template flush<sRight_<U>>(flush_type::FLUSH_ON_DEVICE);
+
+		t2.stop();
+
+		std::cout << "Time populate: " << t2.getwct()  << std::endl;
+	}
+}
+
+
+int main(int argc, char* argv[])
+{
+	openfpm_init(&argc,&argv);
+
+	// domain
+	Box<3,float> domain({0.0,0.0,0.0},{2.5,2.5,2.5});
+	
+	// grid size
+        size_t sz[3] = {512,512,512};
+
+	// Define periodicity of the grid
+	periodicity<3> bc = {PERIODIC,PERIODIC,PERIODIC};
+	
+	// Ghost in grid unit
+	Ghost<3,long int> g(1);
+	
+	// deltaT
+	float deltaT = 0.25;
+
+	// Diffusion constant for specie U
+	float du = 2*1e-5;
+
+	// Diffusion constant for specie V
+	float dv = 1*1e-5;
+
+	// Number of timesteps
+#ifdef TEST_RUN
+	size_t timeSteps = 300;
+#else
+        size_t timeSteps = 15000;
+#endif
+
+	// K and F (Physical constant in the equation)
+    float K = 0.053;
+    float F = 0.014;
+
+	SparseGridType grid(sz,domain,g,bc);
+
+	// spacing of the grid on x and y
+	float spacing[3] = {grid.spacing(0),grid.spacing(1),grid.spacing(2)};
+
+	init(grid,domain);
+
+	// sync the ghost
+	grid.deviceToHost<U>();
+	grid.write("final");
+	grid.print_stats();
+
+	//! \cond [time stepping] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse_gpu Gray Scott in 3D using sparse grids on GPU
+	 *
+	 * ## Finalize ##
+	 *
+	 * Deinitialize the library
+	 *
+	 * \snippet SparseGrid/8_filling_benchmark_gpu/main.cu finalize
+	 *
+	 */
+
+	//! \cond [finalize] \endcond
+
+	openfpm_finalize();
+
+	//! \cond [finalize] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse_gpu Gray Scott in 3D using sparse grids on GPU
+	 *
+	 * # Full code # {#code}
+	 *
+	 * \include SparseGrid/8_filling_benchmark_gpu/main.cu
+	 *
+	 */
+}
+
+#else
+
+int main(int argc, char* argv[])
+{
+        return 0;
+}
+
+#endif
+
diff --git a/example/Vector/1_gpu_first_step/Makefile b/example/Vector/1_gpu_first_step/Makefile
index 1599114c9..30dbf24e5 100644
--- a/example/Vector/1_gpu_first_step/Makefile
+++ b/example/Vector/1_gpu_first_step/Makefile
@@ -4,6 +4,8 @@ CUDA_CC=
 ifdef CUDA_ON_CPU
 	CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
         INCLUDE_PATH_NVCC=
+        CUDA_OPTIONS=-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000
+        LIBS_SELECT=$(LIBS_CUDA_ON_CPU)
 else
 	ifeq (, $(shell which nvcc))
         	CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
@@ -11,6 +13,7 @@ else
 	else
         	CUDA_CC=nvcc -ccbin=mpic++
 	endif
+	LIBS_SELECT=$(LIBS)
 endif
 
 
@@ -21,13 +24,13 @@ OBJ = main.o
 gpu_fstep:
 
 %.o: %.cu
-	$(CUDA_CC) -O3 -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH_NVCC)
+	$(CUDA_CC) -O3 -g $(CUDA_OPTIONS) -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)
 
 %.o: %.cpp
 	$(CC) -O3 $(OPT) -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
 
 gpu_fstep: $(OBJ)
-	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
+	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT)
 
 all: gpu_fstep
 
diff --git a/example/Vector/1_gpu_first_step/main.cu b/example/Vector/1_gpu_first_step/main.cu
index 0bb965e80..30a69fcde 100644
--- a/example/Vector/1_gpu_first_step/main.cu
+++ b/example/Vector/1_gpu_first_step/main.cu
@@ -99,6 +99,21 @@
  *   \snippet Vector/1_gpu_first_step/main.cu using_openmpi
  *
  * * MPI must be compiled with CUDA support (in general installing OpenFPM with -g should attempt to install OpenMPI with CUDA support)
+ * 
+ * ## Macro CUDA_LAUNCH
+ *
+ * When we want to launch a kernel "my_kernel" on CUDA we in general use the Nvidia CUDA syntax
+ *
+ * my_kernel<<<wthr,thr>>>(arguments ... )
+ *
+ * Where wthr is the number of workgroups and thr is the number of threads in a workgroup and arguments... are the arguments to pass to the kernel.
+ * Equivalently we can launch a kernel with the macro CUDA_LAUNCH_DIM3(my_kernel,wthr,thr,arguments...) or CUDA_LAUNCH(my_kernel,ite,arguments) where
+ * ite has been taken using getDomainIteratorGPU. There are several advantages to using CUDA_LAUNCH. The first advantage of using the macro is that, enabling SE_CLASS1,
+ * all kernel launches become synchronous and an error check is performed before continuing to the next kernel, making debugging easier. Another feature is the possibility
+ * to run CUDA code on CPU without a GPU, compiling with "CUDA_ON_CPU=1 make" (Note: openfpm must be compiled with GPU support (-g) or with CUDA_ON_CPU support
+ * (-c "... --enable_cuda_on_cpu")). You can compile this example on CPU. You do not have to change a single line of code for this example. (Check the video to see this
+ * feature in action). All the openfpm GPU examples and CUDA examples can run on CPU if they use the CUDA_LAUNCH macro. We are planning to support
+ * AMD GPUs as well using this system.
  *
  * ## Full code ## {#code_e0_sim}
  *
@@ -211,7 +226,8 @@ int main(int argc, char* argv[])
 	//! \cond [launch_domain_it] \endcond
 
 	auto ite = vd.getDomainIteratorGPU();
-	translate_fill_prop<<<ite.wthr,ite.thr>>>(vd.toKernel());
+	// translate_fill_prop<<<ite.wthr,ite.thr>>>(vd.toKernel());
+	CUDA_LAUNCH(translate_fill_prop,ite,vd.toKernel());
 
 	//! \cond [launch_domain_it] \endcond
 
@@ -230,7 +246,8 @@ int main(int argc, char* argv[])
 	for (int j = 0 ; j < 100 ; j++)
 	{
 		auto ite = vd.getDomainIteratorGPU();
-		translate_fill_prop<<<ite.wthr,ite.thr>>>(vd.toKernel());
+		// translate_fill_prop<<<ite.wthr,ite.thr>>>(vd.toKernel());
+		CUDA_LAUNCH(translate_fill_prop,ite,vd.toKernel());
 
 		vd.map(RUN_ON_DEVICE);
 		vd.template ghost_get<0,1,2>(RUN_ON_DEVICE);
diff --git a/example/Vector/7_SPH_dlb_gpu/Makefile b/example/Vector/7_SPH_dlb_gpu/Makefile
index 907d4ccd1..374603b68 100644
--- a/example/Vector/7_SPH_dlb_gpu/Makefile
+++ b/example/Vector/7_SPH_dlb_gpu/Makefile
@@ -9,6 +9,8 @@ ifdef CUDA_ON_CPU
         CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
         INCLUDE_PATH_NVCC=
         CUDA_CC_LINK=mpic++
+        CUDA_OPTIONS=-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000
+        LIBS_SELECT=$(LIBS_CUDA_ON_CPU)
 else
         ifeq (, $(shell which nvcc))
                 CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
@@ -18,6 +20,7 @@ else
                 CUDA_CC=nvcc -ccbin=mpic++
                 CUDA_CC_LINK=nvcc -ccbin=mpic++
         endif
+	LIBS_SELECT=$(LIBS)
 endif
 
 
@@ -33,13 +36,13 @@ sph_dlb_test: OPT += -DTEST_RUN
 sph_dlb_test: sph_dlb
 
 %.o: %.cu
-	$(CUDA_CC) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)
+	$(CUDA_CC) $(CUDA_OPTIONS) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)
 
 %.o: %.cpp
 	$(CC) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 sph_dlb: $(OBJ)
-	$(CUDA_CC_LINK) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
+	$(CUDA_CC_LINK) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT)
 
 all: sph_dlb
 
diff --git a/example/Vector/7_SPH_dlb_gpu/main.cu b/example/Vector/7_SPH_dlb_gpu/main.cu
index 08d91fbe2..50735f75d 100644
--- a/example/Vector/7_SPH_dlb_gpu/main.cu
+++ b/example/Vector/7_SPH_dlb_gpu/main.cu
@@ -39,6 +39,21 @@
  *
  * \snippet Vector/7_SPH_dlb_gpu/main.cu mark_to_remove_kernel
  *
+ * ## Macro CUDA_LAUNCH
+ *
+ * When we want to launch a kernel "my_kernel" on CUDA we in general use the Nvidia CUDA syntax
+ *
+ * my_kernel<<<wthr,thr>>>(arguments ... )
+ *
+ * Where wthr is the number of workgroups and thr is the number of threads in a workgroup and arguments... are the arguments to pass to the kernel. 
+ * Equivalently we can launch a kernel with the macro CUDA_LAUNCH_DIM3(my_kernel,wthr,thr,arguments...) or CUDA_LAUNCH(my_kernel,ite,arguments) where
+ * ite has been taken using getDomainIteratorGPU. There are several advantages to using CUDA_LAUNCH. The first advantage of using the macro is that, enabling SE_CLASS1,
+ * all kernel launches become synchronous and an error check is performed before continuing to the next kernel, making debugging easier. Another feature is the possibility
+ * to run CUDA code on CPU without a GPU, compiling with "CUDA_ON_CPU=1 make" (Note: openfpm must be compiled with GPU support (-g) or with CUDA_ON_CPU support
+ * (-c "... --enable_cuda_on_cpu")). You can compile this example on CPU. You do not have to change a single line of code for this example. (Check the video to see this
+ * feature in action). All the openfpm GPU examples and CUDA examples can run on CPU if they use the CUDA_LAUNCH macro. We are planning to support
+ * AMD GPUs as well using this system.
+ *
  * \include Vector/7_SPH_dlb_gpu_opt/main.cu
  *
  */
@@ -195,7 +210,10 @@ inline void EqState(particles & vd)
 {
 	auto it = vd.getDomainIteratorGPU();
 
-	EqState_gpu<<<it.wthr,it.thr>>>(vd.toKernel(),B);
+	// You can use standard CUDA kernel launch or the macro CUDA_LAUNCH
+
+	//EqState_gpu<<<it.wthr,it.thr>>>(vd.toKernel(),B);
+	CUDA_LAUNCH(EqState_gpu,it,vd.toKernel(),B)
 }
 
 
@@ -301,16 +319,16 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap
 	Point<3,real_number> xa = vd.getPos(a);
 
 	// Take the mass of the particle dependently if it is FLUID or BOUNDARY
-	real_number massa = (vd.getProp<type>(a) == FLUID)?MassFluid:MassBound;
+	real_number massa = (vd.template getProp<type>(a) == FLUID)?MassFluid:MassBound;
 
 	// Get the density of the of the particle a
-	real_number rhoa = vd.getProp<rho>(a);
+	real_number rhoa = vd.template getProp<rho>(a);
 
 	// Get the pressure of the particle a
-	real_number Pa = vd.getProp<Pressure>(a);
+	real_number Pa = vd.template getProp<Pressure>(a);
 
 	// Get the Velocity of the particle a
-	Point<3,real_number> va = vd.getProp<velocity>(a);
+	Point<3,real_number> va = vd.template getProp<velocity>(a);
 
 	// Reset the force counter (- gravity on zeta direction)
 	vd.template getProp<force>(a)[0] = 0.0;
@@ -319,7 +337,7 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap
 	vd.template getProp<drho>(a) = 0.0;
 
 	// We threat FLUID particle differently from BOUNDARY PARTICLES ...
-	if (vd.getProp<type>(a) != FLUID)
+	if (vd.template getProp<type>(a) != FLUID)
 	{
 
 		// If it is a boundary particle calculate the delta rho based on equation 2
@@ -339,14 +357,14 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap
 			if (a == b)	{++Np; continue;};
 
 			// get the mass of the particle
-			real_number massb = (vd.getProp<type>(b) == FLUID)?MassFluid:MassBound;
+			real_number massb = (vd.template getProp<type>(b) == FLUID)?MassFluid:MassBound;
 
 			// Get the velocity of the particle b
-			Point<3,real_number> vb = vd.getProp<velocity>(b);
+			Point<3,real_number> vb = vd.template getProp<velocity>(b);
 
 			// Get the pressure and density of particle b
-			real_number Pb = vd.getProp<Pressure>(b);
-			real_number rhob = vd.getProp<rho>(b);
+			real_number Pb = vd.template getProp<Pressure>(b);
+			real_number rhob = vd.template getProp<rho>(b);
 
 			// Get the distance between p and q
 			Point<3,real_number> dr = xa - xb;
@@ -374,7 +392,7 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap
 			++Np;
 		}
 
-		vd.getProp<red>(a) = max_visc;
+		vd.template getProp<red>(a) = max_visc;
 	}
 	else
 	{
@@ -395,10 +413,10 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap
 			// if (p == q) skip this particle
 			if (a == b)	{++Np; continue;};
 
-			real_number massb = (vd.getProp<type>(b) == FLUID)?MassFluid:MassBound;
-			Point<3,real_number> vb = vd.getProp<velocity>(b);
-			real_number Pb = vd.getProp<Pressure>(b);
-			real_number rhob = vd.getProp<rho>(b);
+			real_number massb = (vd.template getProp<type>(b) == FLUID)?MassFluid:MassBound;
+			Point<3,real_number> vb = vd.template getProp<velocity>(b);
+			real_number Pb = vd.template getProp<Pressure>(b);
+			real_number rhob = vd.template getProp<rho>(b);
 
 			// Get the distance between p and q
 			Point<3,real_number> dr = xa - xb;
@@ -415,7 +433,7 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap
 				Point<3,real_number> DW;
 				DWab(dr,DW,r,false);
 
-				real_number factor = - massb*((vd.getProp<Pressure>(a) + vd.getProp<Pressure>(b)) / (rhoa * rhob) + Tensile(r,rhoa,rhob,Pa,Pb,W_dap) + Pi(dr,r2,v_rel,rhoa,rhob,massb,cbar,max_visc));
+				real_number factor = - massb*((vd.template getProp<Pressure>(a) + vd.template getProp<Pressure>(b)) / (rhoa * rhob) + Tensile(r,rhoa,rhob,Pa,Pb,W_dap) + Pi(dr,r2,v_rel,rhoa,rhob,massb,cbar,max_visc));
 
 				vd.template getProp<force>(a)[0] += factor * DW.get(0);
 				vd.template getProp<force>(a)[1] += factor * DW.get(1);
@@ -427,7 +445,7 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap
 			++Np;
 		}
 
-		vd.getProp<red>(a) = max_visc;
+		vd.template getProp<red>(a) = max_visc;
 	}
 }
 
@@ -438,7 +456,8 @@ template<typename CellList> inline void calc_forces(particles & vd, CellList & N
 	// Update the cell-list
 	vd.updateCellList(NN);
 
-	calc_forces_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),NN.toKernel(),W_dap,cbar);
+	//calc_forces_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),NN.toKernel(),W_dap,cbar);
+	CUDA_LAUNCH(calc_forces_gpu,part,vd.toKernel(),NN.toKernel(),W_dap,cbar)
 
 	max_visc = reduce_local<red,_max_>(vd);
 }
@@ -448,11 +467,11 @@ __global__ void max_acceleration_and_velocity_gpu(vector_type vd)
 {
 	auto a = GET_PARTICLE(vd);
 
-	Point<3,real_number> acc(vd.getProp<force>(a));
-	vd.getProp<red>(a) = norm(acc);
+	Point<3,real_number> acc(vd.template getProp<force>(a));
+	vd.template getProp<red>(a) = norm(acc);
 
-	Point<3,real_number> vel(vd.getProp<velocity>(a));
-	vd.getProp<red2>(a) = norm(vel);
+	Point<3,real_number> vel(vd.template getProp<velocity>(a));
+	vd.template getProp<red2>(a) = norm(vel);
 }
 
 void max_acceleration_and_velocity(particles & vd, real_number & max_acc, real_number & max_vel)
@@ -460,7 +479,8 @@ void max_acceleration_and_velocity(particles & vd, real_number & max_acc, real_n
 	// Calculate the maximum acceleration
 	auto part = vd.getDomainIteratorGPU();
 
-	max_acceleration_and_velocity_gpu<<<part.wthr,part.thr>>>(vd.toKernel());
+	// max_acceleration_and_velocity_gpu<<<part.wthr,part.thr>>>(vd.toKernel());
+	CUDA_LAUNCH(max_acceleration_and_velocity_gpu,part,vd.toKernel());
 
 	max_acc = reduce_local<red,_max_>(vd);
 	max_vel = reduce_local<red2,_max_>(vd);
@@ -566,7 +586,8 @@ void verlet_int(particles & vd, real_number dt)
 	real_number dt205 = dt*dt*0.5;
 	real_number dt2 = dt*2.0;
 
-	verlet_int_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),dt,dt2,dt205);
+	// verlet_int_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),dt,dt2,dt205);
+	CUDA_LAUNCH(verlet_int_gpu,part,vd.toKernel(),dt,dt2,dt205);
 
 	//! \cond [remove_marked_part] \endcond
 
@@ -646,7 +667,8 @@ void euler_int(particles & vd, real_number dt)
 
 	real_number dt205 = dt*dt*0.5;
 
-	euler_int_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),dt,dt205);
+	// euler_int_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),dt,dt205);
+	CUDA_LAUNCH(euler_int_gpu,part,vd.toKernel(),dt,dt205);
 
 	// remove the particles
 	remove_marked<red>(vd);
@@ -722,7 +744,8 @@ inline void sensor_pressure(Vector & vd,
         // if the probe is inside the processor domain
 		if (vd.getDecomposition().isLocal(probes.get(i)) == true)
 		{
-			sensor_pressure_gpu<<<1,1>>>(vd.toKernel(),NN.toKernel(),probes.get(i),(real_number *)press_tmp_.toKernel());
+			// sensor_pressure_gpu<<<1,1>>>(vd.toKernel(),NN.toKernel(),probes.get(i),(real_number *)press_tmp_.toKernel());
+			CUDA_LAUNCH_DIM3(sensor_pressure_gpu,1,1,vd.toKernel(),NN.toKernel(),probes.get(i),(real_number *)press_tmp_.toKernel());
 
 			// move calculated pressure on
 			press_tmp_.deviceToHost();
@@ -892,7 +915,7 @@ int main(int argc, char* argv[])
 	// Ok the initialization is done on CPU on GPU we are doing the main loop, so first we offload all properties on GPU
 
 	vd.hostToDevicePos();
-	vd.template hostToDeviceProp<type,rho,rho_prev,Pressure,velocity>();
+	vd.template hostToDeviceProp<type,rho,rho_prev,Pressure,velocity,velocity_prev>();
 
 	vd.ghost_get<type,rho,Pressure,velocity>(RUN_ON_DEVICE);
 
-- 
GitLab