diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 7aea218cc196fc229b2a562824dc4ea6667ec3a0..e3bc64a9708191b1d38fd9c2e563d294a5942c98 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -74,8 +74,6 @@ ubuntu_run:
     - ./run.sh $CI_PROJECT_DIR unused 1 pdata 0 $CI_COMMIT_REF_NAME
     - ./run.sh $CI_PROJECT_DIR unused 2 pdata 0 $CI_COMMIT_REF_NAME
     - ./run.sh $CI_PROJECT_DIR unused 3 pdata 0 $CI_COMMIT_REF_NAME
-    - ./run.sh $CI_PROJECT_DIR unused 4 pdata 0 $CI_COMMIT_REF_NAME
-    - ./run.sh $CI_PROJECT_DIR unused 5 pdata 0 $CI_COMMIT_REF_NAME
     - cd openfpm_numerics
     - ./run.sh $CI_PROJECT_DIR unused 1 0 0 numerics
     - ./run.sh $CI_PROJECT_DIR unused 2 0 0 numerics
diff --git a/.gitmodules b/.gitmodules
index 6b30b305dd992cbc0eae8c5c1da296699017a1a4..aa6b50bf8fad1606872c2249313fc6dc2b9de64a 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -13,3 +13,6 @@
 [submodule "openfpm_numerics"]
 	path = openfpm_numerics
 	url = ssh://git@git.mpi-cbg.de/openfpm/openfpm_numerics.git
+[submodule "gdbgui"]
+	path = gdbgui
+	url = https://github.com/incardon/gdbgui.git
diff --git a/CHANGELOG.md b/CHANGELOG.md
index afe6649ba91f03cf05853a9882aa24973044aeea..d5ed2e9d81077b8c78277cc4964c9ba659354d93 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,38 @@
 # Change Log
 All notable changes to this project will be documented in this file.
 
-## [3.1.0] 2020 (Codename New Horizons)
+## [3.3.0] April 2021 (Codename Vega)
+
+- Adding support for HIP and AMD GPU. (Only particles) 1_gpu_first/7_sph_dlb_gpu/7_sph_dlb_gpu_opt are compatible with HIP
+Additional Notes:
+- WARNING: AMD GPUs are tested manually and not in CI. This mean that out this release stuff can break at least
+           until I do not convince my working place to buy one for me ... and is gonna be hard because or rule here and there ... or who 
+           is reading this message does not want to buy one for me :-)
+- SparseGridGPU are unsupported untill AMD does not fix the bug reported here:
+           https://github.com/ROCm-Developer-Tools/HIP/issues/2260
+
+### Changes
+
+- None
+
+### Fixed
+
+- uninitialized variables in the SPH example on GPU, and other fixes necessary for AMD gpus
+
+## [3.2.0] January 2021 (Codename Hopper)
+
+- Adding CUDA_ON_CPU option to run CUDA code on CPU
+- Adding gdb-gui debugger
+
+### Fixed
+
+- Minors bugs
+
+### Changes
+
+- In order to compile OpenFPM is now required a compiler implementing C++14 Standard
+
+## [3.1.0] October 2020 (Codename New Horizons)
 
 - Adding GPU support for ghost_put
 - Adding support for CUDA 11
@@ -14,7 +45,7 @@ All notable changes to this project will be documented in this file.
 
 - None
 
-## [3.0.0] 2020 (Codename Sparsity)
+## [3.0.0] July 2020 (Codename Sparsity)
 
 - Upgrading all the dependencies: BOOST,PETSC,SUITESPARSE,OPENBLAS
 - Adding CPU and GPU sparse grids. Look at the examples SparseGrid in the forlder examples
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3b1e858bc0b1f888af6c5602cb05a35ce76d0977..3ee76a524b1a1826eb1655d02f2df73ceca763e2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,6 +11,8 @@ if (POLICY CMP0074)
   cmake_policy(SET CMP0074 NEW)
 endif()
 
+set(openfpm_VERSION 3.3.0)
+
 list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake_modules/)
 
 set(BOOST_INCLUDE ${Boost_INCLUDE_DIR} CACHE PATH "Include directory for BOOST")
@@ -35,7 +37,23 @@ set(METIS_DIR ${METIS_ROOT})
 set(PARMETIS_DIR ${PARMETIS_ROOT})
 set(OPENBLAS_ROOT CACHE PATH "Root path for blas library")
 set(SuiteSparse_ROOT ${SUITESPARSE_ROOT})
-
+set(CUDA_ON_CPU CACHE BOOL "Make Cuda work on heap")
+set(CPACK_RUN_INSTALL_DEPENDENCIES CACHE BOOL "Set to true if we are creating deb or RPM packages")
+set(ENABLE_GARBAGE_INJECTOR CACHE BOOL "Enable the injector of garbage in the memory allocator")
+set(ENABLE_VCLUSTER_GARBAGE_INJECTOR CACHE BOOL "Enable the injector of garbage in the vcluster memory buffers")
+set(HIP_ENABLE CACHE BOOL "Enable HIP compiler")
+set(AMD_ARCH_COMPILE "gfx900" CACHE STRING "AMD gpu architecture used to compile kernels")
+
+# Enabling real GPU is stronger than using CUDA_ON_CPU
+if (ENABLE_GPU)
+	set(CUDA_ON_CPU OFF)
+	# Test CLang
+	if (NOT HIP_ENABLE)
+		find_package(CUDA)
+		set(CMAKE_CUDA_COMPILER_LIBRARY_ROOT ${CUDA_TOOLKIT_ROOT_DIR})
+        	enable_language(CUDA)
+	endif()
+endif()
 
 set (CMAKE_CXX_STANDARD 14)
 set (CMAKE_CUDA_STANDARD 14)
@@ -51,50 +69,45 @@ endif()
 set(ENV{PATH} "$ENV{PATH}:${HDF5_ROOT}/bin")
 set(HDF5_PREFER_PARALLEL TRUE)
 
-if(ENABLE_GPU)
-	enable_language(CUDA)
-	find_package(CUDA)
-
-	if (CUDA_VERSION_MAJOR EQUAL 9 AND CUDA_VERSION_MINOR EQUAL 2)
-		message("CUDA is compatible 9.2")
-		set(WARNING_SUPPRESSION_AND_OPTION_NVCC  -Xcudafe "--display_error_number --diag_suppress=611 --diag_suppress=2885 --diag_suppress=2886  --diag_suppress=2887  --diag_suppress=2888 --diag_suppress=186 --diag_suppress=111" --expt-extended-lambda)
-		FILE(WRITE cuda_options " -Xcudafe \"--display_error_number --diag_suppress=611 --diag_suppress=2885 --diag_suppress=2886  --diag_suppress=2887  --diag_suppress=2888 --diag_suppress=186 --diag_suppress=111\" --expt-extended-lambda ")
-	elseif ( CUDA_VERSION_MAJOR EQUAL 10 AND CUDA_VERSION_MINOR EQUAL 1 )
-                message("CUDA is compatible 10.1")
-                set(WARNING_SUPPRESSION_AND_OPTION_NVCC  -Xcudafe "--display_error_number --diag_suppress=2915 --diag_suppress=2912 --diag_suppress=2913 --diag_suppress=111 --diag_suppress=186 --diag_suppress=611 " --expt-extended-lambda )
-		FILE(WRITE cuda_options "-Xcudafe \"--display_error_number --diag_suppress=2915 --diag_suppress=2914  --diag_suppress=2912 --diag_suppress=2913 --diag_suppress=111 --diag_suppress=186 --diag_suppress=611 \" --expt-extended-lambda")
-        elseif ( CUDA_VERSION_MAJOR EQUAL 10 AND CUDA_VERSION_MINOR EQUAL 2 )
-                message("CUDA is compatible 10.2")
-                set(WARNING_SUPPRESSION_AND_OPTION_NVCC  -Xcudafe "--display_error_number --diag_suppress=2976 --diag_suppress=2977 --diag_suppress=2978 --diag_suppress=2979 --diag_suppress=1835 --diag_suppress=611 --diag_suppress=186 --diag_suppress=128" --expt-extended-lambda)
-                set(WARNING_SUPPRESSION_AND_OPTION_NVCC_TEXT  "-Xcudafe \"--display_error_number --diag_suppress=2976 --diag_suppress=2977 --diag_suppress=2978 --diag_suppress=2979 --diag_suppress=1835 --diag_suppress=611 --diag_suppress=186 --diag_suppress=128\" --expt-extended-lambda")
-		FILE(WRITE cuda_options "-Xcudafe \"--display_error_number --diag_suppress=2976 --diag_suppress=2977 --diag_suppress=2978 --diag_suppress=2979 --diag_suppress=1835 --diag_suppress=611 --diag_suppress=186 --diag_suppress=128\" --expt-extended-lambda")
-        elseif ( CUDA_VERSION_MAJOR EQUAL 11 AND CUDA_VERSION_MINOR EQUAL 0 )
-                message("CUDA is compatible 11.0")
-                set(WARNING_SUPPRESSION_AND_OPTION_NVCC  -Xcudafe "--display_error_number --diag_suppress=3056 --diag_suppress=3057 --diag_suppress=3058 --diag_suppress=3059 --diag_suppress=611  --diag_suppress=186 --diag_suppress=128" --expt-extended-lambda)
-                set(WARNING_SUPPRESSION_AND_OPTION_NVCC_TEXT  "-Xcudafe \"--display_error_number --diag_suppress=3056 --diag_suppress=3057  --diag_suppress=3058 --diag_suppress=3059 --diag_suppress=611  --diag_suppress=186 --diag_suppress=128\" --expt-extended-lambda")
-                FILE(WRITE cuda_options "-Xcudafe \"--display_error_number --diag_suppress=3056 --diag_suppress=3058 --diag_suppress=3058 --diag_suppress=3059 --diag_suppress=611  --diag_suppress=186 --diag_suppress=128\" --expt-extended-lambda")
-	else()
-		message(FATAL_ERROR "CUDA is incompatible, version 9.2 10.1 10.2 and 11.0 is only supported")
-	endif()
-endif()
-
 set(Vc_DIR "${Vc_ROOT}/lib/cmake/Vc/")
 message("Searching Vc in ${Vc_DIR}")
 
-find_package(Boost 1.68.0 COMPONENTS unit_test_framework iostreams program_options system filesystem)
+find_package(Boost 1.72.0 COMPONENTS unit_test_framework iostreams program_options system filesystem OPTIONAL_COMPONENTS fiber context)
 find_package(MPI)
 find_package(PETSc)
 find_package(HDF5)
 find_package(Eigen3)
 find_package(LibHilbert)
-find_package(Metis)
-find_package(ParMetis)
+find_package(METIS)
+find_package(ParMETIS)
 find_package(TinyObjLoader )
 find_package(BLAS)
 find_package(LAPACK)
 find_package(Eigen3)
 find_package(SuiteSparse OPTIONAL_COMPONENTS UMFPACK)
 find_package(Vc)
+find_package(OpenMP)
+find_package(HIP)
+
+set(CMAKE_SKIP_BUILD_RPATH TRUE)
+
+if(HIP_FOUND)
+        set(DEFINE_HIP_GPU "#define HIP_GPU")
+        set(DEFINE_CUDIFY_USE_HIP "#define CUDIFY_USE_HIP")
+	file(WRITE hip_enabled 1)
+else()
+	file(WRITE hip_enabled 0)
+endif()
+
+if(HIP_FOUND)
+        set(DEFINE_CUDA_GPU "#define CUDA_GPU")
+endif()
+
+if (OPENMP_FOUND)
+    set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+    set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+endif()
 
 if(PROFILE_WITH_SCOREP)
 	set(CMAKE_CXX_COMPILER_LAUNCHER "scorep")
@@ -102,7 +115,8 @@ if(PROFILE_WITH_SCOREP)
 	set(CMAKE_CUDA_COMPILER_LAUNCHER "scorep")
 endif()
 
-if(CUDA_FOUND)
+
+if(ENABLE_GPU AND (CUDA_FOUND OR HIP_FOUND))
 	set(OPENFPM_INIT_FILE "initialize/initialize_wrapper_cuda.cu")
 else()
 	set(OPENFPM_INIT_FILE "initialize/initialize_wrapper_cpu.cpp")
@@ -138,10 +152,6 @@ if(TEST_PERFORMANCE)
 	set(DEFINE_PERFORMANCE_TEST "#define PERFORMANCE_TEST")
 endif()
 
-if(CUDA_FOUND)
-	set(DEFINE_CUDA_GPU "#define CUDA_GPU")
-endif()
-
 if (METIS_FOUND)
 	set(DEFINE_HAVE_METIS "#define HAVE_METIS 1")
 else()
@@ -156,7 +166,11 @@ else()
 	message( FATAL_ERROR "ParMetis is required in order to install OpenFPM")
 endif()
 
+
 if(MPI_FOUND)
+	get_filename_component(OPENFPM_MPI_DEP "${MPI_C_INCLUDE_DIRS}" DIRECTORY)
+	file(READ ${CMAKE_SOURCE_DIR}/src/cmake/openfpmConfig-configure.cmake CMAKE_OPENFPM_CONFIG_VARS)
+	file(WRITE ${CMAKE_SOURCE_DIR}/src/cmake/openfpmConfigVars-configure.cmake "${CMAKE_OPENFPM_CONFIG_VARS}\nset(CMAKE_PREFIX_PATH ${OPENFPM_MPI_DEP}/)")
 	set(DEFINE_HAVE_MPI "#define HAVE_MPI")
 else()
 	file(WRITE error_code "200")
@@ -164,15 +178,35 @@ else()
 endif()
 
 if (Boost_FOUND)
-	set(DEFINE_HAVE_BOOST "#define HAVE_BOOST")
-	set(DEFINE_HAVE_BOOST_IOSTREAMS "#define HAVE_BOOST_IOSTREAMS")
-	set(DEFINE_HAVE_BOOST_PROGRAM_OPTIONS "#define HAVE_BOOST_PROGRAM_OPTIONS")
-	set(DEFINE_HAVE_BOOST_UNIT_TEST_FRAMEWORK "#define HAVE_BOOST_UNIT_TEST_FRAMEWORK")
+        set(DEFINE_HAVE_BOOST "#define HAVE_BOOST")
+        set(DEFINE_HAVE_BOOST_IOSTREAMS "#define HAVE_BOOST_IOSTREAMS")
+        set(DEFINE_HAVE_BOOST_PROGRAM_OPTIONS "#define HAVE_BOOST_PROGRAM_OPTIONS")
+        set(DEFINE_HAVE_BOOST_UNIT_TEST_FRAMEWORK "#define HAVE_BOOST_UNIT_TEST_FRAMEWORK")
+        if (Boost_CONTEXT_FOUND)
+                set(DEFINE_HAVE_BOOST_CONTEXT "#define HAVE_BOOST_CONTEXT")
+		set(OPTIONAL_BOOST_LIBS "-lboost_context")
+        else()
+                #if context is not there CUDA_ON_CPU cannot be activated
+                set(CUDA_ON_CPU OFF)
+        endif()
+        if (Boost_FIBER_FOUND)
+                set(DEFINE_HAVE_BOOST_FIBER "#define HAVE_BOOST_FIBER")
+		string(CONCAT OPTIONAL_BOOST_LIBS ${OPTIONAL_BOOST_LIBS}  " -lboost_fiber")
+        endif()
+	file(WRITE optional_boost_libs "${OPTIONAL_BOOST_LIBS}")
 else()
 	file(WRITE error_code "202")
 	message( FATAL_ERROR "BOOST is required in order to install OpenFPM" )
 endif()
 
+if(ENABLE_GPU AND CUDA_FOUND)
+        set(DEFINE_CUDA_GPU "#define CUDA_GPU")
+endif()
+
+if(CUDA_ON_CPU)
+        set(DEFINE_CUDA_GPU "#define CUDA_GPU")
+endif()
+
 if(HDF5_FOUND)
 	if (HDF5_IS_PARALLEL)
 		set(DEFINE_HAVE_HDF5 "#define HAVE_HDF5")
@@ -210,6 +244,14 @@ else()
 	message( FATAL_ERROR "Vc is required in roder to install OpenFPM")
 endif()
 
+if (ENABLE_GARBAGE_INJECTOR)
+	set(DEFINE_GARBAGE_INJECTOR "#define GARBAGE_INJECTOR")
+endif()
+
+if (ENABLE_VCLUSTER_GARBAGE_INJECTOR)
+        set(DEFINE_VCLUSTER_GARBAGE_INJECTOR "#define VCLUSTER_GARBAGE_INJECTOR")
+endif()
+
 if(APPLE)
 	set(DEFINE_HAVE_OSX "#define HAVE_OSX")
 endif()
@@ -220,17 +262,109 @@ endif()
 
 file(WRITE error_code "0")
 file(WRITE cuda_lib "${CUDA_cudart_static_LIBRARY} ${CUDA_cudadevrt_LIBRARY}")
-file(WRITE cuda_include "-I${CUDA_INCLUDE_DIRS}")
+if(CUDA_ON_CPU)
+	file(WRITE cuda_include "-I${CUDA_INCLUDE_DIRS} -D__NVCC__ -DCUDART_VERSION=11000 -DCUDA_ON_CPU")
+	file(WRITE cuda_on_cpu "YES")
+else()
+	file(WRITE cuda_include "-I${CUDA_INCLUDE_DIRS}")
+	file(WRITE cuda_on_cpu "NO")
+endif()
 file(WRITE mpi_include "-I${MPI_C_INCLUDE_DIRS}")
 file(WRITE mpi_libs "${MPI_C_LINK_FLAGS} ${MPI_C_LIBRARIES}")
-file(WRITE cuda_options "${WARNING_SUPPRESSION_AND_OPTION_NVCC_TEXT}")
 
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/src/config/config_cmake.h.in ${CMAKE_CURRENT_SOURCE_DIR}/src/config/config.h)
 
-add_subdirectory (src)
-add_subdirectory (openfpm_devices)
-add_subdirectory (openfpm_data)
 add_subdirectory (openfpm_io)
-add_subdirectory (openfpm_vcluster)
 add_subdirectory (openfpm_numerics)
 
+file(WRITE cuda_options "${WARNING_SUPPRESSION_AND_OPTION_NVCC_TEXT}")
+
+add_subdirectory (src)
+
+#################### CPack to create auto installing packages
+
+include(InstallRequiredSystemLibraries)
+
+string(REPLACE "." ";" VERSION_LIST ${openfpm_VERSION})
+list(GET VERSION_LIST 0 OPENFPM_VERSION_MAJOR)
+list(GET VERSION_LIST 1 OPENFPM_VERSION_MINOR)
+list(GET VERSION_LIST 2 OPENFPM_VERSION_PATCH)
+
+if (CPACK_RUN_INSTALL_DEPENDENCIES)
+
+	set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "OpenFPM distributed data-structures")
+	set(CPACK_PACKAGE_VENDOR "IBirdSoft")
+	set(CPACK_PACKAGE_DESCRIPTION_FILE "${CMAKE_CURRENT_SOURCE_DIR}/README.txt")
+	set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/gpl-3.0.txt")
+	set(CPACK_PACKAGE_VERSION_MAJOR ${OPENFPM_VERSION_MAJOR})
+	set(CPACK_PACKAGE_VERSION_MINOR ${OPENFPM_VERSION_MINOR})
+	set(CPACK_PACKAGE_VERSION_PATCH ${OPENFPM_VERSION_PATCH})
+	set(CPACK_PACKAGE_INSTALL_DIRECTORY "CMake ${CMake_VERSION_MAJOR}.${CMake_VERSION_MINOR}")
+	set(CPACK_PACKAGE_INSTALL_DIRECTORY /usr/local/openfpm)
+	set(CPACK_PACKAGING_INSTALL_PREFIX /usr/local/openfpm)
+	set(CPACK_PACKAGE_HOMEPAGE_URL http://openfpm.mpi-cbg.de)
+	set(CPACK_RPM_PACKAGE_AUTOREQPROV NO)
+	set(CPACK_PACKAGE_CONTACT incardon@mpi-cbg.de)
+	set(CPACK_DEBIAN_PACKAGE_MAINTAINER Pietro Incardona)
+	set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE ${CMAKE_CURRENT_SOURCE_DIR}/src/scripts/postinst)
+	set(CPACK_POSTFLIGHT_OPENFPM_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/src/scripts/postflight)
+	set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_SOURCE_DIR}/src/script/postinst")
+	set(CPACK_RESOURCE_FILE_README "${CMAKE_CURRENT_SOURCE_DIR}/README.txt")
+
+	install(FILES $ENV{DEP_PACKING}/openfpm_vars
+		DESTINATION source
+              	COMPONENT OpenFPM)
+
+        install(DIRECTORY $ENV{DEP_PACKING}/BOOST
+                DESTINATION dependencies/
+                COMPONENT OpenFPM)
+
+        install(DIRECTORY $ENV{DEP_PACKING}/EIGEN
+                DESTINATION dependencies/
+                COMPONENT OpenFPM)
+
+        install(DIRECTORY $ENV{DEP_PACKING}/HDF5
+                DESTINATION dependencies/
+                COMPONENT OpenFPM)
+
+        install(DIRECTORY $ENV{DEP_PACKING}/LIBHILBERT
+                DESTINATION dependencies/
+                COMPONENT OpenFPM)
+
+        install(DIRECTORY $ENV{DEP_PACKING}/METIS
+                DESTINATION dependencies/
+                COMPONENT OpenFPM)
+
+        install(DIRECTORY $ENV{DEP_PACKING}/MPI
+                DESTINATION dependencies/
+                COMPONENT OpenFPM)
+
+        install(DIRECTORY $ENV{DEP_PACKING}/OPENBLAS
+                DESTINATION dependencies/
+                COMPONENT OpenFPM)
+
+        install(DIRECTORY $ENV{DEP_PACKING}/PARMETIS
+                DESTINATION dependencies/
+                COMPONENT OpenFPM)
+
+        install(DIRECTORY $ENV{DEP_PACKING}/PETSC
+                DESTINATION dependencies/
+                COMPONENT OpenFPM)
+
+        install(DIRECTORY $ENV{DEP_PACKING}/SUITESPARSE
+                DESTINATION dependencies/
+                COMPONENT OpenFPM)
+
+	install(DIRECTORY $ENV{DEP_PACKING}/VCDEVEL
+                DESTINATION dependencies/
+                COMPONENT OpenFPM)
+
+endif()
+
+include(CPack)
+
+cpack_add_component(OpenFPM
+                    DISPLAY_NAME OpenFPM
+                    DESCRITION OpenFPM binary files)
+
+
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..01b4977e08a09a9040e8108b10aad9f4809eab31
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,17 @@
+all:
+	$(MAKE) -C build $@
+
+clean:
+	$(MAKE) -C build $@
+
+install:
+	$(MAKE) -C build $@
+	script/install_parallel_debugger
+
+pdata:
+	$(MAKE) -C build $@
+
+numerics:
+	$(MAKE) -C build $@
+
+.PHONY: all clean install
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4d10c8605e4414c054827bfb5d7adc3200dcd397
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,25 @@
+# OpenFPM
+
+OpenFPM is a scalable and open C++ framework for particles and mesh simulation
+
+You can build scalable: Molecular dynamic,SPH, Granular flow,Finite differences, Particle Mesh codes on CPUs and GPUs. We provide examples for each case here. Despite our main interest in simulations the distributed data-structures OpenFPM provides are not limited to simulations. The work is released under GPL 3.0
+ 
+
+If you use OpenFPM please cite this paper
+
+https://www.sciencedirect.com/science/article/pii/S0010465519300852?via%3Dihub
+
+## Installation
+
+To install run ./install and follow instructions
+
+At the end of the installation rememerb to to make install
+
+# Example and documentation
+
+Example and documentation can be founded at openfpm.mpi-cbg.de
+
+
+
+
+
diff --git a/build.sh b/build.sh
index f565a95193ec51969c7adf67c7c81c63e9b83910..f26c0a6d1ccad1aae8b45701d2cdb31d514bbd20 100755
--- a/build.sh
+++ b/build.sh
@@ -30,6 +30,7 @@ if [ x"$hostname" == x"cifarm-ubuntu-node"  ]; then
 #	rm -rf $HOME/openfpm_dependencies/openfpm_pdata/$branch/
 	echo "Ubuntu node"
 	./install_MPI_mpich.sh $HOME/openfpm_dependencies/openfpm_pdata/$branch/ 4
+	export PATH="/opt/bin:$PATH"
 fi
 
 if [ x"$hostname" == x"cifarm-mac-node.mpi-cbg.de"  ]; then
@@ -44,6 +45,9 @@ if [ x"$hostname" == x"falcon1" ]; then
 	if [ x"$comp_type" == x"intel" ]; then
         	module load parallel_studio_xe/2019u1
 		dependency_dir=/projects/ppm/rundeck/openfpm_dependencies_intel/
+        elif [ x"$with_gpu" == x"" ]; then
+		mkdir /projects/ppm/rundeck/openfpm_dependencies_${branch}_no_cuda/
+                dependency_dir=/projects/ppm/rundeck/openfpm_dependencies_${branch}_no_cuda/
 	else
 		mkdir /projects/ppm/rundeck/openfpm_dependencies_$branch/
 		dependency_dir=/projects/ppm/rundeck/openfpm_dependencies_$branch/
@@ -72,31 +76,37 @@ mkdir openfpm_numerics/src/config
 echo "Compiling general"
 
 source ~/.bashrc
- 
-installation_dir="--prefix=$HOME/openfpm_install/$branch"
+
+if [ x"$comp_type" != x"full" ]; then
+	installation_dir=" "
+else
+	installation_dir="--prefix=$HOME/openfpm_install/$branch"
+fi
 
 # force ssh to not use HostKey verification
 #echo "StrictHostKeyChecking=no" > $HOME/.ssh/config
 #chmod 600 $HOME/.ssh/config
 
-foward_options=
+foward_options="--enable-cuda-on-cpu"
 install_options=
 if [ x"$comp_type" == x"full" ]; then
-        install_options="-s"
+        install_options="-s "
 elif [ x"$comp_type" == x"intel" ]; then
-        install_options=" "
+        install_options="-s "
 else
-        install_options="-s -m"
+        install_options="-s -m "
 fi
 
 if [ x"$comp_type" == x"se_class" ]; then
-	foward_options="--enable-se-class1 --with-action-on-error=THROW_ON_ERROR"
+	foward_options="$foward_options --enable-se-class1 --with-action-on-error=THROW_ON_ERROR"
+elif [ x"$comp_type" == x"garbageinjv" ]; then
+	foward_options="$foward_options  --enable-garbageinjv"
 elif [ x"$comp_type" == x"asan" ]; then
         foward_options="$foward_options --enable-asan"
 fi
 
 echo "Installing with: ./install $gpu_support  -i $dependency_dir $install_options -c \"$installation_dir $foward_options  \"  "
-./install $gpu_support -i $dependency_dir $install_options -c "$installation_dir $foward_options "
+nice -n 19 ./install $gpu_support -i $dependency_dir $install_options -c "$installation_dir $foward_options "
 if [ $? -ne 0 ]; then
     echo "Fail to ./install"
     exit 1 ;
@@ -104,16 +114,24 @@ fi
 
 # Check of we have to do a make install
 if [ x"$comp_type" == x"full" ]; then
+    mv $HOME/openfpm_vars $HOME/openfpm_vars_$branch
     make install
+    if [ x"$?" != x"0" ]; then
+        exit 1
+    fi
 else
     echo "Make install partial"
-    mv $HOME/openfpm_vars $HOME/openfpm_vars_$branch
+    if [ x"$comp_type" == x"intel" ]; then
+        mv $HOME/openfpm_vars $HOME/openfpm_vars_intel
+    else
+        mv $HOME/openfpm_vars $HOME/openfpm_vars_$branch
+    fi
     source $HOME/openfpm_vars_$branch
     if [ x"$hostname" == x"suitcase" ]; then
       echo "Running make on 1 cores"
       make VERBOSE=1 -j 1
     else
-      make VERBOSE=1 -j 8
+      nice -n 19 make VERBOSE=1 -j 8
     fi
 fi
 
diff --git a/cmake_modules/FindMetis.cmake b/cmake_modules/FindMETIS.cmake
similarity index 100%
rename from cmake_modules/FindMetis.cmake
rename to cmake_modules/FindMETIS.cmake
diff --git a/cmake_modules/FindParMetis.cmake b/cmake_modules/FindParMETIS.cmake
similarity index 100%
rename from cmake_modules/FindParMetis.cmake
rename to cmake_modules/FindParMETIS.cmake
diff --git a/configure b/configure
index 14c562cfe2797169944c0825802751cda6b033ca..7853c20d07d671ea69ae78490bebac5dea937ed3 100755
--- a/configure
+++ b/configure
@@ -100,6 +100,7 @@ enable_debug
 with_metis
 with_hdf5
 with_libhilbert
+enable_cuda_on_cpu
 enable_scan_coverty
 enable_test_performance
 enable_test_coverage
@@ -107,6 +108,7 @@ with_parmetis
 enable_se_class1
 enable_se_class2
 enable_se_class3
+with_alpaka
 with_action_on_error
 with_boost
 with_boost_libdir
@@ -120,7 +122,10 @@ with_petsc
 with_eigen
 with_vcdevel
 enable_gpu
+enable_hip
 enable_asan
+enable_garbageinj
+enable_garbageinjv
 '
 
 rm -rf build
@@ -237,20 +242,34 @@ do
     scan_coverty)
        conf_options="$conf_options -DSCAN_COVERTY=ON"
        ;;
+    cuda_on_cpu)
+       conf_options="$conf_options -DCUDA_ON_CPU=ON"
+       ;;
     test_performance)
        conf_options="$conf_options -DTEST_PERFORMANCE=ON"
        ;;
     gpu)
-        if [ x"$CXX" == x"" ]; then
+	conf_options="$conf_options -DCMAKE_CUDA_HOST_COMPILER=$(which mpic++) "
+        if [ x"$CXXCUDA" == x"" ]; then
                 conf_options="$conf_options"
         else
-		conf_options="$conf_options -DCMAKE_CUDA_HOST_COMPILER=$(which $CXX)"
+                conf_options="$conf_options -DCMAKE_CUDA_COMPILER=$(which $CXXCUDA)"
         fi
        conf_options="$conf_options -DENABLE_GPU=ON"
        ;;
+    hip)
+       conf_options="$conf_options -DHIP_ENABLE=ON -DENABLE_GPU=ON"
+       enable_hip_conf=1
+       ;;
     asan)
        conf_options="$conf_options -DENABLE_ASAN=ON"
        ;;
+    garbageinj)
+       conf_options="$conf_options -DENABLE_GARBAGE_INJECTOR=ON"
+       ;;
+    garbageinjv)
+       conf_options="$conf_options -DENABLE_VCLUSTER_GARBAGE_INJECTOR=ON"
+       ;;
     *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--enable-$ac_useropt_orig"
        ac_unrecognized_sep=', '
        ;;
@@ -493,6 +512,9 @@ do
       vcdevel)
       conf_options="$conf_options -DVc_ROOT=$ac_optarg"
       ;;
+      alpaka)
+      conf_options="$conf_options -DALPAKA_ROOT=$ac_optarg"
+      ;;
       *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--with-$ac_useropt_orig"
 	 ac_unrecognized_sep=', ';;
     esac
@@ -564,7 +586,12 @@ fi
 cd build
 
 ## remove enerything
+if [ x"$enable_hip_conf" != x"1" ]; then
+	conf_options="$conf_options -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpic++"
+fi
+
 echo "Calling cmake ../. $conf_options"
+printf "cmake ../. $conf_options" > cmake_build_options 
 rm ../error_code
 DYLD_LIBRARY_PATH=$ld_lib_pathopt cmake ../. $conf_options
 if [ $? != 0 ]; then
@@ -593,6 +620,7 @@ clean:
 
 install:
 	\$(MAKE) -C build \$@
+	script/install_parallel_debugger
 
 pdata:
 	\$(MAKE) -C build \$@
diff --git a/example/Grid/0_simple/Makefile b/example/Grid/0_simple/Makefile
index 26bd7de19ab527d116d6ee67307b07d016111c44..3310e13cd60792dacaa16edd0721e562db1fb3d4 100644
--- a/example/Grid/0_simple/Makefile
+++ b/example/Grid/0_simple/Makefile
@@ -7,7 +7,7 @@ LDIR =
 OBJ = main.o
 
 %.o: %.cpp
-	$(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 grid: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
@@ -15,7 +15,7 @@ grid: $(OBJ)
 all: grid
 
 run: all
-	mpirun -np 2 ./grid
+	mpirun --oversubscribe -np 2 ./grid
 
 .PHONY: clean all run
 
diff --git a/example/Grid/1_stencil/Makefile b/example/Grid/1_stencil/Makefile
index bb7b0ec2e27f77da6ea3ebccae9f34147e128bf4..94a73917d47e6e112f60c4b6e549929ccaf26e3c 100644
--- a/example/Grid/1_stencil/Makefile
+++ b/example/Grid/1_stencil/Makefile
@@ -7,7 +7,7 @@ LDIR =
 OBJ = main.o
 
 %.o: %.cpp
-	$(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 stencil: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
@@ -15,7 +15,7 @@ stencil: $(OBJ)
 all: stencil
 
 run: all
-	mpirun -np 3 ./stencil
+	mpirun --oversubscribe -np 3 ./stencil
 
 .PHONY: clean all run
 
diff --git a/example/Grid/2_solve_eq/Makefile b/example/Grid/2_solve_eq/Makefile
index 9f4b7160aaedb1c193c95ca1d4bdf594176e6a97..cab7ab23bacccffa070046aa36d775ec8c0152a4 100644
--- a/example/Grid/2_solve_eq/Makefile
+++ b/example/Grid/2_solve_eq/Makefile
@@ -7,7 +7,7 @@ LDIR =
 OBJ = main.o
 
 %.o: %.cpp
-	$(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 periodic: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
@@ -15,7 +15,7 @@ periodic: $(OBJ)
 all: periodic
 
 run: all
-	mpirun -np 4 ./periodic
+	mpirun --oversubscribe -np 4 ./periodic
 
 .PHONY: clean all run
 
diff --git a/example/Grid/3_gray_scott/Makefile b/example/Grid/3_gray_scott/Makefile
index 170e428569ba200362dfa7bbc4a5d533f4006696..35f9eb4aff157ab02d66c25c171bf79dc440e209 100644
--- a/example/Grid/3_gray_scott/Makefile
+++ b/example/Grid/3_gray_scott/Makefile
@@ -7,7 +7,7 @@ LDIR =
 OBJ = main.o
 
 %.o: %.cpp
-	$(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 gray_scott: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
@@ -15,7 +15,7 @@ gray_scott: $(OBJ)
 all: gray_scott
 
 run: all
-	mpirun -np 4 ./gray_scott
+	mpirun --oversubscribe -np 4 ./gray_scott
 
 .PHONY: clean all run
 
diff --git a/example/Grid/3_gray_scott_3d/Makefile b/example/Grid/3_gray_scott_3d/Makefile
index 04db7e20fa78c9a519f254e71d61f27e47df69f8..d27878771713269caf29430989fdd7d1fcd9510b 100644
--- a/example/Grid/3_gray_scott_3d/Makefile
+++ b/example/Grid/3_gray_scott_3d/Makefile
@@ -7,7 +7,7 @@ LDIR =
 OBJ = main.o
 
 %.o: %.cpp
-	$(CC) -O3 -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 gray_scott: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
@@ -15,7 +15,7 @@ gray_scott: $(OBJ)
 all: gray_scott
 
 run: all
-	mpirun -np 4 ./gray_scott
+	mpirun --oversubscribe -np 4 ./gray_scott
 
 .PHONY: clean all run
 
diff --git a/example/Grid/3_gray_scott_3d/main.cpp b/example/Grid/3_gray_scott_3d/main.cpp
index a72029fc694b2ae24f3b560e45c1a78d6e305b40..82ea1fd77b2cde05b6954fc8e63d608791104a02 100644
--- a/example/Grid/3_gray_scott_3d/main.cpp
+++ b/example/Grid/3_gray_scott_3d/main.cpp
@@ -230,7 +230,7 @@ int main(int argc, char* argv[])
 		// visualization
 		if (i % 500 == 0)
 		{
-			Old.save("output_" + std::to_string(count));
+//			Old.save("output_" + std::to_string(count));
 			count++;
 		}
 	}
diff --git a/example/Grid/3_gray_scott_3d_vectorization/Makefile b/example/Grid/3_gray_scott_3d_vectorization/Makefile
index 4993b0c8aed55bfb12cfa516360601b7d1ca1ade..525a4be088c34ac37e0a8f85840b9d86cd46fc39 100644
--- a/example/Grid/3_gray_scott_3d_vectorization/Makefile
+++ b/example/Grid/3_gray_scott_3d_vectorization/Makefile
@@ -10,7 +10,7 @@ OBJ = main.o update_new.o
 	mpif90 -ffree-line-length-none -fno-range-check -fno-second-underscore  -fimplicit-none  -mavx -O3 -c -g -o $@ $<
 
 %.o: %.cpp
-	$(CC) -O3 -mavx  -g -c --std=c++11 -Wno-ignored-attributes  -o  $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -mavx  -g -c --std=c++14 -Wno-ignored-attributes  -o  $@ $< $(INCLUDE_PATH)
 
 gray_scott: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
@@ -18,7 +18,7 @@ gray_scott: $(OBJ)
 all: gray_scott
 
 run: all
-	mpirun -np 4 ./gray_scott
+	mpirun --oversubscribe -np 4 ./gray_scott
 
 .PHONY: clean all run
 
diff --git a/example/Numerics/PS-CMA-ES/Makefile b/example/Numerics/PS-CMA-ES/Makefile
index 73ca664f745ea72ea51936a457c3afc17a572bb0..9771a523c8c76fc374491bcf899cb8c2b2d1e47d 100644
--- a/example/Numerics/PS-CMA-ES/Makefile
+++ b/example/Numerics/PS-CMA-ES/Makefile
@@ -7,7 +7,7 @@ LDIR =
 OBJ = main.o
 
 %.o: %.cpp
-	$(CC) -I/usr/local/cuda/include  -O3 -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -I/usr/local/cuda/include  -O3 -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 ps_cma_es: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
@@ -15,7 +15,7 @@ ps_cma_es: $(OBJ)
 all: ps_cma_es
 
 run: all
-	mpirun -np 2 ./ps_cma_es
+	mpirun --oversubscribe  -np 2 ./ps_cma_es
 
 .PHONY: clean all run
 
diff --git a/example/Numerics/PSE/0_Derivative_approx_1D/Makefile b/example/Numerics/PSE/0_Derivative_approx_1D/Makefile
index da76a97b081f678675d3c9ca416664098dcdd671..a0931408a15e36353f4d6e078c4ae0188eeb672d 100644
--- a/example/Numerics/PSE/0_Derivative_approx_1D/Makefile
+++ b/example/Numerics/PSE/0_Derivative_approx_1D/Makefile
@@ -7,7 +7,7 @@ LDIR =
 OBJ = main.o
 
 %.o: %.cpp
-	$(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 pse_1d: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
diff --git a/example/Numerics/PSE/1_Derivative_approx_1D_mp/Makefile b/example/Numerics/PSE/1_Derivative_approx_1D_mp/Makefile
index a89794a539aa80a4703f3d8a138b3eff72242635..b71fe61f2a212a0eaaee58c17be6abb23467dac2 100644
--- a/example/Numerics/PSE/1_Derivative_approx_1D_mp/Makefile
+++ b/example/Numerics/PSE/1_Derivative_approx_1D_mp/Makefile
@@ -10,7 +10,7 @@ LDIR =
 OBJ_128 = main_float128.o
 
 %.o: %.cpp
-	$(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 # pse_1d_128: $(OBJ_128)
 # 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) -lquadmath
diff --git a/example/Numerics/PSE/1_Diffusion_1D/Makefile b/example/Numerics/PSE/1_Diffusion_1D/Makefile
index 502affc0c3379bdd1e290d47424388bf2aa333ba..a1485efa1ab9e9729f9486c0444075fc9c60cd2c 100644
--- a/example/Numerics/PSE/1_Diffusion_1D/Makefile
+++ b/example/Numerics/PSE/1_Diffusion_1D/Makefile
@@ -7,7 +7,7 @@ LDIR =
 OBJ = main.o
 
 %.o: %.cpp
-	$(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 diff_1d: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
@@ -15,7 +15,7 @@ diff_1d: $(OBJ)
 all: diff_1d
 
 run: all
-	mpirun -np 4 ./diff_1d
+	mpirun --oversubscribe -np 4 ./diff_1d
 
 .PHONY: clean all
 
diff --git a/example/Numerics/Stoke_flow/0_2D_incompressible/Makefile b/example/Numerics/Stoke_flow/0_2D_incompressible/Makefile
index 1378bd05620783aa79a98e4e354aa19131c86921..7fd1a1819764a5cef54dbd74b751e6fd6b7829d3 100644
--- a/example/Numerics/Stoke_flow/0_2D_incompressible/Makefile
+++ b/example/Numerics/Stoke_flow/0_2D_incompressible/Makefile
@@ -8,7 +8,7 @@ OBJ_EIGEN = main_eigen.o
 OBJ_PETSC = main_petsc.o
 
 %.o: %.cpp
-	$(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 all: stokes_2d_eigen stokes_2d_petsc
 
@@ -19,7 +19,7 @@ stokes_2d_petsc: $(OBJ_PETSC)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
 
 run: all
-	mpirun -np 3 ./stokes_2d_eigen && mpirun -np 3 ./stokes_2d_petsc
+	mpirun --oversubscribe -np 3 ./stokes_2d_eigen && mpirun --oversubscribe -np 3 ./stokes_2d_petsc
 	
 .PHONY: clean all
 
diff --git a/example/Numerics/Stoke_flow/1_3D_incompressible/Makefile b/example/Numerics/Stoke_flow/1_3D_incompressible/Makefile
index 5186ac565a358b610c6ed1df82c9dd83fb310bd4..0b6058ade2e92cd24fd22fcf7fdec6c3fa528d4e 100644
--- a/example/Numerics/Stoke_flow/1_3D_incompressible/Makefile
+++ b/example/Numerics/Stoke_flow/1_3D_incompressible/Makefile
@@ -8,7 +8,7 @@ OBJ_EIGEN = main_eigen.o
 OBJ_PETSC = main_petsc.o
 
 %.o: %.cpp
-	$(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 all: stokes_3d_eigen stokes_3d_petsc
 
@@ -19,7 +19,7 @@ stokes_3d_petsc: $(OBJ_PETSC)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
 
 run: all
-	mpirun -np 3 ./stokes_3d_eigen && mpirun -np 3 ./stokes_3d_petsc
+	mpirun --oversubscribe -np 3 ./stokes_3d_eigen && mpirun --oversubscribe -np 3 ./stokes_3d_petsc
 	
 .PHONY: clean all run
 
diff --git a/example/Numerics/Sussman_redistancing/example_sussman_circle/Makefile b/example/Numerics/Sussman_redistancing/example_sussman_circle/Makefile
index 495aa0d896a024280e1d64e7bff9eb0681ba0c6e..78f653991dbbe9895941ed52ff7dc9b1e87fcfb0 100644
--- a/example/Numerics/Sussman_redistancing/example_sussman_circle/Makefile
+++ b/example/Numerics/Sussman_redistancing/example_sussman_circle/Makefile
@@ -7,7 +7,7 @@ LDIR =
 OBJ = main.o
 
 %.o: %.cpp
-	$(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 example_sussman_circle: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
@@ -15,7 +15,7 @@ example_sussman_circle: $(OBJ)
 all: example_sussman_circle
 
 run: all
-	mpirun -np 2 ./example_sussman_circle
+	mpirun --oversubscribe -np 2 ./example_sussman_circle
 
 .PHONY: clean all run
 
diff --git a/example/Numerics/Sussman_redistancing/example_sussman_circle/main.cpp b/example/Numerics/Sussman_redistancing/example_sussman_circle/main.cpp
index 5ae63429601373b5f6888592f0187fd08fe999e4..b19546737a5117277bb1417dd2de5128db88093e 100644
--- a/example/Numerics/Sussman_redistancing/example_sussman_circle/main.cpp
+++ b/example/Numerics/Sussman_redistancing/example_sussman_circle/main.cpp
@@ -76,7 +76,6 @@
  * Once we have received the Phi_SDF from the redistancing, particles can be placed on narrow band around the interface.
  *
  * * Creates filled 2D circle with -1/+1 indicator function
- * * Runs gaussian filter if gradient at interface too steep or if user sets redist_options.sigma >= 1
  * * Runs Sussman redistancing (see @ref RedistancingSussman.hpp)
  * * Places particles on narrow band around interface
  *
@@ -226,10 +225,6 @@ int main(int argc, char* argv[])
 	 * For the redistancing, we can choose some options. These options will then be passed bundled as a structure to
 	 * the redistancing function. Setting these options is optional, since they all have a Default value as well. In
 	 * particular the following options can be set by the user:
-	 * * \p sigma: Sigma of the gaussian kernel, which is used for gaussian smooting Phi_0. If the initial gradient of
-	 *             phi_0 at the interface is too large and no sigma is chosen or chosen too small, gauss smoothing will
-	 *             automatically be applied until phi gradient magnitude <= 12, regardless of which sigma is chosen by
-	 *             the user. Default = 0.
 	 * * \p min_iter: Minimum number of iterations before steady state in narrow band will be checked (Default: 100).
 	 * * \p max_iter: Maximum number of iterations you want to run the redistancing, even if steady state might not yet
 	 *                have been reached (Default: 1e6).
@@ -260,7 +255,6 @@ int main(int argc, char* argv[])
 	// Now we want to convert the initial Phi into a signed distance function (SDF) with magnitude of gradient = 1.
 	// For the initial re-distancing we use the Sussman method. First of all, we can set some redistancing options.
 	Redist_options redist_options;
-	redist_options.sigma                                = 0;
 	redist_options.min_iter                             = 100;
 	redist_options.max_iter                             = 10000;
 	
@@ -330,7 +324,8 @@ int main(int argc, char* argv[])
 	// the magnitude of the gradient
 	typedef aggregate<double, double[grid_dim], double> props_nb;
 	typedef vector_dist<grid_dim, double, props_nb> vd_type;
-	vd_type vd_narrow_band(0, box, bc, ghost);
+	Ghost<grid_dim, double> ghost_vd(0);
+	vd_type vd_narrow_band(0, box, bc, ghost_vd);
 	vd_narrow_band.setPropNames({"Phi_SDF", "Phi_grad", "Phi_magnOfGrad"});
 	//! @cond [Initialize narrow band] @endcond
 	
diff --git a/example/Numerics/Sussman_redistancing/example_sussman_images_2D/Makefile b/example/Numerics/Sussman_redistancing/example_sussman_images_2D/Makefile
index 54c26c019feace480d5d52e1b9c0ab6c45a9b0b2..4ad44cdf7ade19319cdd041820ea386d5a9a2590 100644
--- a/example/Numerics/Sussman_redistancing/example_sussman_images_2D/Makefile
+++ b/example/Numerics/Sussman_redistancing/example_sussman_images_2D/Makefile
@@ -7,7 +7,7 @@ LDIR =
 OBJ = main.o
 
 %.o: %.cpp
-	$(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 example_sussman_images: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
@@ -15,7 +15,7 @@ example_sussman_images: $(OBJ)
 all: example_sussman_images
 
 run: all
-	mpirun -np 2 ./example_sussman_images
+	mpirun --oversubscribe -np 2 ./example_sussman_images
 
 .PHONY: clean all run
 
diff --git a/example/Numerics/Sussman_redistancing/example_sussman_images_2D/main.cpp b/example/Numerics/Sussman_redistancing/example_sussman_images_2D/main.cpp
index 18574a4b329457ebfa2d43352b6e392f57a7e04e..70668cee7273fe42abc0272232813a798dd1e025 100644
--- a/example/Numerics/Sussman_redistancing/example_sussman_images_2D/main.cpp
+++ b/example/Numerics/Sussman_redistancing/example_sussman_images_2D/main.cpp
@@ -197,7 +197,6 @@ int main(int argc, char* argv[])
 	// For the initial re-distancing we use the Sussman method
 	// 1.) Set some redistancing options
 	Redist_options redist_options;
-	redist_options.sigma                                = 0;        // if the initial gradient of phi at the interface is too large, gauss smoothing will automatically be applied until phi gradient magnitude <= 12, regardless of the given sigma in the main
 	redist_options.min_iter                             = 100;      // min. number of iterations before steady state in narrow band will be checked (default: 100)
 	redist_options.max_iter                             = 10000;    // max. number of iterations you want to run the
 																	// redistancing, even if steady state might not yet have been reached (default: 1e6)
@@ -229,7 +228,8 @@ int main(int argc, char* argv[])
 	// the magnitude of the gradient
 	typedef aggregate<double, double[grid_dim], double> props_nb;
 	typedef vector_dist<grid_dim, double, props_nb> vd_type;
-	vd_type vd_narrow_band(0, box, bc, ghost);
+        Ghost<grid_dim, double> ghost_vd(0);
+        vd_type vd_narrow_band(0, box, bc, ghost_vd);
 	vd_narrow_band.setPropNames({"Phi_SDF", "Phi_grad", "Phi_magnOfGrad"});
 	
 	NarrowBand<grid_in_type> narrowBand(g_dist, redist_options.width_NB_in_grid_points); // Instantiation of NarrowBand class
diff --git a/example/Numerics/Sussman_redistancing/example_sussman_images_3D/Makefile b/example/Numerics/Sussman_redistancing/example_sussman_images_3D/Makefile
index 54c26c019feace480d5d52e1b9c0ab6c45a9b0b2..4ad44cdf7ade19319cdd041820ea386d5a9a2590 100644
--- a/example/Numerics/Sussman_redistancing/example_sussman_images_3D/Makefile
+++ b/example/Numerics/Sussman_redistancing/example_sussman_images_3D/Makefile
@@ -7,7 +7,7 @@ LDIR =
 OBJ = main.o
 
 %.o: %.cpp
-	$(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 example_sussman_images: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
@@ -15,7 +15,7 @@ example_sussman_images: $(OBJ)
 all: example_sussman_images
 
 run: all
-	mpirun -np 2 ./example_sussman_images
+	mpirun --oversubscribe -np 2 ./example_sussman_images
 
 .PHONY: clean all run
 
diff --git a/example/Numerics/Sussman_redistancing/example_sussman_images_3D/main.cpp b/example/Numerics/Sussman_redistancing/example_sussman_images_3D/main.cpp
index 7e229fb9513b4086da19f6124c240aba75de44fb..fcd1cbdd1aefa4fab0c24cdcb76b14b382948364 100644
--- a/example/Numerics/Sussman_redistancing/example_sussman_images_3D/main.cpp
+++ b/example/Numerics/Sussman_redistancing/example_sussman_images_3D/main.cpp
@@ -191,7 +191,6 @@ int main(int argc, char* argv[])
 	// For the initial re-distancing we use the Sussman method
 	// 1.) Set some redistancing options
 	Redist_options redist_options;
-	redist_options.sigma                                = 0;        // if the initial gradient of phi at the interface is too large, gauss smoothing will automatically be applied until phi gradient magnitude <= 12, regardless of the given sigma in the main
 	redist_options.min_iter                             = 100;      // min. number of iterations before steady state in narrow band will be checked (default: 100)
 	redist_options.max_iter                             = 10000;    // max. number of iterations you want to run the
 															        // redistancing, even if steady state might not yet
@@ -225,8 +224,9 @@ int main(int argc, char* argv[])
 	// the magnitude of the gradient
 	typedef aggregate<double, double[grid_dim], double> props_nb;
 	typedef vector_dist<grid_dim, double, props_nb> vd_type;
-	vd_type vd_narrow_band(0, box, bc, ghost);
-	vd_narrow_band.setPropNames({"Phi_SDF", "Phi_grad", "Phi_magnOfGrad"});
+        Ghost<grid_dim, double> ghost_vd(0);
+        vd_type vd_narrow_band(0, box, bc, ghost_vd);	
+        vd_narrow_band.setPropNames({"Phi_SDF", "Phi_grad", "Phi_magnOfGrad"});
 	
 	NarrowBand<grid_in_type> narrowBand(g_dist, redist_options.width_NB_in_grid_points); // Instantiation of NarrowBand class
 	
diff --git a/example/Numerics/Sussman_redistancing/example_sussman_sphere/Makefile b/example/Numerics/Sussman_redistancing/example_sussman_sphere/Makefile
index 714fdcb07f0f4ba0c2b68c3afa616f7637b5846f..eb52eba53cd7592ebc8cc29eb5d6a2b0591e50b4 100644
--- a/example/Numerics/Sussman_redistancing/example_sussman_sphere/Makefile
+++ b/example/Numerics/Sussman_redistancing/example_sussman_sphere/Makefile
@@ -7,7 +7,7 @@ LDIR =
 OBJ = main.o
 
 %.o: %.cpp
-	$(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 example_sussman_sphere: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
@@ -15,7 +15,7 @@ example_sussman_sphere: $(OBJ)
 all: example_sussman_sphere
 
 run: all
-	mpirun -np 2 ./example_sussman_sphere
+	mpirun --oversubscribe -np 2 ./example_sussman_sphere
 
 .PHONY: clean all run
 
diff --git a/example/Numerics/Sussman_redistancing/example_sussman_sphere/main.cpp b/example/Numerics/Sussman_redistancing/example_sussman_sphere/main.cpp
index c3286722a15f8f24667e3595029190922e55b308..392f9dc36abd5b3e5d690bd90ff40186d7381c61 100644
--- a/example/Numerics/Sussman_redistancing/example_sussman_sphere/main.cpp
+++ b/example/Numerics/Sussman_redistancing/example_sussman_sphere/main.cpp
@@ -35,7 +35,6 @@
  * Once we have received the Phi_SDF from the redistancing, particles can be placed on narrow band around the interface.
  *
  * * Creates filled 3D sphere with -1/+1 indicator function
- * * Runs gaussian filter if gradient at interface too steep or if user sets redist_options.sigma >= 1
  * * Runs Sussman redistancing (see @ref RedistancingSussman.hpp)
  * * Places particles on narrow band around interface
  *
@@ -187,10 +186,6 @@ int main(int argc, char* argv[])
 	 * For the redistancing, we can choose some options. These options will then be passed bundled as a structure to
 	 * the redistancing function. Setting these options is optional, since they all have a Default value as well. In
 	 * particular the following options can be set by the user:
-	 * * \p sigma: Sigma of the gaussian kernel, which is used for gaussian smooting Phi_0. If the initial gradient of
-	 *             phi_0 at the interface is too large and no sigma is chosen or chosen too small, gauss smoothing will
-	 *             automatically be applied until phi gradient magnitude <= 12, regardless of which sigma is chosen by
-	 *             the user. Default = 0.
 	 * * \p min_iter: Minimum number of iterations before steady state in narrow band will be checked (Default: 100).
 	 * * \p max_iter: Maximum number of iterations you want to run the redistancing, even if steady state might not yet
 	 *                have been reached (Default: 1e6).
@@ -221,9 +216,8 @@ int main(int argc, char* argv[])
 	// Now we want to convert the initial Phi into a signed distance function (SDF) with magnitude of gradient = 1.
 	// For the initial re-distancing we use the Sussman method. First of all, we can set some redistancing options.
 	Redist_options redist_options;
-	redist_options.sigma                                = 0;
 	redist_options.min_iter                             = 100;
-	redist_options.max_iter                             = 10000;
+	redist_options.max_iter                             = 1000;
 	
 	redist_options.convTolChange.value                  = 1e-12;
 	redist_options.convTolChange.check                  = true;
@@ -290,7 +284,8 @@ int main(int argc, char* argv[])
 	// the magnitude of the gradient
 	typedef aggregate<double, double[grid_dim], double> props_nb;
 	typedef vector_dist<grid_dim, double, props_nb> vd_type;
-	vd_type vd_narrow_band(0, box, bc, ghost);
+        Ghost<grid_dim, double> ghost_vd(0);
+        vd_type vd_narrow_band(0, box, bc, ghost_vd);
 	vd_narrow_band.setPropNames({"Phi_SDF", "Phi_grad", "Phi_magnOfGrad"});
 	//! @cond [Initialize narrow band] @endcond
 	
diff --git a/example/Numerics/Vortex_in_cell/Makefile b/example/Numerics/Vortex_in_cell/Makefile
index 0bebd2fc1a1d03617988ffba29ffec48ef2f0255..109cf30c80190734366c589cdfa43b78a48e239b 100644
--- a/example/Numerics/Vortex_in_cell/Makefile
+++ b/example/Numerics/Vortex_in_cell/Makefile
@@ -14,7 +14,7 @@ vic_petsc_test: OPT += -DTEST_RUN
 vic_petsc_test: vic_petsc vic_petsc_opt
 
 %.o: %.cpp
-	$(CC) -O3 $(OPT) -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 vic_petsc_opt: $(OBJ_VIC_PETSC_OPT)
 	$(CC) -o $@ $^ $(LIBS_PATH) $(LIBS)
@@ -23,7 +23,7 @@ vic_petsc: $(OBJ_VIC_PETSC)
 	$(CC) -o $@ $^ $(LIBS_PATH) $(LIBS)
 	
 run: vic_petsc_test
-	mpirun -np 4 ./vic_petsc && mpirun -np 4 ./vic_petsc_opt
+	mpirun --oversubscribe -np 4 ./vic_petsc && mpirun --oversubscribe -np 4 ./vic_petsc_opt
 	
 .PHONY: clean all
 
diff --git a/example/Numerics/Vortex_in_cell/main_vic_petsc_opt.cpp b/example/Numerics/Vortex_in_cell/main_vic_petsc_opt.cpp
index 5c4d29a81579e50baa687071f2d78f281ba18cbd..0eb8c9d43ab591f2ce1eee4304ed1090f279edc5 100644
--- a/example/Numerics/Vortex_in_cell/main_vic_petsc_opt.cpp
+++ b/example/Numerics/Vortex_in_cell/main_vic_petsc_opt.cpp
@@ -505,9 +505,9 @@ template<typename grid> void calc_rhs(grid & g_vort, grid & g_vel, grid & g_dwp)
 
 	// calculate several pre-factors for the stencil finite
 	// difference
-	float fac1 = 1.0f*nu/(g_vort.spacing(0)*g_vort.spacing(0));
-	float fac2 = 1.0f*nu/(g_vort.spacing(1)*g_vort.spacing(1));
-	float fac3 = 1.0f*nu/(g_vort.spacing(2)*g_vort.spacing(2));
+	float fac1 = 2.0f*nu/(g_vort.spacing(0)*g_vort.spacing(0));
+	float fac2 = 2.0f*nu/(g_vort.spacing(1)*g_vort.spacing(1));
+	float fac3 = 2.0f*nu/(g_vort.spacing(2)*g_vort.spacing(2));
 
 	float fac4 = 0.5f/(g_vort.spacing(0));
 	float fac5 = 0.5f/(g_vort.spacing(1));
diff --git a/example/Plot/0_simple_graph/Makefile b/example/Plot/0_simple_graph/Makefile
index e2885ae2a21a84bd8bb036284446869a47318c85..e8f9e113e526ff1b5110d2551599850a6c6b6417 100644
--- a/example/Plot/0_simple_graph/Makefile
+++ b/example/Plot/0_simple_graph/Makefile
@@ -7,7 +7,7 @@ LDIR =
 OBJ = main.o
 
 %.o: %.cpp
-	$(CC) -O3 -g3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -g3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 plot: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse/Makefile b/example/SparseGrid/1_gray_scott_3d_sparse/Makefile
index 192287e8e9fad169168e1b4d50d20ea0e1c66ede..f1b36f4371e58cf581469ab6a349aa96f2a45b23 100644
--- a/example/SparseGrid/1_gray_scott_3d_sparse/Makefile
+++ b/example/SparseGrid/1_gray_scott_3d_sparse/Makefile
@@ -9,7 +9,7 @@ gray_scott_sparse_test: OPT += -DTEST_RUN
 gray_scott_sparse_test: gray_scott_sparse
 
 %.o: %.cpp
-	$(CC) -O3 -g $(OPT) -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -g $(OPT) -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 gray_scott_sparse: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
@@ -17,7 +17,7 @@ gray_scott_sparse: $(OBJ)
 all: gray_scott_sparse
 
 run: gray_scott_sparse_test
-	mpirun -np 4 ./gray_scott_sparse
+	mpirun --oversubscribe -np 4 ./gray_scott_sparse
 
 .PHONY: clean all run
 
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse/main.cpp b/example/SparseGrid/1_gray_scott_3d_sparse/main.cpp
index 872edcb83abb8fb30a11bde967f3436f6d0585ac..f26892a96a14a064139c9ef7fcae07d755c5f29a 100644
--- a/example/SparseGrid/1_gray_scott_3d_sparse/main.cpp
+++ b/example/SparseGrid/1_gray_scott_3d_sparse/main.cpp
@@ -1,9 +1,10 @@
  /*! \page SparseGrid SparseGrid
  *
  * \subpage Grid_3_gs_3D_sparse
- * \subpage Grid_3_gs_3D_sparse_cs
- * \subpage Grid_3_gs_3D_sparse_opt
  * \subpage Grid_3_gs_3D_sparse_gpu
+ * \subpage Grid_3_gs_3D_sparse_opt
+ * \subpage Grid_3_gs_3D_sparse_gpu_opt
+ * \subpage Grid_3_gs_3D_sparse_cs
  * \subpage Grid_3_gs_3D_sparse_gpu_cs
  *
  */
@@ -14,14 +15,13 @@
 
 /*!
  *
- * \page Grid_3_gs_3D_sparse Gray Scott in 3D using sparse grids
+ * \page Grid_3_gs_3D_sparse Gray Scott in\subpage Grid_3_gs_3D_sparse_opt 3D using sparse grids
  *
  * [TOC]
  *
- * # Solving a gray scott-system in 3D using Sparse grids# {#e3_gs_gray_scott_sparse}
+ * # Solving a gray scott-system in 3D using Sparse grids on GPU optimized # {#e3_gs_gray_scott_sparse_gpu_opt}
  *
- * This example show how to solve a Gray-Scott system in 3D using sparse grids in this case we well use a more
- * complex geometry
+ * This example show how to solve a Gray-Scott system in 3D using sparse grids on gpu (optimized). The problem is the same as \ref Grid_3_gs_3D
  *
  * In figure is the final solution of the problem
  *
@@ -34,16 +34,7 @@
  * \see \ref Grid_3_gs_3D
  *
  *
- * We recall here the main differences between sparse and dense.
- *
- * * **get** function return now constant values, so cannot be used to get values, a get in write is an insert
- *   a get on a point position that has not been inserted return the background value
- *
- * * **insert** function create/overwrite the points value
- *
- * * **getDomainIterator** return an iterator on the existing points
- *
- * * **getGridIterator** return an iterator on the dense version of the grid
+ * 
  *
  *
  * 
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_gpu/Makefile b/example/SparseGrid/1_gray_scott_3d_sparse_gpu/Makefile
index c0c9bee799cd7ff6adb7939c66f76eb09e6495ae..d815f8d56145b1bbe9bb064359549e4943477684 100644
--- a/example/SparseGrid/1_gray_scott_3d_sparse_gpu/Makefile
+++ b/example/SparseGrid/1_gray_scott_3d_sparse_gpu/Makefile
@@ -3,15 +3,27 @@ include ../../example.mk
 ### internally the example disable with the preprocessor its code if not compiled with nvcc 
 CUDA_CC=
 CUDA_CC_LINK=
-ifeq (, $(shell which nvcc))
+ifdef CUDA_ON_CPU
         CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
         INCLUDE_PATH_NVCC=
         CUDA_CC_LINK=mpic++
+        CUDA_OPTIONS=-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000
+        LIBS_SELECT=$(LIBS_CUDA_ON_CPU)
 else
-        CUDA_CC=nvcc -ccbin=mpic++
-        CUDA_CC_LINK=nvcc -ccbin=mpic++
+        ifeq (, $(shell which nvcc))
+                CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+                INCLUDE_PATH_NVCC=
+                CUDA_CC_LINK=mpic++
+                CUDA_OPTIONS=
+        else
+                CUDA_CC=nvcc -ccbin=mpic++
+                CUDA_CC_LINK=nvcc -ccbin=mpic++
+                CUDA_OPTIONS=-use_fast_math  -arch=sm_61 -lineinfo
+        endif
+        LIBS_SELECT=$(LIBS)
 endif
 
+
 gray_scott_sparse_gpu_test: OPT += -DTEST_RUN
 gray_scott_sparse_gpu_test: gray_scott_sparse_gpu
 
@@ -25,12 +37,12 @@ OBJ = main.o
 	$(CUDA_CC) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)
 
 gray_scott_sparse_gpu: $(OBJ)
-	$(CUDA_CC_LINK) -o $@ $^ $(LIBS_PATH) $(LIBS)
+	$(CUDA_CC_LINK) -o $@ $^ $(LIBS_PATH) $(LIBS_SELECT)
 
 all: gray_scott_sparse_gpu
 
 run: gray_scott_sparse_gpu_test
-	mpirun -np 4 ./gray_scott_sparse_gpu
+	mpirun --oversubscribe -np 4 ./gray_scott_sparse_gpu
 
 .PHONY: clean all run
 
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu b/example/SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu
index 1ccd9d4a9dfa4692e9b868294fdbe21a07289555..794f5bc0815106c18ae19a2a69034a5fed835bd5 100644
--- a/example/SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu
+++ b/example/SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu
@@ -1,4 +1,3 @@
-#include "util/cuda/cuda_launch.hpp"
 #include "Grid/grid_dist_id.hpp"
 #include "data_type/aggregate.hpp"
 #include "timer.hpp"
@@ -73,8 +72,12 @@ constexpr int x = 0;
 constexpr int y = 1;
 constexpr int z = 2;
 
+//! \cond [grid definition] \endcond
+
 typedef sgrid_dist_id_gpu<3,float,aggregate<float,float,float,float> > SparseGridType;
 
+//! \cond [grid definition] \endcond
+
 void init(SparseGridType & grid, Box<3,float> & domain)
 {
 	//! \cond [create points] \endcond
diff --git a/example/SparseGrid/2_gray_scott_3d_sparse_gpu_opt/Makefile b/example/SparseGrid/2_gray_scott_3d_sparse_gpu_opt/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..7c80e2810f9bc30ccf4df3c880e5d1a58876fca8
--- /dev/null
+++ b/example/SparseGrid/2_gray_scott_3d_sparse_gpu_opt/Makefile
@@ -0,0 +1,52 @@
+include ../../example.mk
+
+### internally the example disable with the preprocessor its code if not compiled with nvcc 
+CUDA_CC=
+CUDA_CC_LINK=
+ifdef CUDA_ON_CPU
+        CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+        INCLUDE_PATH_NVCC=
+        CUDA_CC_LINK=mpic++
+        CUDA_OPTIONS=-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000
+        LIBS_SELECT=$(LIBS_CUDA_ON_CPU)
+else
+        ifeq (, $(shell which nvcc))
+                CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+                INCLUDE_PATH_NVCC=
+                CUDA_CC_LINK=mpic++
+                CUDA_OPTIONS=
+        else
+                CUDA_CC=nvcc -ccbin=mpic++
+                CUDA_CC_LINK=nvcc -ccbin=mpic++
+                CUDA_OPTIONS=-use_fast_math  -arch=sm_61 -lineinfo
+        endif
+        LIBS_SELECT=$(LIBS)
+endif
+
+
+
+gray_scott_sparse_gpu_test: OPT += -DTEST_RUN
+gray_scott_sparse_gpu_test: gray_scott_sparse_gpu
+
+CC=mpic++
+
+LDIR =
+
+OBJ = main.o
+
+%.o: %.cu
+	$(CUDA_CC) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)
+
+gray_scott_sparse_gpu: $(OBJ)
+	$(CUDA_CC_LINK) -o $@ $^ $(LIBS_PATH) $(LIBS_SELECT)
+
+all: gray_scott_sparse_gpu
+
+run: gray_scott_sparse_gpu_test
+	mpirun --oversubscribe -np 4 ./gray_scott_sparse_gpu
+
+.PHONY: clean all run
+
+clean:
+	rm -f *.o *~ core gray_scott_sparse_gpu
+
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_gpu_cs/config.cfg b/example/SparseGrid/2_gray_scott_3d_sparse_gpu_opt/config.cfg
similarity index 100%
rename from example/SparseGrid/1_gray_scott_3d_sparse_gpu_cs/config.cfg
rename to example/SparseGrid/2_gray_scott_3d_sparse_gpu_opt/config.cfg
diff --git a/example/SparseGrid/2_gray_scott_3d_sparse_gpu_opt/main.cu b/example/SparseGrid/2_gray_scott_3d_sparse_gpu_opt/main.cu
new file mode 100644
index 0000000000000000000000000000000000000000..116a76a96fd3279ad56a451a33655a3f2d1400e0
--- /dev/null
+++ b/example/SparseGrid/2_gray_scott_3d_sparse_gpu_opt/main.cu
@@ -0,0 +1,286 @@
+//#define VCLUSTER_PERF_REPORT <------ Activate telemetry for the VCluster data-structure
+//#define SYNC_BEFORE_TAKE_TIME <------ Force synchronization of the kernels everytime we take the time with the structure timer.
+//                                      Use this option for telemetry and GPU otherwise the result are unreliable                                        
+//#define ENABLE_GRID_DIST_ID_PERF_STATS  <------ Activate telementry for the grid data-structure
+
+#include "Decomposition/Distribution/BoxDistribution.hpp"
+#include "Grid/grid_dist_id.hpp"
+#include "data_type/aggregate.hpp"
+#include "timer.hpp"
+
+/*!
+ *
+ * \page Grid_3_gs_3D_sparse_gpu_opt Gray Scott in 3D using sparse grids on GPU (Optimized)
+ *
+ * [TOC]
+ *
+ * # Solving a gray scott-system in 3D using Sparse grids on gpu (Optimized) # {#e3_gs_gray_scott_gpu}
+ *
+ * This example show how to solve a Gray-Scott system in 3D using sparse grids on gpu
+ *
+ * In figure is the final solution of the problem
+ *
+ * \htmlonly
+ * <img src="http://ppmcore.mpi-cbg.de/web/images/examples/gray_scott_3d/gs_alpha.png"/>
+ * \endhtmlonly
+ *
+ * More or less this example is the adaptation of the dense example in 3D
+ *
+ * \see \ref Grid_3_gs_3D
+ *
+ * # Optimizations
+ *
+ * Instead of using the default decomposition algorithm based on parmetis we use BoxDistribution. This decomposition divide the space equally 
+ * across processors. The way to use a different algorithm for decomposing the sparse grid is given by changing the type of the Sparse grid
+ * 
+ * \snippet SparseGrid/2_gray_scott_3d_sparse_gpu_opt/main.cu grid definition
+ *
+ * Because the geometry is fixed we are also using the option SKIP_LABELLING. With this option active after a normal ghost_get we are able to 
+ * activate certain optimization patterns in constructions of the sending buffers and merging data.
+ *
+ */
+
+#ifdef __NVCC__
+
+constexpr int U = 0;
+constexpr int V = 1;
+
+constexpr int U_next = 2;
+constexpr int V_next = 3;
+
+constexpr int x = 0;
+constexpr int y = 1;
+constexpr int z = 2;
+
+//! \cond [grid definition] \endcond
+
+typedef CartDecomposition<3,float, CudaMemory, memory_traits_inte, BoxDistribution<3,float> > Dec;
+
+typedef sgrid_dist_id_gpu<3,float,aggregate<float,float,float,float>,CudaMemory, Dec> SparseGridType;
+
+//! \cond [grid definition] \endcond
+
+void init(SparseGridType & grid, Box<3,float> & domain)
+{
+	//! \cond [create points] \endcond
+
+	typedef typename GetAddBlockType<SparseGridType>::type InsertBlockT;
+
+	grid.addPoints([] __device__ (int i, int j, int k)
+			        {
+						return true;
+			        },
+			        [] __device__ (InsertBlockT & data, int i, int j, int k)
+			        {
+			        	data.template get<U>() = 1.0;
+			        	data.template get<V>() = 0.0;
+			        }
+			        );
+
+
+	grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE);
+
+	//! \cond [create points] \endcond
+
+	long int x_start = grid.size(0)*1.55f/domain.getHigh(0);
+	long int y_start = grid.size(1)*1.55f/domain.getHigh(1);
+	long int z_start = grid.size(1)*1.55f/domain.getHigh(2);
+
+	long int x_stop = grid.size(0)*1.85f/domain.getHigh(0);
+	long int y_stop = grid.size(1)*1.85f/domain.getHigh(1);
+	long int z_stop = grid.size(1)*1.85f/domain.getHigh(2);
+
+	//! \cond [create points sub] \endcond
+
+	grid_key_dx<3> start({x_start,y_start,z_start});
+	grid_key_dx<3> stop ({x_stop,y_stop,z_stop});
+
+        grid.addPoints(start,stop,[] __device__ (int i, int j, int k)
+                                {
+                                                return true;
+                                },
+                                [] __device__ (InsertBlockT & data, int i, int j, int k)
+                                {
+                                        data.template get<U>() = 0.5;
+                                        data.template get<V>() = 0.24;
+                                }
+                                );
+
+	grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE);
+
+	//! \cond [create points sub] \endcond
+}
+
+
+int main(int argc, char* argv[])
+{
+	openfpm_init(&argc,&argv);
+
+	// domain
+	Box<3,float> domain({0.0,0.0,0.0},{2.5,2.5,2.5});
+	
+	// grid size
+        size_t sz[3] = {256,256,256};
+
+	// Define periodicity of the grid
+	periodicity<3> bc = {PERIODIC,PERIODIC,PERIODIC};
+	
+	// Ghost in grid unit
+	Ghost<3,long int> g(1);
+	
+	// deltaT
+	float deltaT = 0.25;
+
+	// Diffusion constant for specie U
+	float du = 2*1e-5;
+
+	// Diffusion constant for specie V
+	float dv = 1*1e-5;
+
+	// Number of timesteps
+#ifdef TEST_RUN
+	size_t timeSteps = 300;
+#else
+        size_t timeSteps = 15000;
+#endif
+
+	// K and F (Physical constant in the equation)
+    float K = 0.053;
+    float F = 0.014;
+
+	SparseGridType grid(sz,domain,g,bc);
+
+	// spacing of the grid on x and y
+	float spacing[3] = {grid.spacing(0),grid.spacing(1),grid.spacing(2)};
+
+	init(grid,domain);
+
+	// sync the ghost
+	grid.template ghost_get<U,V>(RUN_ON_DEVICE);
+
+	// because we assume that spacing[x] == spacing[y] we use formula 2
+	// and we calculate the prefactor of Eq 2
+	float uFactor = deltaT * du/(spacing[x]*spacing[x]);
+	float vFactor = deltaT * dv/(spacing[x]*spacing[x]);
+
+	auto & v_cl = create_vcluster();
+
+	timer tot_sim;
+	tot_sim.start();
+
+	for (size_t i = 0; i < timeSteps ; ++i)
+	{
+		if (v_cl.rank() == 0)
+		{std::cout << "STEP: " << i << std::endl;}
+/*		if (i % 300 == 0)
+		{
+			std::cout << "STEP: " << i << std::endl;
+			grid.write_frame("out",i,VTK_WRITER);
+		}*/
+
+		//! \cond [stencil get and use] \endcond
+
+		typedef typename GetCpBlockType<decltype(grid),0,1>::type CpBlockType;
+
+		//! \cond [lambda] \endcond
+
+		auto func = [uFactor,vFactor,deltaT,F,K] __device__ (float & u_out, float & v_out,
+				                                   CpBlockType & u, CpBlockType & v,
+				                                   int i, int j, int k){
+
+				float uc = u(i,j,k);
+				float vc = v(i,j,k);
+
+				u_out = uc + uFactor *(u(i-1,j,k) + u(i+1,j,k) +
+                                                       u(i,j-1,k) + u(i,j+1,k) +
+                                                       u(i,j,k-1) + u(i,j,k+1) - 6.0f*uc) - deltaT * uc*vc*vc
+                                                       - deltaT * F * (uc - 1.0f);
+
+
+				v_out = vc + vFactor *(v(i-1,j,k) + v(i+1,j,k) +
+                                                       v(i,j+1,k) + v(i,j-1,k) +
+                                                       v(i,j,k-1) + v(i,j,k+1) - 6.0f*vc) + deltaT * uc*vc*vc
+					               - deltaT * (F+K) * vc;
+				};
+
+		//! \cond [lambda] \endcond
+
+		//! \cond [body] \endcond
+
+		if (i % 2 == 0)
+		{
+			cudaDeviceSynchronize();
+			timer tconv;
+			tconv.start();
+			grid.conv2<U,V,U_next,V_next,1>({0,0,0},{(long int)sz[0]-1,(long int)sz[1]-1,(long int)sz[2]-1},func);
+			cudaDeviceSynchronize();
+			tconv.stop();
+			std::cout << "Conv " << tconv.getwct() << std::endl;
+
+			// After copy we synchronize again the ghost part U and V
+
+			grid.ghost_get<U_next,V_next>(RUN_ON_DEVICE | SKIP_LABELLING);
+		}
+		else
+		{
+			grid.conv2<U_next,V_next,U,V,1>({0,0,0},{(long int)sz[0]-1,(long int)sz[1]-1,(long int)sz[2]-1},func);
+
+			// After copy we synchronize again the ghost part U and V
+			grid.ghost_get<U,V>(RUN_ON_DEVICE | SKIP_LABELLING);
+		}
+
+		//! \cond [body] \endcond
+
+		// Every 500 time step we output the configuration for
+		// visualization
+//		if (i % 500 == 0)
+//		{
+//			grid.save("output_" + std::to_string(count));
+//			count++;
+//		}
+	}
+	
+	tot_sim.stop();
+	std::cout << "Total simulation: " << tot_sim.getwct() << std::endl;
+
+	grid.deviceToHost<U,V,U_next,V_next>();
+	grid.write("final");
+
+	//! \cond [time stepping] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse Gray Scott in 3D
+	 *
+	 * ## Finalize ##
+	 *
+	 * Deinitialize the library
+	 *
+	 * \snippet Grid/3_gray_scott/main.cpp finalize
+	 *
+	 */
+
+	//! \cond [finalize] \endcond
+
+	openfpm_finalize();
+
+	//! \cond [finalize] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse Gray Scott in 3D
+	 *
+	 * # Full code # {#code}
+	 *
+	 * \include Grid/3_gray_scott_3d/main.cpp
+	 *
+	 */
+}
+
+#else
+
+int main(int argc, char* argv[])
+{
+        return 0;
+}
+
+#endif
+
diff --git a/example/SparseGrid/2_gray_scott_3d_sparse_opt/Makefile b/example/SparseGrid/2_gray_scott_3d_sparse_opt/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..9119f712c01f3220515c3e2d2cc348dabb5489d9
--- /dev/null
+++ b/example/SparseGrid/2_gray_scott_3d_sparse_opt/Makefile
@@ -0,0 +1,30 @@
+include ../../example.mk
+
+CC=mpic++
+
+LDIR =
+
+OBJ = main.o
+OBJ_FLOAT = main_float.o
+gray_scott_sparse_opt_test: OPT += -DTEST_RUN
+gray_scott_sparse_opt_test: gray_scott_sparse_opt
+
+%.o: %.cpp
+	$(CC) -mavx -O3 -g $(OPT) -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
+
+gray_scott_sparse_opt: $(OBJ)
+	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
+
+gray_scott_sparse_opt_float: $(OBJ_FLOAT)
+	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
+
+all: gray_scott_sparse_opt gray_scott_sparse_opt_float
+
+run: gray_scott_sparse_opt_test
+	mpirun --oversubscribe -np 4 ./gray_scott_sparse_opt
+
+.PHONY: clean all run
+
+clean:
+	rm -f *.o *~ core gray_scott_sparse_opt
+
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_cs/config.cfg b/example/SparseGrid/2_gray_scott_3d_sparse_opt/config.cfg
similarity index 100%
rename from example/SparseGrid/1_gray_scott_3d_sparse_cs/config.cfg
rename to example/SparseGrid/2_gray_scott_3d_sparse_opt/config.cfg
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp b/example/SparseGrid/2_gray_scott_3d_sparse_opt/main.cpp
similarity index 92%
rename from example/SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp
rename to example/SparseGrid/2_gray_scott_3d_sparse_opt/main.cpp
index 93d90c53f196e65f7e4779c4743ca59209dc612d..6c90b1d81f4a32e22bcebe7e9af8618d987e7a20 100644
--- a/example/SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp
+++ b/example/SparseGrid/2_gray_scott_3d_sparse_opt/main.cpp
@@ -1,5 +1,3 @@
-
-#include "util/cuda/cuda_launch.hpp"
 #include "Grid/grid_dist_id.hpp"
 #include "data_type/aggregate.hpp"
 #include "timer.hpp"
@@ -28,7 +26,7 @@
  *
  * Two optimization has been done. The first is to change the layout to struct of arrays defining the grid with
  *
- * \snippet SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp grid definition
+ * \snippet SparseGrid/2_gray_scott_3d_sparse_opt/main.cpp grid definition
  *
  * The second is using the function **conv_cross2** to calculate the right-hand-side
  * this function can be used to do a convolution that involve points in a cross stencil like in figure that involve
@@ -44,8 +42,8 @@
 
 \endverbatim
  *
- * The function accept a lambda function where the first 2 arguments are the output in form of Vc::double_v. If we use float
- * we have to use Vc::float_v or Vc::int_v in case the property is an integer. Vc variables come from the Vc library that is
+ * The function accept a lambda function where the first 2 arguments are the output in form of Vc::double_v. If we use double
+ * we have to use Vc::double_v or Vc::int_v in case the property is an integer. Vc variables come from the Vc library that is
  * now integrated in openfpm.
  *
  *\htmlonly
@@ -62,11 +60,11 @@
  *
  * The lambda function is defined as
  *
- * \snippet SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp lambda
+ * \snippet SparseGrid/2_gray_scott_3d_sparse_opt/main.cpp lambda
  *
  * and used in the body loop
  *
- * \snippet SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp body
+ * \snippet SparseGrid/2_gray_scott_3d_sparse_opt/main.cpp body
  *
  * To note that instead of copy we split the properties where we are acting at every iteration
  *
@@ -202,7 +200,7 @@ int main(int argc, char* argv[])
 
 		auto func = [uFactor,vFactor,deltaT,F,K](Vc::double_v & u_out,Vc::double_v & v_out,
 				                                   Vc::double_v & u,Vc::double_v & v,
-				                                   cross_stencil_v & uc,cross_stencil_v & vc,
+				                                   cross_stencil_v<double> & uc,cross_stencil_v<double> & vc,
 				                                   unsigned char * mask){
 
 				u_out = u + uFactor *(uc.xm + uc.xp +
@@ -222,7 +220,12 @@ int main(int argc, char* argv[])
 
 		if (i % 2 == 0)
 		{
+
+			timer ts;
+			ts.start();
 			grid.conv_cross2<U,V,U_next,V_next,1>({0,0,0},{(long int)sz[0]-1,(long int)sz[1]-1,(long int)sz[2]-1},func);
+			ts.stop();
+			std::cout << ts.getwct() << std::endl;
 	
 			// After copy we synchronize again the ghost part U and V
 			grid.ghost_get<U_next,V_next>();
@@ -260,7 +263,7 @@ int main(int argc, char* argv[])
 	 *
 	 * Deinitialize the library
 	 *
-	 * \snippet SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp finalize
+	 * \snippet SparseGrid/2_gray_scott_3d_sparse_opt/main.cpp finalize
 	 *
 	 */
 
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_cs/Makefile b/example/SparseGrid/3_gray_scott_3d_sparse_cs/Makefile
similarity index 76%
rename from example/SparseGrid/1_gray_scott_3d_sparse_cs/Makefile
rename to example/SparseGrid/3_gray_scott_3d_sparse_cs/Makefile
index 159c4698f378d6109e77fb3c893891262d119be8..8de8ab607e994f1d656eeb1a87898b8f39bf6fcc 100644
--- a/example/SparseGrid/1_gray_scott_3d_sparse_cs/Makefile
+++ b/example/SparseGrid/3_gray_scott_3d_sparse_cs/Makefile
@@ -9,7 +9,7 @@ gray_scott_sparse_cs_test: OPT += -DTEST_RUN
 gray_scott_sparse_cs_test: gray_scott_sparse_cs
 
 %.o: %.cpp
-	$(CC) -O3 -g $(OPT) -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -g $(OPT) -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 gray_scott_sparse_cs: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
@@ -17,7 +17,7 @@ gray_scott_sparse_cs: $(OBJ)
 all: gray_scott_sparse_cs
 
 run: gray_scott_sparse_cs_test
-	mpirun -np 4 ./gray_scott_sparse_cs
+	mpirun --oversubscribe -np 4 ./gray_scott_sparse_cs
 
 .PHONY: clean all run
 
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_opt/config.cfg b/example/SparseGrid/3_gray_scott_3d_sparse_cs/config.cfg
similarity index 100%
rename from example/SparseGrid/1_gray_scott_3d_sparse_opt/config.cfg
rename to example/SparseGrid/3_gray_scott_3d_sparse_cs/config.cfg
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_cs/main.cpp b/example/SparseGrid/3_gray_scott_3d_sparse_cs/main.cpp
similarity index 98%
rename from example/SparseGrid/1_gray_scott_3d_sparse_cs/main.cpp
rename to example/SparseGrid/3_gray_scott_3d_sparse_cs/main.cpp
index 9b468421dfece788ba622ea1c56c0335a1f1193f..98444f92f0f165ed6f2a88b5a6d4883f791f19e5 100644
--- a/example/SparseGrid/1_gray_scott_3d_sparse_cs/main.cpp
+++ b/example/SparseGrid/3_gray_scott_3d_sparse_cs/main.cpp
@@ -51,18 +51,18 @@
  * The initialization involve the creation of 3 sphere and one cylinder channel connecting them in order to do it we
  * create an iterator over the grid (inserted and not inserted) point with **getGridIterator**
  *
- * \snippet SparseGrid/1_gray_scott_3d_sparse_cs/main.cpp init sphere channel
+ * \snippet SparseGrid/3_gray_scott_3d_sparse_cs/main.cpp init sphere channel
  *
  * After creating the domain we make a perturbation in the up sphere
  *
- * \snippet SparseGrid/1_gray_scott_3d_sparse_cs/main.cpp perturbation
+ * \snippet SparseGrid/3_gray_scott_3d_sparse_cs/main.cpp perturbation
  *
  * # Boundary conditions
  *
  * For this example we use mirror on direction X Y Z If the point is missing. If the point is missing in both direction than
  * the second derivative is considered zero
  *
- * \snippet SparseGrid/1_gray_scott_3d_sparse_cs/main.cpp boundary condition
+ * \snippet SparseGrid/3_gray_scott_3d_sparse_cs/main.cpp boundary condition
  *
  */
 
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_gpu_cs/Makefile b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs/Makefile
similarity index 65%
rename from example/SparseGrid/1_gray_scott_3d_sparse_gpu_cs/Makefile
rename to example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs/Makefile
index 50373464fbc7429a088540b11eeff1bfa3dd6f29..5fde3c84c1318de6c32db3d79f46e1f5f0fe1809 100644
--- a/example/SparseGrid/1_gray_scott_3d_sparse_gpu_cs/Makefile
+++ b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs/Makefile
@@ -3,13 +3,19 @@ include ../../example.mk
 ### internally the example disable with the preprocessor its code if not compiled with nvcc 
 CUDA_CC=
 CUDA_CC_LINK=
-ifeq (, $(shell which nvcc))
-        CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+ifdef CUDA_ON_CPU
+	CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
         INCLUDE_PATH_NVCC=
         CUDA_CC_LINK=mpic++
 else
-        CUDA_CC=nvcc -ccbin=mpic++
-        CUDA_CC_LINK=nvcc -ccbin=mpic++
+	ifeq (, $(shell which nvcc))
+        	CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+        	INCLUDE_PATH_NVCC=
+        	CUDA_CC_LINK=mpic++
+	else
+        	CUDA_CC=nvcc -ccbin=mpic++
+        	CUDA_CC_LINK=nvcc -ccbin=mpic++
+	endif
 endif
 
 gray_scott_sparse_gpu_test: OPT += -DTEST_RUN
@@ -30,7 +36,7 @@ gray_scott_sparse_gpu: $(OBJ)
 all: gray_scott_sparse_gpu
 
 run: gray_scott_sparse_gpu_test
-	mpirun -np 4 ./gray_scott_sparse_gpu
+	mpirun --oversubscribe -np 4 ./gray_scott_sparse_gpu
 
 .PHONY: clean all run
 
diff --git a/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs/config.cfg b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs/config.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..699be429e147cd40187be6ce345ef2f060f59fbc
--- /dev/null
+++ b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs/config.cfg
@@ -0,0 +1,2 @@
+[pack]
+files = main.cu Makefile
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_gpu_cs/main.cu b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs/main.cu
similarity index 99%
rename from example/SparseGrid/1_gray_scott_3d_sparse_gpu_cs/main.cu
rename to example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs/main.cu
index 529ca24572c707c5469d086d2abaa628170a9d5c..cc621f0bd8e7833f04a9e31d4628c423b6c7c486 100644
--- a/example/SparseGrid/1_gray_scott_3d_sparse_gpu_cs/main.cu
+++ b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs/main.cu
@@ -83,7 +83,7 @@ void init(sgrid_type & grid, Box<3,double> & domain)
 		                vp.get(1) = pc.get(2)*u.get(0) - pc.get(0)*u.get(2);
 		                vp.get(2) = pc.get(0)*u.get(1) - pc.get(1)*u.get(0);
 
-		                double distance = vp.norm() / sqrt(3);
+		                double distance = vp.norm() / sqrt(3.0f);
 
 		                // Check if the point is in the domain
 		                if (sph1.isInside(pc) || sph2.isInside(pc) || sph3.isInside(pc) || (distance < 0.1 && channel_box.isInside(pc)) )
diff --git a/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/Makefile b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..15133370060405ef2c77e59de99de4d9ecd6eac5
--- /dev/null
+++ b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/Makefile
@@ -0,0 +1,52 @@
+include ../../example.mk
+
+### internally the example disable with the preprocessor its code if not compiled with nvcc 
+
+CUDA_CC=
+CUDA_CC_LINK=
+ifdef CUDA_ON_CPU
+        CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+        INCLUDE_PATH_NVCC=
+        CUDA_CC_LINK=mpic++
+        CUDA_OPTIONS=-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000
+        LIBS_SELECT=$(LIBS_CUDA_ON_CPU)
+else
+        ifeq (, $(shell which nvcc))
+                CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+                INCLUDE_PATH_NVCC=
+                CUDA_CC_LINK=mpic++
+                CUDA_OPTIONS=
+        else
+                CUDA_CC=nvcc -ccbin=mpic++
+                CUDA_CC_LINK=nvcc -ccbin=mpic++
+                CUDA_OPTIONS=-use_fast_math  -arch=sm_61 -lineinfo
+        endif
+        LIBS_SELECT=$(LIBS)
+endif
+
+
+gray_scott_sparse_gpu_test: OPT += -DTEST_RUN
+gray_scott_sparse_gpu_test: gray_scott_sparse_gpu
+
+CC=mpic++
+
+LDIR =
+
+OBJ = main.o
+
+%.o: %.cu
+	$(CUDA_CC) $(OPT) -O3 -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)
+
+gray_scott_sparse_gpu: $(OBJ)
+	$(CUDA_CC_LINK) -o $@ $^ $(LIBS_PATH) $(LIBS_SELECT)
+
+all: gray_scott_sparse_gpu
+
+run: gray_scott_sparse_gpu_test
+	mpirun --oversubscribe -np 4 ./gray_scott_sparse_gpu
+
+.PHONY: clean all run
+
+clean:
+	rm -f *.o *~ core gray_scott_sparse_gpu
+
diff --git a/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/config.cfg b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/config.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..699be429e147cd40187be6ce345ef2f060f59fbc
--- /dev/null
+++ b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/config.cfg
@@ -0,0 +1,2 @@
+[pack]
+files = main.cu Makefile
diff --git a/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/main.cu b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/main.cu
new file mode 100644
index 0000000000000000000000000000000000000000..71e6019c76f15121e03378b86b5a7733db0a84bd
--- /dev/null
+++ b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/main.cu
@@ -0,0 +1,544 @@
+//#define VCLUSTER_PERF_REPORT
+//#define SYNC_BEFORE_TAKE_TIME
+//#define ENABLE_GRID_DIST_ID_PERF_STATS
+#include "Grid/grid_dist_id.hpp"
+#include "data_type/aggregate.hpp"
+#include "timer.hpp"
+
+/*!
+ *
+ * \page Grid_3_gs_3D_sparse_gpu_cs_opt Gray Scott in 3D using sparse grids on gpu in complex geometry
+ *
+ * [TOC]
+ *
+ * # Solving a gray scott-system in 3D using Sparse grids# {#e3_gs_gray_scott}
+ *
+ * This example show how to solve a Gray-Scott system in 3D using sparse grids on gpu with complex geometry
+ *
+ * In figure is the final solution of the problem
+ *
+ * \htmlonly
+<table border="1" bgcolor="black">
+  <tr>
+    <td>
+      <img src="http://ppmcore.mpi-cbg.de/web/images/examples/1_gray_scott_3d_sparse_cs/gs_3d_sparse_cs_section.png" style="width: 500px;" />
+    </td>
+    <td>
+      <img src="http://ppmcore.mpi-cbg.de/web/images/examples/1_gray_scott_3d_sparse_cs/gs_3d_sparse_cs.png" style="width: 500px;" />
+    </td>
+  </tr>
+</table>
+\endhtmlonly
+ *
+ * More or less this example is the same of \ref e3_gs_gray_scott_cs on gpu using what we learned in \ref e3_gs_gray_scott_gpu
+ *
+ *
+ */
+
+#ifdef __NVCC__
+
+constexpr int U = 0;
+constexpr int V = 1;
+constexpr int U_next = 2;
+constexpr int V_next = 3;
+
+typedef sgrid_dist_id_gpu<3,double,aggregate<double,double,double,double> > sgrid_type;
+
+void init(sgrid_type & grid, Box<3,double> & domain)
+{
+	auto it = grid.getGridIterator();
+	Point<3,double> p[8]= {{0.35,0.35,0.35},
+	                       {0.35,2.0,2.0},
+	                       {2.0,0.35,2.0},
+	                       {2.0,2.0,0.35},
+	                       {0.35,0.35,2.0},
+	                       {0.35,2.0,0.35},
+			       {2.0,0.35,0.35},
+	                       {2.0,2.0,2.0}};
+
+	
+//	Point<3,double> u({1.0,0.0,0.0});
+//	Box<3,double> channel_box(p3,p1);
+
+	double spacing_x = grid.spacing(0);
+	double spacing_y = grid.spacing(1);
+	double spacing_z = grid.spacing(2);
+
+	typedef typename GetAddBlockType<sgrid_type>::type InsertBlockT;
+
+	// Draw spheres
+	for (int i = 0 ; i < 8 ; i++)
+	{
+		Sphere<3,double> sph(p[i],0.3);
+
+		Box<3,size_t> bx;
+
+		for (int i = 0 ; i < 3 ; i++)
+		{
+			bx.setLow(i,(size_t)((sph.center(i) - 0.31)/grid.spacing(i)));
+			bx.setHigh(i,(size_t)((sph.center(i) + 0.31)/grid.spacing(i)));
+		}
+
+		grid.addPoints(bx.getKP1(),bx.getKP2(),[spacing_x,spacing_y,spacing_z,sph] __device__ (int i, int j, int k)
+                                {
+                                                Point<3,double> pc({i*spacing_x,j*spacing_y,k*spacing_z});
+
+						// Check if the point is in the domain
+                                		if (sph.isInside(pc) )
+                                		{return true;}
+
+                                                return false;
+                                },
+                                [] __device__ (InsertBlockT & data, int i, int j, int k)
+                                {
+                                        data.template get<U>() = 1.0;
+                                        data.template get<V>() = 0.0;
+                                }
+                                );
+
+		grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE);
+		grid.removeUnusedBuffers();
+	}
+
+	//channels
+
+	Box<3,double> b({0.25,0.25,0.25},{2.1,2.1,2.1});
+
+	for (int k = 0 ; k < 3 ; k++)
+	{
+		for (int s = 0 ; s < 2 ; s++)
+		{
+			for (int i = 0 ; i < 2 ; i++)
+        		{
+				Point<3,double> u({1.0*(((s+i)%2) == 0 && k != 2),1.0*(((s+i+1)%2) == 0 && k != 2),(k == 2)*1.0});
+				Point<3,double> c({(i == 0)?0.35:2.0,(s == 0)?0.35:2.0,(k == 0)?0.35:2.0});
+
+                		Box<3,size_t> bx;
+
+                		for (int i = 0 ; i < 3 ; i++)
+                		{
+					if (c[i] == 2.0)
+					{
+						if (u[i] == 1.0)
+						{
+                                                	bx.setLow(i,(size_t)(0.34/grid.spacing(i)));
+                                                	bx.setHigh(i,(size_t)(2.01/grid.spacing(i)));
+						}
+						else
+						{
+                                                        bx.setLow(i,(size_t)((c[i] - 0.11)/grid.spacing(i)));
+                                                        bx.setHigh(i,(size_t)((c[i] + 0.11)/grid.spacing(i)));
+						}
+					}
+					else
+					{
+						if (u[i] == 1.0)
+						{
+                        				bx.setLow(i,(size_t)(0.34/grid.spacing(i)));
+                        				bx.setHigh(i,(size_t)(2.01/grid.spacing(i)));
+						}
+						else
+						{
+                                                        bx.setLow(i,(size_t)((c[i] - 0.11)/grid.spacing(i)));
+                                                        bx.setHigh(i,(size_t)((c[i] + 0.11)/grid.spacing(i)));
+						}
+					}
+                		}
+
+				grid.addPoints(bx.getKP1(),bx.getKP2(),[spacing_x,spacing_y,spacing_z,u,c,b] __device__ (int i, int j, int k)
+                                	{
+						Point<3,double> pc({i*spacing_x,j*spacing_y,k*spacing_z});
+                                                Point<3,double> pcs({i*spacing_x,j*spacing_y,k*spacing_z});
+                                                Point<3,double> vp;
+
+						// shift
+						pc -= c; 
+
+                                		// calculate the distance from the diagonal
+                                		vp.get(0) = pc.get(1)*u.get(2) - pc.get(2)*u.get(1);
+                                		vp.get(1) = pc.get(2)*u.get(0) - pc.get(0)*u.get(2);
+                                		vp.get(2) = pc.get(0)*u.get(1) - pc.get(1)*u.get(0);
+
+						double distance = vp.norm();
+
+                                                // Check if the point is in the domain
+                                                if (distance < 0.1 && b.isInside(pcs) == true )
+                                                {return true;}
+
+                                                return false;
+                                	},
+                                	[] __device__ (InsertBlockT & data, int i, int j, int k)
+                                	{
+                                        	data.template get<U>() = 1.0;
+                                        	data.template get<V>() = 0.0;
+                                	}
+                                );
+
+                		grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE);
+				grid.removeUnusedBuffers();
+			}
+		}
+	}
+
+	// cross channel
+	
+	int s = 0;
+	for (int s = 0 ; s < 2 ; s++)
+        {
+        	for (int i = 0 ; i < 2 ; i++)
+                {	
+			Point<3,double> c({(i == 0)?0.35:2.0,(s == 0)?0.35:2.0,0.35});
+			Point<3,double> u({(i == 0)?1.0:-1.0,(s == 0)?1.0:-1.0,1.0});
+
+			Box<3,size_t> bx;
+
+			for (int k = 0 ; k < 16; k++)
+			{
+				for (int s = 0 ; s < 3 ; s++)
+				{
+					if (u[s] > 0.0)
+					{
+						bx.setLow(s,(c[s] + k*(u[s]/9.0))/grid.spacing(s) );
+						bx.setHigh(s,(c[s] + (k+3)*(u[s]/9.0) )/ grid.spacing(s) );
+					}
+					else
+					{
+						bx.setLow(s,(c[s] + (k+3)*(u[s]/9.0) )/grid.spacing(s) );
+                                                bx.setHigh(s,(c[s] + k*(u[s]/9.0))/ grid.spacing(s) );
+					}
+				}
+
+				grid.addPoints(bx.getKP1(),bx.getKP2(),[spacing_x,spacing_y,spacing_z,u,c,b] __device__ (int i, int j, int k)
+        			{
+                			Point<3,double> pc({i*spacing_x,j*spacing_y,k*spacing_z});
+                        		Point<3,double> pcs({i*spacing_x,j*spacing_y,k*spacing_z});
+                        		Point<3,double> vp;
+
+                        		// shift
+                        		pc -= c;
+
+                        		// calculate the distance from the diagonal
+                        		vp.get(0) = pc.get(1)*u.get(2) - pc.get(2)*u.get(1);
+                        		vp.get(1) = pc.get(2)*u.get(0) - pc.get(0)*u.get(2);
+                        		vp.get(2) = pc.get(0)*u.get(1) - pc.get(1)*u.get(0);
+
+                        		double distance = vp.norm() / sqrt(3.0);
+
+                        		// Check if the point is in the domain
+                        		if (distance < 0.1 && b.isInside(pcs) == true )
+                        		{return true;}
+
+                        		return false;
+                  		},
+                  		[] __device__ (InsertBlockT & data, int i, int j, int k)
+                  		{
+                  			data.template get<U>() = 1.0;
+                        		data.template get<V>() = 0.0;
+                  		}
+        			);
+
+				grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE);
+				grid.removeUnusedBuffers();
+			}
+		}
+
+	}
+
+	long int x_start = grid.size(0)*1.95f/domain.getHigh(0);
+	long int y_start = grid.size(1)*1.95f/domain.getHigh(1);
+	long int z_start = grid.size(1)*1.95f/domain.getHigh(2);
+
+	long int x_stop = grid.size(0)*2.05f/domain.getHigh(0);
+	long int y_stop = grid.size(1)*2.05f/domain.getHigh(1);
+	long int z_stop = grid.size(1)*2.05f/domain.getHigh(2);
+
+	grid_key_dx<3> start({x_start,y_start,z_start});
+	grid_key_dx<3> stop ({x_stop,y_stop,z_stop});
+
+        grid.addPoints(start,stop,[] __device__ (int i, int j, int k)
+                                {
+                                                return true;
+                                },
+                                [] __device__ (InsertBlockT & data, int i, int j, int k)
+                                {
+                                        data.template get<U>() = 0.5;
+                                        data.template get<V>() = 0.24;
+                                }
+                                );
+
+	grid.template flush<smin_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE);
+
+	grid.removeUnusedBuffers();
+}
+
+
+int main(int argc, char* argv[])
+{
+	openfpm_init(&argc,&argv);
+
+	// domain
+	Box<3,double> domain({0.0,0.0,0.0},{2.5,2.5,2.5});
+	
+	// grid size
+        size_t sz[3] = {384,384,384};
+
+	// Define periodicity of the grid
+	periodicity<3> bc = {NON_PERIODIC,NON_PERIODIC,NON_PERIODIC};
+	
+	// Ghost in grid unit
+	Ghost<3,long int> g(1);
+	
+	// deltaT
+	double deltaT = 0.2;
+
+	// Diffusion constant for specie U
+	double du = 2*1e-5;
+
+	// Diffusion constant for specie V
+	double dv = 1*1e-5;
+
+#ifdef TEST_RUN
+        // Number of timesteps
+        size_t timeSteps = 300;
+#else
+	// Number of timesteps
+        size_t timeSteps = 50000;
+#endif
+
+	// K and F (Physical constant in the equation)
+        double K = 0.053;
+        double F = 0.014;
+
+	sgrid_type grid(sz,domain,g,bc);
+
+	grid.template setBackgroundValue<0>(-0.5);
+	grid.template setBackgroundValue<1>(-0.5);
+	grid.template setBackgroundValue<2>(-0.5);
+	grid.template setBackgroundValue<3>(-0.5);
+	
+	// spacing of the grid on x and y
+	double spacing[3] = {grid.spacing(0),grid.spacing(1),grid.spacing(2)};
+
+	init(grid,domain);
+
+	// sync the ghost
+	grid.template ghost_get<U,V>(RUN_ON_DEVICE);
+
+	// because we assume that spacing[x] == spacing[y] we use formula 2
+	// and we calculate the prefactor of Eq 2
+	double uFactor = deltaT * du/(spacing[0]*spacing[0]);
+	double vFactor = deltaT * dv/(spacing[0]*spacing[0]);
+
+	grid.template deviceToHost<U,V>();
+
+	timer tot_sim;
+	tot_sim.start();
+
+	for (size_t i = 0; i < timeSteps; ++i)
+	{
+		//! \cond [stencil get and use] \endcond
+
+        		typedef typename GetCpBlockType<decltype(grid),0,1>::type CpBlockType;
+
+        		auto func = [uFactor,vFactor,deltaT,F,K] __device__ (double & u_out, double & v_out,
+        				                                   CpBlockType & u, CpBlockType & v,
+        				                                   int i, int j, int k){
+
+        				double uc = u(i,j,k);
+        				double vc = v(i,j,k);
+
+        				double u_px = u(i+1,j,k);
+        				double u_mx = u(i-1,j,k);
+
+        				double u_py = u(i,j+1,k);
+        				double u_my = u(i,j-1,k);
+
+        				double u_pz = u(i,j,k+1);
+        				double u_mz = u(i,j,k-1);
+
+        				double v_px = v(i+1,j,k);
+        				double v_mx = v(i-1,j,k);
+
+        				double v_py = v(i,j+1,k);
+        				double v_my = v(i,j-1,k);
+
+        				double v_pz = v(i,j,k+1);
+        				double v_mz = v(i,j,k-1);
+
+        				// U fix
+
+        				if (u_mx < -0.1 && u_px < -0.1)
+        				{
+        					u_mx = uc;
+        					u_px = uc;
+        				}
+
+        				if (u_mx < -0.1)
+        				{u_mx = u_px;}
+
+        				if (u_px < -0.1)
+        				{u_px = u_mx;}
+
+        				if (u_my < -0.1 && u_py < -0.1)
+        				{
+        					u_my = uc;
+        					u_py = uc;
+        				}
+
+        				if (u_my < -0.1)
+        				{u_my = u_py;}
+
+        				if (u_py < -0.1)
+        				{u_py = u_my;}
+
+        				if (u_mz < -0.1 && u_pz < -0.1)
+        				{
+        					u_mz = uc;
+        					u_pz = uc;
+        				}
+
+        				if (u_mz < -0.1)
+        				{u_mz = u_pz;}
+
+        				if (u_pz < -0.1)
+        				{u_pz = u_mz;}
+
+        				// V fix
+
+        				if (v_mx < -0.1 && v_px < -0.1)
+        				{
+        					v_mx = uc;
+        					v_px = uc;
+        				}
+
+        				if (v_mx < -0.1)
+        				{v_mx = v_px;}
+
+        				if (v_px < -0.1)
+        				{v_px = v_mx;}
+
+        				if (v_my < -0.1 && v_py < -0.1)
+        				{
+        					v_my = uc;
+        					v_py = uc;
+        				}
+
+        				if (v_my < -0.1)
+        				{v_my = v_py;}
+
+        				if (v_py < -0.1)
+        				{v_py = v_my;}
+
+        				if (v_mz < -0.1 && v_pz < -0.1)
+        				{
+        					v_mz = uc;
+        					v_pz = uc;
+        				}
+
+        				if (v_mz < -0.1)
+        				{v_mz = v_pz;}
+
+        				if (v_pz < -0.1)
+        				{v_pz = v_mz;}
+
+        				u_out = uc + uFactor *(u_mx + u_px +
+                                                               u_my + u_py +
+                                                               u_mz + u_pz - 6.0*uc) - deltaT * uc*vc*vc
+                                                               - deltaT * F * (uc - 1.0);
+
+
+        				v_out = vc + vFactor *(v_mx + v_px +
+                                                               v_py + v_my +
+                                                               v_mz + v_pz - 6.0*vc) + deltaT * uc*vc*vc
+        					               - deltaT * (F+K) * vc;
+
+        				};
+
+        		if (i % 2 == 0)
+        		{
+        			grid.conv2<U,V,U_next,V_next,1>({0,0,0},{(long int)sz[0]-1,(long int)sz[1]-1,(long int)sz[2]-1},func);
+
+				cudaDeviceSynchronize();
+
+        			// After copy we synchronize again the ghost part U and V
+
+        			grid.ghost_get<U_next,V_next>(RUN_ON_DEVICE | SKIP_LABELLING);
+        		}
+        		else
+        		{
+        			grid.conv2<U_next,V_next,U,V,1>({0,0,0},{(long int)sz[0]-1,(long int)sz[1]-1,(long int)sz[2]-1},func);
+
+				cudaDeviceSynchronize();
+
+        			// After copy we synchronize again the ghost part U and V
+        			grid.ghost_get<U,V>(RUN_ON_DEVICE | SKIP_LABELLING);
+        		}
+
+		//! \cond [stencil get and use] \endcond
+
+		// After copy we synchronize again the ghost part U and V
+
+		// Every 500 time step we output the configuration for
+		// visualization
+/*		if (i % 500 == 0)
+		{
+			grid.save("output_" + std::to_string(count));
+			count++;
+		}*/
+
+                std::cout << "STEP: " << i  << std::endl;
+/*                if (i % 300 == 0)
+                {
+                	grid.template deviceToHost<U,V>();
+                        grid.write_frame("out",i);
+                }*/
+	}
+	
+	tot_sim.stop();
+	std::cout << "Total simulation: " << tot_sim.getwct() << std::endl;
+
+	grid.print_stats();
+
+	create_vcluster().print_stats();
+
+	grid.template deviceToHost<U,V>();
+	grid.write("Final");
+
+	//! \cond [time stepping] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse_gpu_cs Gray Scott in 3D using sparse grids on gpu in complex geometry
+	 *
+	 * ## Finalize ##
+	 *
+	 * Deinitialize the library
+	 *
+	 * \snippet  SparseGrid/1_gray_scott_3d_sparse_gpu_cs/main.cu finalize
+	 *
+	 */
+
+	//! \cond [finalize] \endcond
+
+	openfpm_finalize();
+
+	//! \cond [finalize] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse_gpu_cs Gray Scott in 3D using sparse grids on gpu in complex geometry
+	 *
+	 * # Full code # {#code}
+	 *
+	 * \include SparseGrid/1_gray_scott_3d_sparse_gpu_cs/main.cu
+	 *
+	 */
+}
+
+#else
+
+int main(int argc, char* argv[])
+{
+        return 0;
+}
+
+#endif
+
diff --git a/example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/Makefile b/example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..6239057a8dd94a2061672be96d85a91cb426ea7d
--- /dev/null
+++ b/example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/Makefile
@@ -0,0 +1,26 @@
+include ../../example.mk
+
+CC=mpic++
+
+LDIR =
+
+OBJ = main.o
+gray_scott_sparse_cs_surface: OPT += -DTEST_RUN
+gray_scott_sparse_cs_surface: gray_scott_sparse_surface_cs
+
+%.o: %.cpp
+	$(CC) -O3 -g $(OPT) -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+
+gray_scott_sparse_surface_cs: $(OBJ)
+	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
+
+all: gray_scott_sparse_surface_cs
+
+run: gray_scott_sparse_cs_surface
+	mpirun --oversubscribe -np 4 ./gray_scott_sparse_surface_cs
+
+.PHONY: clean all run
+
+clean:
+	rm -f *.o *~ core gray_scott_sparse_surface_cs
+
diff --git a/example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/config.cfg b/example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/config.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..1eecbac3577c765edca7f90cf5f61cfb6b9f4880
--- /dev/null
+++ b/example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/config.cfg
@@ -0,0 +1,2 @@
+[pack]
+files = main.cpp Makefile
diff --git a/example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/main.cpp b/example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ab932e019c638d5203b0804982e50a8b7084ad87
--- /dev/null
+++ b/example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/main.cpp
@@ -0,0 +1,426 @@
+#include "Grid/grid_dist_id.hpp"
+#include "data_type/aggregate.hpp"
+#include "timer.hpp"
+
+/*!
+ *
+ *
+ */
+
+constexpr int U = 0;
+constexpr int V = 1;
+constexpr int phi = 2;
+constexpr int normal = 3;
+constexpr int tgrad_u = 4;
+constexpr int tgrad_v = 5;
+constexpr int U_next = 6;
+constexpr int V_next = 7;
+
+constexpr int x = 0;
+constexpr int y = 1;
+constexpr int z = 2;
+
+typedef sgrid_dist_id<3,double,aggregate<double,double,double,double[3],double[3],double[3],double,double> > sgrid_type;
+
+void init(sgrid_type & grid, Box<3,double> & domain)
+{
+	//! \cond [init sphere channel] \endcond
+
+	auto it = grid.getGridIterator();
+	Point<3,double> p1({0.5,0.5,0.5});
+
+	double sx = grid.spacing(0);
+
+	Sphere<3,double> sph1(p1,0.3);
+	Sphere<3,double> sph2(p1,0.3 - sx*10);
+	Sphere<3,double> sph_zero(p1,0.3 - sx*5);
+
+	while (it.isNext())
+	{
+		// Get the local grid key
+		auto key = it.get_dist();
+		auto keyg = it.get();
+
+		Point<3,double> pc;
+		Point<3,double> vp;
+
+		for (int i = 0 ; i < 3 ; i++)
+        {pc.get(i) = keyg.get(i) * it.getSpacing(i);}
+
+		// Check if the point is in the first sphere
+		if (sph1.isInside(pc) == true && sph2.isInside(pc) == false)
+		{
+			Point<3,double> pn = pc - p1;
+			pn /= pn.norm();
+			double theta = acos(pn * Point<3,double>({0.0,0.0,1.0}));
+			Point<3,double> pn_ = pn;
+			pn_[2] = 0.0;
+			pn_ /= pn_.norm();
+			double aphi = acos(pn_ * Point<3,double>({1.0,0.0,0.0}));
+
+			// Create a perturbation in the solid angle
+			if (theta > 0.6 && theta < 0.8 && aphi > 0.0 && aphi < 0.2)
+			{
+				grid.template insert<U>(key) = 0.5;
+				grid.template insert<V>(key) = 0.25;
+			}
+			else
+			{
+				grid.template insert<U>(key) = 1.0;
+				grid.template insert<V>(key) = 0.0;
+			}
+			grid.template insert<phi>(key) = sph_zero.distance(pc);
+			grid.template insert<normal>(key)[0] = pn[0];
+			grid.template insert<normal>(key)[1] = pn[1];
+			grid.template insert<normal>(key)[2] = pn[2];
+
+			// Old values U and V
+			grid.template insert<U_next>(key) = 0.0;
+			grid.template insert<V_next>(key) = 0.0;
+		}
+
+		++it;
+	}
+
+	//! \cond [init sphere channel] \endcond
+}
+
+template<unsigned int U_src,unsigned int V_src,unsigned int U_dst, unsigned int V_dst>
+void extend(sgrid_type & grid)
+{
+	double delta = 1e-10;
+	double max = 0.0;
+	auto it = grid.getDomainIterator();
+
+	while (it.isNext())
+	{
+		// center point
+		auto Cp = it.get();
+
+		// plus,minus X,Y,Z
+		auto mx = Cp.move(0,-1);
+		auto px = Cp.move(0,+1);
+		auto my = Cp.move(1,-1);
+		auto py = Cp.move(1,1);
+		auto mz = Cp.move(2,-1);
+		auto pz = Cp.move(2,1);
+
+		double s = grid.get<phi>(Cp) / sqrt(fabs(grid.get<phi>(Cp)) + delta);
+
+		double Uext = 0.0;
+		double Vext = 0.0;
+
+		double dir = s*grid.get<normal>(Cp)[x];
+
+		if (dir > 0)
+		{
+			Uext += dir * (grid.get<U_src>(Cp) - grid.get<U_src>(mx));
+			Vext += dir * (grid.get<V_src>(Cp) - grid.get<V_src>(mx));
+		}
+		else if (dir < 0)
+		{
+			Uext += dir * (grid.get<U_src>(px) - grid.get<U_src>(Cp));
+			Vext += dir * (grid.get<V_src>(px) - grid.get<V_src>(Cp));
+		}
+
+
+		dir = s*grid.get<normal>(Cp)[y];
+		if (dir > 0)
+		{
+			Uext += dir * (grid.get<U_src>(Cp) - grid.get<U_src>(my));
+			Vext += dir * (grid.get<V_src>(Cp) - grid.get<V_src>(my));
+		}
+		else if (dir < 0)
+		{
+			Uext += dir * (grid.get<U_src>(py) - grid.get<U_src>(Cp));
+			Vext += dir * (grid.get<V_src>(py) - grid.get<V_src>(Cp));
+		}
+
+		dir = s*grid.get<normal>(Cp)[z];
+		if (dir > 0)
+		{
+			Uext += dir * (grid.get<U_src>(Cp) - grid.get<U_src>(mz));
+			Vext += dir * (grid.get<V_src>(Cp) - grid.get<V_src>(mz));
+		}
+		else if (dir < 0)
+		{
+			Uext += dir * (grid.get<U_src>(pz) - grid.get<U_src>(Cp));
+			Vext += dir * (grid.get<V_src>(pz) - grid.get<V_src>(Cp));
+		}
+
+		if (Uext >= max)
+		{
+			max = Uext;
+		}
+
+		grid.insert<U_dst>(Cp) = grid.get<U_src>(Cp) - 1.0*Uext;
+		grid.insert<V_dst>(Cp) = grid.get<V_src>(Cp) - 1.0*Vext;
+
+		// Next point in the grid
+		++it;
+	}
+
+	std::cout << "UEX max: " << max << std::endl;
+}
+
+int main(int argc, char* argv[])
+{
+	openfpm_init(&argc,&argv);
+
+	// domain
+	Box<3,double> domain({0.0,0.0,0.0},{2.5,2.5,2.5});
+	
+	// grid size
+    size_t sz[3] = {512,512,512};
+
+	// Define periodicity of the grid
+	periodicity<3> bc = {NON_PERIODIC,NON_PERIODIC,NON_PERIODIC};
+	
+	// Ghost in grid unit
+	Ghost<3,long int> g(1);
+	
+	// deltaT
+	double deltaT = 0.3;
+
+	// Diffusion constant for specie U
+	double du = 1*1e-5;
+
+	// Diffusion constant for specie V
+	double dv = 0.5*1e-5;
+
+#ifdef TEST_RUN
+        // Number of timesteps
+        size_t timeSteps = 200;
+#else
+	// Number of timesteps
+        size_t timeSteps = 150000;
+#endif
+
+	// K and F (Physical constant in the equation)
+        double K = 0.053;
+        double F = 0.014;
+
+	sgrid_type grid(sz,domain,g,bc);
+
+	
+	// spacing of the grid on x and y
+	double spacing[3] = {grid.spacing(0),grid.spacing(1),grid.spacing(2)};
+
+	init(grid,domain);
+
+	// sync the ghost
+	size_t count = 0;
+	grid.template ghost_get<U,V>();
+
+	// because we assume that spacing[x] == spacing[y] we use formula 2
+	// and we calculate the prefactor of Eq 2
+	double uFactor = deltaT * du;
+	double vFactor = deltaT * dv;
+
+	auto & v_cl = create_vcluster();
+
+	timer tot_sim;
+	tot_sim.start();
+
+	for (size_t i = 0; i < timeSteps ; ++i)
+	{
+		{
+		auto it = grid.getDomainIterator();
+
+		while (it.isNext())
+		{
+			// center point
+			auto Cp = it.get();
+
+			// plus,minus X,Y,Z
+			auto mx = Cp.move(0,-1);
+			auto px = Cp.move(0,+1);
+			auto my = Cp.move(1,-1);
+			auto py = Cp.move(1,1);
+			auto mz = Cp.move(2,-1);
+			auto pz = Cp.move(2,1);
+
+			grid.insert<tgrad_u>(Cp)[0] = 0.0;
+			grid.insert<tgrad_u>(Cp)[1] = 0.0;
+			grid.insert<tgrad_u>(Cp)[2] = 0.0;
+			grid.insert<tgrad_v>(Cp)[0] = 0.0;
+			grid.insert<tgrad_v>(Cp)[1] = 0.0;
+			grid.insert<tgrad_v>(Cp)[2] = 0.0;
+
+			//! \cond [boundary condition] \endcond
+
+			if (grid.existPoint(mz) == true && grid.existPoint(pz) == true &&
+				grid.existPoint(my) == true && grid.existPoint(py) == true &&
+				grid.existPoint(mx) == true && grid.existPoint(px) == true )
+			{
+				Point<3,double> gradU;
+				gradU[x] = (grid.get<U>(Cp) - grid.get<U>(mx)) / grid.spacing(0);
+				gradU[y] = (grid.get<U>(Cp) - grid.get<U>(my)) / grid.spacing(1);
+				gradU[z] = (grid.get<U>(Cp) - grid.get<U>(mz)) / grid.spacing(2);
+
+				Point<3,double> gradV;
+				gradV[x] = (grid.get<V>(Cp) - grid.get<V>(mx)) / grid.spacing(0);
+				gradV[y] = (grid.get<V>(Cp) - grid.get<V>(my)) / grid.spacing(1);
+				gradV[z] = (grid.get<V>(Cp) - grid.get<V>(mz)) / grid.spacing(2);
+
+				Point<3,double> PgradU;
+				Point<3,double> PgradV;
+
+				PgradU.zero();
+				PgradV.zero();
+
+				for (int i = 0 ; i < 3 ; i++)
+				{
+					for (int j = 0 ; j < 3 ; j++)
+					{
+						grid.insert<tgrad_u>(Cp)[i] += (((i == j)?1.0:0.0) - grid.get<normal>(Cp)[i]*grid.get<normal>(Cp)[j])*gradU[j];
+						grid.insert<tgrad_v>(Cp)[i] += (((i == j)?1.0:0.0) - grid.get<normal>(Cp)[i]*grid.get<normal>(Cp)[j])*gradV[j];
+					}
+				}
+			}
+			++it;
+		}
+		}
+
+//		Old.write_frame("Init_condition",i);
+
+		{
+		auto it = grid.getDomainIterator();
+
+		while (it.isNext())
+		{
+			// center point
+			auto Cp = it.get();
+
+			// plus,minus X,Y,Z
+			auto mx = Cp.move(0,-1);
+			auto px = Cp.move(0,+1);
+			auto my = Cp.move(1,-1);
+			auto py = Cp.move(1,1);
+			auto mz = Cp.move(2,-1);
+			auto pz = Cp.move(2,1);
+
+			//! \cond [boundary condition] \endcond
+
+			// Mirror z
+
+			if (grid.existPoint(mz) == true && grid.existPoint(pz) == true &&
+				grid.existPoint(my) == true && grid.existPoint(py) == true &&
+				grid.existPoint(mx) == true && grid.existPoint(px) == true )
+			{
+				double lapU = 0;
+				double lapV = 0;
+
+				//Div
+				lapU += (grid.get<tgrad_u>(px)[0] - grid.get<tgrad_u>(Cp)[0]) / grid.spacing(0);
+				lapV += (grid.get<tgrad_v>(px)[0] - grid.get<tgrad_v>(Cp)[0]) / grid.spacing(0);
+				lapU += (grid.get<tgrad_u>(py)[1] - grid.get<tgrad_u>(Cp)[1]) / grid.spacing(1);
+				lapV += (grid.get<tgrad_v>(py)[1] - grid.get<tgrad_v>(Cp)[1]) / grid.spacing(1);
+				lapU += (grid.get<tgrad_u>(pz)[2] - grid.get<tgrad_u>(Cp)[2]) / grid.spacing(2);
+				lapV += (grid.get<tgrad_v>(pz)[2] - grid.get<tgrad_v>(Cp)[2]) / grid.spacing(2);
+
+				// update based on Eq 2
+				grid.insert<U_next>(Cp) = grid.get<U>(Cp) + uFactor * lapU +
+											- deltaT * grid.get<U>(Cp) * grid.get<V>(Cp) * grid.get<V>(Cp) +
+											- deltaT * F * (grid.get<U>(Cp) - 1.0);
+
+
+				// update based on Eq 2
+				grid.insert<V_next>(Cp) = grid.get<V>(Cp) + vFactor * lapV +
+											deltaT * grid.get<U>(Cp) * grid.get<V>(Cp) * grid.get<V>(Cp) +
+											- deltaT * (F+K) * grid.get<V>(Cp);
+			}
+
+			// Next point in the grid
+			++it;
+		}
+		}
+
+//		New.write_frame("update",i);
+
+		// Extend
+
+		if (i % 5 == 0)
+		{
+		for (int j = 0 ; j < 2 ; j++)
+		{
+			if (j % 2 == 0)
+			{extend<U_next,V_next,U,V>(grid);}
+			else
+			{extend<U,V,U_next,V_next>(grid);}
+
+			// Here we copy New into the old grid in preparation of the new step
+			// It would be better to alternate, but using this we can show the usage
+			// of the function copy. To note that copy work only on two grid of the same
+			// decomposition. If you want to copy also the decomposition, or force to be
+			// exactly the same, use Old = New
+			//New.copy_sparse(Old);
+		}
+		}
+
+/*		auto it = grid.getDomainIterator();
+
+		while (it.isNext())
+		{
+			// center point
+			auto Cp = it.get();
+
+			// update based on Eq 2
+			grid.insert<U>(Cp) = grid.get<U_next>(Cp);
+			grid.insert<V>(Cp) = grid.get<V_next>(Cp);
+
+			++it;
+		}*/
+
+		//! \cond [stencil get and use] \endcond
+
+		// After copy we synchronize again the ghost part U and V
+		grid.ghost_get<U,V>();
+
+		// Every 500 time step we output the configuration for
+		// visualization
+		if (i % 500 == 0)
+		{
+			grid.save("output_" + std::to_string(count));
+			count++;
+		}
+
+		if (v_cl.rank() == 0)
+		{std::cout << "STEP: " << i  << "   " << std::endl;}
+		if (i % 100 == 0)
+		{
+			grid.write_frame("out",i);
+		}
+	}
+
+	tot_sim.stop();
+	std::cout << "Total simulation: " << tot_sim.getwct() << std::endl;
+
+	//! \cond [time stepping] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse Gray Scott in 3D
+	 *
+	 * ## Finalize ##
+	 *
+	 * Deinitialize the library
+	 *
+	 * \snippet Grid/3_gray_scott/main.cpp finalize
+	 *
+	 */
+
+	//! \cond [finalize] \endcond
+
+	openfpm_finalize();
+
+	//! \cond [finalize] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse Gray Scott in 3D
+	 *
+	 * # Full code # {#code}
+	 *
+	 * \include Grid/3_gray_scott_3d/main.cpp
+	 *
+	 */
+}
diff --git a/example/SparseGrid/5_gray_scott_3d_surface_cs_opt/Makefile b/example/SparseGrid/5_gray_scott_3d_surface_cs_opt/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..42f2b8bf960d023c4ba81edb7a4dec7d8fe0e3aa
--- /dev/null
+++ b/example/SparseGrid/5_gray_scott_3d_surface_cs_opt/Makefile
@@ -0,0 +1,26 @@
+include ../../example.mk
+
+CC=mpic++
+
+LDIR =
+
+OBJ = main.o
+gray_scott_sparse_surface_cs_test: OPT += -DTEST_RUN
+gray_scott_sparse_surface_cs_test: gray_scott_sparse_surface_cs
+
+%.o: %.cpp
+	$(CC) -O3 -g $(OPT) -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
+
+gray_scott_sparse_surface_cs: $(OBJ)
+	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
+
+all: gray_scott_sparse_surface_cs
+
+run: gray_scott_sparse_surface_cs_test
+	mpirun --oversubscribe -np 4 ./gray_scott_sparse_surface_cs
+
+.PHONY: clean all run
+
+clean:
+	rm -f *.o *~ core gray_scott_sparse_surface_cs
+
diff --git a/example/SparseGrid/5_gray_scott_3d_surface_cs_opt/config.cfg b/example/SparseGrid/5_gray_scott_3d_surface_cs_opt/config.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..1eecbac3577c765edca7f90cf5f61cfb6b9f4880
--- /dev/null
+++ b/example/SparseGrid/5_gray_scott_3d_surface_cs_opt/config.cfg
@@ -0,0 +1,2 @@
+[pack]
+files = main.cpp Makefile
diff --git a/example/SparseGrid/5_gray_scott_3d_surface_cs_opt/main.cpp b/example/SparseGrid/5_gray_scott_3d_surface_cs_opt/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7c77eb62e416fb9bbf362ec0c3dffdfa5f1bdfb7
--- /dev/null
+++ b/example/SparseGrid/5_gray_scott_3d_surface_cs_opt/main.cpp
@@ -0,0 +1,472 @@
+#include "Grid/grid_dist_id.hpp"
+#include "data_type/aggregate.hpp"
+#include "timer.hpp"
+
+/*!
+ *
+ */
+
+constexpr int U = 0;
+constexpr int V = 1;
+constexpr int phi = 2;
+constexpr int normal = 3;
+constexpr int tgrad_u = 4;
+constexpr int tgrad_v = 5;
+constexpr int U_next = 6;
+constexpr int V_next = 7;
+
+constexpr int x = 0;
+constexpr int y = 1;
+constexpr int z = 2;
+
+typedef sgrid_dist_soa<3,double,aggregate<double,double,double,double[3],double[3],double[3],double,double> > sgrid_type;
+
+void init(sgrid_type & grid, Box<3,double> & domain)
+{
+	//! \cond [init sphere channel] \endcond
+
+	auto it = grid.getGridIterator();
+	Point<3,double> p1({0.5,0.5,0.5});
+
+	double sx = grid.spacing(0);
+
+	Sphere<3,double> sph1(p1,0.3);
+	Sphere<3,double> sph2(p1,0.3 - sx*10);
+	Sphere<3,double> sph_zero(p1,0.3 - sx*5);
+
+	while (it.isNext())
+	{
+		// Get the local grid key
+		auto key = it.get_dist();
+		auto keyg = it.get();
+
+		Point<3,double> pc;
+		Point<3,double> vp;
+
+		for (int i = 0 ; i < 3 ; i++)
+        {pc.get(i) = keyg.get(i) * it.getSpacing(i);}
+
+		// Check if the point is in the first sphere
+		if (sph1.isInside(pc) == true && sph2.isInside(pc) == false)
+		{
+			Point<3,double> pn = pc - p1;
+			pn /= pn.norm();
+			double theta = acos(pn * Point<3,double>({0.0,0.0,1.0}));
+			Point<3,double> pn_ = pn;
+			pn_[2] = 0.0;
+			pn_ /= pn_.norm();
+			double aphi = acos(pn_ * Point<3,double>({1.0,0.0,0.0}));
+
+			// Create a perturbation in the solid angle
+			if (theta > 0.6 && theta < 0.8 && aphi > 0.0 && aphi < 0.2)
+			{
+				grid.template insert<U>(key) = 0.5;
+				grid.template insert<V>(key) = 0.25;
+			}
+			else
+			{
+				grid.template insert<U>(key) = 1.0;
+				grid.template insert<V>(key) = 0.0;
+			}
+			grid.template insert<phi>(key) = sph_zero.distance(pc);
+			grid.template insert<normal>(key)[0] = pn[0];
+			grid.template insert<normal>(key)[1] = pn[1];
+			grid.template insert<normal>(key)[2] = pn[2];
+
+			// Old values U and V
+			grid.template insert<U_next>(key) = 0.0;
+			grid.template insert<V_next>(key) = 0.0;
+		}
+
+		++it;
+	}
+
+	//! \cond [init sphere channel] \endcond
+}
+
+template<unsigned int U_src,unsigned int V_src,unsigned int U_dst, unsigned int V_dst>
+void extend(sgrid_type & grid, size_t (& sz)[3],double (& spacing)[3])
+{
+	double delta = 1e-10;
+	double max = 0.0;
+
+	auto func_extend = [delta,&spacing](auto & grid, auto & ids,
+	                                 unsigned char * mask_sum)
+									 {
+										Vc::double_v phi_c;
+										Vc::double_v s;
+
+										Vc::double_v Uext = 0.0;
+										Vc::double_v Vext = 0.0;
+
+										Vc::double_v n[3];
+										Vc::double_v dir;
+
+										Vc::double_v Uc;
+										Vc::double_v Vc;
+										Vc::double_v Uc_xm;
+										Vc::double_v Vc_xm;
+										Vc::double_v Uc_ym;
+										Vc::double_v Vc_ym;
+										Vc::double_v Uc_zm;
+										Vc::double_v Vc_zm;
+
+										Vc::double_v Uc_xp;
+										Vc::double_v Vc_xp;
+										Vc::double_v Uc_yp;
+										Vc::double_v Vc_yp;
+										Vc::double_v Uc_zp;
+										Vc::double_v Vc_zp;
+
+										load_crs<x,0,phi>(phi_c,grid,ids);
+										load_crs_v<x,0,x,normal>(n[x],grid,ids);
+										load_crs_v<x,0,y,normal>(n[y],grid,ids);
+										load_crs_v<x,0,z,normal>(n[z],grid,ids);
+
+										load_crs<x,0,U_src>(Uc,grid,ids);
+										load_crs<x,0,V_src>(Vc,grid,ids);
+										load_crs<x,-1,U_src>(Uc_xm,grid,ids);
+										load_crs<x,-1,V_src>(Vc_xm,grid,ids);
+										load_crs<y,-1,U_src>(Uc_ym,grid,ids);
+										load_crs<y,-1,V_src>(Vc_ym,grid,ids);
+										load_crs<z,-1,U_src>(Uc_zm,grid,ids);
+										load_crs<z,-1,V_src>(Vc_zm,grid,ids);
+										load_crs<x,1,U_src>(Uc_xp,grid,ids);
+										load_crs<x,1,V_src>(Vc_xp,grid,ids);
+										load_crs<y,1,U_src>(Uc_yp,grid,ids);
+										load_crs<y,1,V_src>(Vc_yp,grid,ids);
+										load_crs<z,1,U_src>(Uc_zp,grid,ids);
+										load_crs<z,1,V_src>(Vc_zp,grid,ids);
+
+										s = phi_c / sqrt(phi_c*phi_c + delta*delta);
+
+										dir = s*n[0];
+										auto dir_pos = dir > 0;
+										auto dir_neg = dir < 0;
+
+										Uext += Vc::iif(dir_pos,dir * (Uc - Uc_xm)/spacing[0],Vc::double_v(0.0));
+										Vext += Vc::iif(dir_pos,dir * (Vc - Vc_xm)/spacing[0],Vc::double_v(0.0));
+										Uext += Vc::iif(dir_neg,dir * (Uc_xp - Uc)/spacing[0],Vc::double_v(0.0));
+										Vext += Vc::iif(dir_neg,dir * (Vc_xp - Vc)/spacing[0],Vc::double_v(0.0));
+
+										dir = s*n[1];
+										dir_pos = dir > 0;
+										dir_neg = dir < 0;
+
+										Uext += Vc::iif(dir_pos,dir * (Uc - Uc_ym)/spacing[1],Vc::double_v(0.0));
+										Vext += Vc::iif(dir_pos,dir * (Vc - Vc_ym)/spacing[1],Vc::double_v(0.0));
+										Uext += Vc::iif(dir_neg,dir * (Uc_yp - Uc)/spacing[1],Vc::double_v(0.0));
+										Vext += Vc::iif(dir_neg,dir * (Vc_yp - Vc)/spacing[1],Vc::double_v(0.0));
+
+										dir = s*n[2];
+										dir_pos = dir > 0;
+										dir_neg = dir < 0;
+
+										Uext += Vc::iif(dir_pos,dir * (Uc - Uc_zm)/spacing[2],Vc::double_v(0.0));
+										Vext += Vc::iif(dir_pos,dir * (Vc - Vc_zm)/spacing[2],Vc::double_v(0.0));
+										Uext += Vc::iif(dir_neg,dir * (Uc_zp - Uc)/spacing[2],Vc::double_v(0.0));
+										Vext += Vc::iif(dir_neg,dir * (Vc_zp - Vc)/spacing[2],Vc::double_v(0.0));
+
+										Uext = Uc - 0.0003*Uext;
+										Vext = Vc - 0.0003*Vext;
+
+										store_crs<U_dst>(grid,Uext,ids);
+										store_crs<V_dst>(grid,Vext,ids);
+									 };
+
+		grid.template conv_cross_ids<1,double>({0,0,0},{sz[0] - 1, sz[1] - 1, sz[2] - 1},func_extend);
+}
+
+int main(int argc, char* argv[])
+{
+	openfpm_init(&argc,&argv);
+
+	// domain
+	Box<3,double> domain({0.0,0.0,0.0},{2.5,2.5,2.5});
+	
+	// grid size
+    size_t sz[3] = {512,512,512};
+
+	// Define periodicity of the grid
+	periodicity<3> bc = {NON_PERIODIC,NON_PERIODIC,NON_PERIODIC};
+	
+	// Ghost in grid unit
+	Ghost<3,long int> g(1);
+	
+	// deltaT
+	double deltaT = 0.3;
+
+	// Diffusion constant for specie U
+	double du = 1*1e-5;
+
+	// Diffusion constant for specie V
+	double dv = 0.5*1e-5;
+
+#ifdef TEST_RUN
+        // Number of timesteps
+        size_t timeSteps = 200;
+#else
+	// Number of timesteps
+        size_t timeSteps = 100000;
+#endif
+
+	// K and F (Physical constant in the equation)
+        double K = 0.053;
+        double F = 0.014;
+
+	sgrid_type grid(sz,domain,g,bc);
+
+	
+	// spacing of the grid on x and y
+	double spacing[3] = {grid.spacing(0),grid.spacing(1),grid.spacing(2)};
+
+	init(grid,domain);
+
+	// sync the ghost
+	size_t count = 0;
+	grid.template ghost_get<U,V>();
+
+	// because we assume that spacing[x] == spacing[y] we use formula 2
+	// and we calculate the prefactor of Eq 2
+	double uFactor = deltaT * du;
+	double vFactor = deltaT * dv;
+
+	auto & v_cl = create_vcluster();
+
+	timer tot_sim;
+	tot_sim.start();
+
+	for (size_t i = 0; i < timeSteps ; ++i)
+	{
+		auto func_grad = [&spacing](auto & grid, auto & ids,
+                                 unsigned char * mask_sum){
+
+												Vc::double_v n[3];
+
+												Vc::double_v Uc;
+												Vc::double_v xmU;
+												Vc::double_v ymU;
+												Vc::double_v zmU;
+
+												Vc::double_v Vc;
+												Vc::double_v xmV;
+												Vc::double_v ymV;
+												Vc::double_v zmV;
+
+												Vc::double_v u_out[3];
+												Vc::double_v v_out[3];
+
+												load_crs<x,-1,U>(xmU,grid,ids);
+												load_crs<y,-1,U>(ymU,grid,ids);
+												load_crs<z,-1,U>(zmU,grid,ids);
+												load_crs<x,0,U>(Uc,grid,ids);
+
+												load_crs<x,-1,V>(xmV,grid,ids);
+												load_crs<y,-1,V>(ymV,grid,ids);
+												load_crs<z,-1,V>(zmV,grid,ids);
+												load_crs<x,0,V>(Vc,grid,ids);
+
+												load_crs_v<x,0,x,normal>(n[x],grid,ids);
+												load_crs_v<x,0,y,normal>(n[y],grid,ids);
+												load_crs_v<x,0,z,normal>(n[z],grid,ids);
+
+												u_out[0] = (1.0-n[0]*n[0])*(Uc - xmU)/spacing[0]  + (-n[1]*n[1])*(Uc - ymU)/spacing[1]    + (-n[2]*n[2])*(Uc - zmU)/spacing[2];
+												u_out[1] = (-n[0]*n[0])*(Uc - xmU)/spacing[0]    + (1.0-n[1]*n[1])*(Uc - ymU)/spacing[1] + (-n[2]*n[2])*(Uc - zmU)/spacing[2];
+												u_out[2] = (-n[0]*n[0])*(Uc - xmU)/spacing[0]    + (-n[1]*n[1])*(Uc - ymU)/spacing[1]    + (1.0-n[2]*n[2])*(Uc - zmU)/spacing[2];
+
+												v_out[0] = (1.0-n[0]*n[0])*(Vc - xmV)/spacing[0] + (-n[1]*n[1])*(Vc - ymV)/spacing[1]    + (-n[2]*n[2])*(Vc - zmV)/spacing[2];
+												v_out[1] = (-n[0]*n[0])*(Vc - xmV)/spacing[0]    + (1.0-n[1]*n[1])*(Vc - ymV)/spacing[1] + (-n[2]*n[2])*(Vc - zmV)/spacing[2];
+												v_out[2] = (-n[0]*n[0])*(Vc - xmV)/spacing[0]    + (-n[1]*n[1])*(Vc - ymV)/spacing[1]    + (1.0-n[2]*n[2])*(Vc - zmV)/spacing[2];
+
+												Vc::Mask<double> surround;
+
+												for (int i = 0 ; i < Vc::double_v::Size ; i++)
+												{surround[i] = (mask_sum[i] == 6);}
+
+												u_out[0] = Vc::iif(surround,u_out[0],Vc::double_v(0.0));
+												u_out[1] = Vc::iif(surround,u_out[1],Vc::double_v(0.0));
+												u_out[2] = Vc::iif(surround,u_out[2],Vc::double_v(0.0));
+
+												v_out[0] = Vc::iif(surround,v_out[0],Vc::double_v(0.0));
+												v_out[1] = Vc::iif(surround,v_out[1],Vc::double_v(0.0));
+												v_out[2] = Vc::iif(surround,v_out[2],Vc::double_v(0.0));
+
+												store_crs_v<tgrad_u,x>(grid,u_out[0],ids);
+												store_crs_v<tgrad_u,y>(grid,u_out[1],ids);
+												store_crs_v<tgrad_u,z>(grid,u_out[2],ids);
+
+												store_crs_v<tgrad_v,x>(grid,v_out[0],ids);
+												store_crs_v<tgrad_v,y>(grid,v_out[1],ids);
+												store_crs_v<tgrad_v,z>(grid,v_out[2],ids);
+											};
+
+		grid.template conv_cross_ids<1,double>({0,0,0},{sz[0]-1,sz[1] - 1,sz[2] - 1},func_grad);
+
+		auto func_lap = [&spacing,uFactor,vFactor,deltaT,K,F](auto & grid, auto & ids,
+                                 unsigned char * mask_sum){
+
+												Vc::double_v gradU_px;
+												Vc::double_v gradU_py;
+												Vc::double_v gradU_pz;
+
+												Vc::double_v gradU_x;
+												Vc::double_v gradU_y;
+												Vc::double_v gradU_z;
+
+												Vc::double_v gradV_px;
+												Vc::double_v gradV_py;
+												Vc::double_v gradV_pz;
+
+												Vc::double_v gradV_x;
+												Vc::double_v gradV_y;
+												Vc::double_v gradV_z;
+
+												Vc::double_v lapU;
+												Vc::double_v lapV;
+
+												Vc::double_v Uc;
+												Vc::double_v Vc;
+
+												Vc::double_v outU;
+												Vc::double_v outV;
+
+												load_crs_v<x,1,x,tgrad_u>(gradU_px,grid,ids);
+												load_crs_v<y,1,y,tgrad_u>(gradU_py,grid,ids);
+												load_crs_v<z,1,z,tgrad_u>(gradU_pz,grid,ids);
+
+												load_crs_v<x,0,x,tgrad_u>(gradU_x,grid,ids);
+												load_crs_v<x,0,y,tgrad_u>(gradU_y,grid,ids);
+												load_crs_v<x,0,z,tgrad_u>(gradU_z,grid,ids);
+
+												load_crs_v<x,1,x,tgrad_v>(gradV_px,grid,ids);
+												load_crs_v<y,1,y,tgrad_v>(gradV_py,grid,ids);
+												load_crs_v<z,1,z,tgrad_v>(gradV_pz,grid,ids);
+
+												load_crs_v<x,0,x,tgrad_v>(gradV_x,grid,ids);
+												load_crs_v<x,0,y,tgrad_v>(gradV_y,grid,ids);
+												load_crs_v<x,0,z,tgrad_v>(gradV_z,grid,ids);
+
+												load_crs<x,0,U>(Uc,grid,ids);
+												load_crs<x,0,V>(Vc,grid,ids);
+
+												lapU += (gradU_px - gradU_x) / spacing[0];
+												lapV += (gradV_px - gradV_x) / spacing[0];
+												lapU += (gradU_py - gradU_y) / spacing[1];
+												lapV += (gradV_py - gradV_y) / spacing[1];
+												lapU += (gradU_pz - gradU_z) / spacing[2];
+												lapV += (gradV_pz - gradV_z) / spacing[2];
+
+												// update based on Eq 2
+												outU = Uc + uFactor * lapU +
+																			- deltaT * Uc * Vc * Vc +
+																			- deltaT * F * (Uc - 1.0);
+
+
+												// update based on Eq 2
+												outV = Vc + vFactor * lapV +
+																			deltaT * Uc * Vc * Vc +
+																			- deltaT * (F+K) * Vc;
+
+												Vc::Mask<double> surround;
+
+												for (int i = 0 ; i < Vc::double_v::Size ; i++)
+												{surround[i] = (mask_sum[i] == 6);}
+
+
+												outU = Vc::iif(surround,outU,Uc);
+												outV = Vc::iif(surround,outV,Vc);
+
+												store_crs<U_next>(grid,outU,ids);
+												store_crs<V_next>(grid,outV,ids);
+											};
+
+		grid.template conv_cross_ids<1,double>({0,0,0},{sz[0]-1,sz[1] - 1,sz[2] - 1},func_lap);
+
+//		New.write_frame("update",i);
+
+		// Extend
+
+		if (i % 5 == 0)
+		{
+		for (int j = 0 ; j < 2 ; j++)
+		{
+			if (j % 2 == 0)
+			{extend<U_next,V_next,U,V>(grid,sz,spacing);}
+			else
+			{extend<U,V,U_next,V_next>(grid,sz,spacing);}
+
+			// Here we copy New into the old grid in preparation of the new step
+			// It would be better to alternate, but using this we can show the usage
+			// of the function copy. To note that copy work only on two grid of the same
+			// decomposition. If you want to copy also the decomposition, or force to be
+			// exactly the same, use Old = New
+			//New.copy_sparse(Old);
+		}
+		}
+
+/*		auto it = grid.getDomainIterator();
+
+		while (it.isNext())
+		{
+			// center point
+			auto Cp = it.get();
+
+			// update based on Eq 2
+			grid.insert<U>(Cp) = grid.get<U_next>(Cp);
+			grid.insert<V>(Cp) = grid.get<V_next>(Cp);
+
+			++it;
+		}*/
+
+		//! \cond [stencil get and use] \endcond
+
+		// After copy we synchronize again the ghost part U and V
+		grid.ghost_get<U,V>();
+
+		// Every 500 time step we output the configuration for
+		// visualization
+		if (i % 500 == 0)
+		{
+//			grid.save("output_" + std::to_string(count));
+			count++;
+		}
+
+		if (v_cl.rank() == 0)
+		{std::cout << "STEP: " << i  << "   " << std::endl;}
+		if (i % 1000 == 0)
+		{
+			grid.write_frame("out",i);
+		}
+	}
+
+	tot_sim.stop();
+	std::cout << "Total simulation: " << tot_sim.getwct() << std::endl;
+
+	//! \cond [time stepping] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse Gray Scott in 3D
+	 *
+	 * ## Finalize ##
+	 *
+	 * Deinitialize the library
+	 *
+	 * \snippet Grid/3_gray_scott/main.cpp finalize
+	 *
+	 */
+
+	//! \cond [finalize] \endcond
+
+	openfpm_finalize();
+
+	//! \cond [finalize] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse Gray Scott in 3D
+	 *
+	 * # Full code # {#code}
+	 *
+	 * \include Grid/3_gray_scott_3d/main.cpp
+	 *
+	 */
+}
diff --git a/example/SparseGrid/6_gray_scott_3d_sparse_gpu_opt_weak_scal/Makefile b/example/SparseGrid/6_gray_scott_3d_sparse_gpu_opt_weak_scal/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..25e2b3240d812248d2fb5edb4856844f86f296b8
--- /dev/null
+++ b/example/SparseGrid/6_gray_scott_3d_sparse_gpu_opt_weak_scal/Makefile
@@ -0,0 +1,45 @@
+include ../../example.mk
+
+### internally the example disable with the preprocessor its code if not compiled with nvcc 
+CUDA_CC=
+CUDA_CC_LINK=
+ifdef CUDA_ON_CPU
+	CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+        INCLUDE_PATH_NVCC=
+        CUDA_CC_LINK=mpic++
+else
+	ifeq (, $(shell which nvcc))
+        	CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+        	INCLUDE_PATH_NVCC=
+        	CUDA_CC_LINK=mpic++
+	else
+        	CUDA_CC=nvcc -ccbin=mpic++
+        	CUDA_CC_LINK=nvcc -ccbin=mpic++
+	endif
+endif
+
+gray_scott_sparse_gpu_test: OPT += -DTEST_RUN
+gray_scott_sparse_gpu_test: gray_scott_sparse_gpu
+
+CC=mpic++
+
+LDIR =
+
+OBJ = main.o
+
+%.o: %.cu
+	$(CUDA_CC) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)
+
+gray_scott_sparse_gpu: $(OBJ)
+	$(CUDA_CC_LINK) -o $@ $^ $(LIBS_PATH) $(LIBS)
+
+all: gray_scott_sparse_gpu
+
+run: gray_scott_sparse_gpu_test
+	mpirun --oversubscribe -np 4 ./gray_scott_sparse_gpu
+
+.PHONY: clean all run
+
+clean:
+	rm -f *.o *~ core gray_scott_sparse_gpu
+
diff --git a/example/SparseGrid/6_gray_scott_3d_sparse_gpu_opt_weak_scal/config.cfg b/example/SparseGrid/6_gray_scott_3d_sparse_gpu_opt_weak_scal/config.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..699be429e147cd40187be6ce345ef2f060f59fbc
--- /dev/null
+++ b/example/SparseGrid/6_gray_scott_3d_sparse_gpu_opt_weak_scal/config.cfg
@@ -0,0 +1,2 @@
+[pack]
+files = main.cu Makefile
diff --git a/example/SparseGrid/6_gray_scott_3d_sparse_gpu_opt_weak_scal/main.cu b/example/SparseGrid/6_gray_scott_3d_sparse_gpu_opt_weak_scal/main.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b0c2804c4b0db46c709a2e3bfcb780f63199e6b4
--- /dev/null
+++ b/example/SparseGrid/6_gray_scott_3d_sparse_gpu_opt_weak_scal/main.cu
@@ -0,0 +1,504 @@
+#include "Decomposition/Distribution/BoxDistribution.hpp"
+#include "Grid/grid_dist_id.hpp"
+#include "data_type/aggregate.hpp"
+#include "timer.hpp"
+
+/*!
+ *
+ * \page Grid_3_gs_3D_sparse_gpu Gray Scott in 3D using sparse grids on GPU
+ *
+ * [TOC]
+ *
+ * # Solving a gray scott-system in 3D using Sparse grids on gpu # {#e3_gs_gray_scott_gpu}
+ *
+ * This example show how to solve a Gray-Scott system in 3D using sparse grids on gpu
+ *
+ * In figure is the final solution of the problem
+ *
+ * \htmlonly
+ * <img src="http://ppmcore.mpi-cbg.de/web/images/examples/gray_scott_3d/gs_alpha.png"/>
+ * \endhtmlonly
+ *
+ * More or less this example is the adaptation of the dense example in 3D
+ *
+ * \see \ref Grid_3_gs_3D
+ *
+ * # Initializetion
+ *
+ * On gpu we can add points using the function addPoints this function take 2 lamda functions the first take 3 arguments (in 3D)
+ * i,j,k these are the global coordinates for a point. We can return either true either false. In case of true the point is
+ * created in case of false the point is not inserted. The second lamda is instead used to initialize the point inserted.
+ * The arguments of the second lambda are the data argument we use to initialize the point and the global coordinates i,j,k
+ *
+ * After we add the points we have to flush the added points. This us achieved using the function flush the template parameters
+ * indicate how we have to act on the points. Consider infact we are adding points already exist ... do we have to add it using the max
+ * or the min. **FLUSH_ON_DEVICE** say instead that the operation is performed using the GPU
+ *
+ * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu create points
+ *
+ * The function can also called with a specified range
+ *
+ * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu create points sub
+ *
+ * # Update
+ *
+ * to calculate the right-hand-side we use the function **conv2** this function can be used to do a convolution that involve
+ * two properties
+ *
+ * The function accept a lambda function where the first 2 arguments are the output of the same type of the two property choosen.
+ *
+ * The arguments 3 and 4 contain the properties of two selected properties. while i,j,k are the coordinates we have to calculate the
+ * convolution. The call **conv2** also accept template parameters the first two indicate the source porperties, the other two are the destination properties. While the
+ * last is the extension of the stencil. In this case we use 1.
+ *
+ * The lambda function is defined as
+ *
+ * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu lambda
+ *
+ * and used in the body loop
+ *
+ * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu body
+ *
+ */
+
+#ifdef __NVCC__
+
+constexpr int U = 0;
+constexpr int V = 1;
+
+constexpr int U_next = 2;
+constexpr int V_next = 3;
+
+constexpr int x = 0;
+constexpr int y = 1;
+constexpr int z = 2;
+
+typedef CartDecomposition<3,float, CudaMemory, memory_traits_inte, BoxDistribution<3,float> > Dec;
+
+typedef sgrid_dist_id_gpu<3,float,aggregate<float,float,float,float>,CudaMemory, Dec> SparseGridType;
+
+void init(SparseGridType & grid, Box<3,float> & domain, size_t (& div)[3])
+{
+	//! \cond [create points] \endcond
+
+	double spacing_x = grid.spacing(0);
+	double spacing_y = grid.spacing(1);
+	double spacing_z = grid.spacing(2);
+
+	typedef typename GetAddBlockType<SparseGridType>::type InsertBlockT;
+
+	// Get the processor domain in continuos
+
+	for (int i = 0 ; i < div[0] ; i++)
+	{
+		for (int j = 0 ; j < div[1] ; j++)
+		{
+			for (int k = 0 ; k < div[2] ; k++)
+			{
+				Point<3,double> p({0.5+i*1.0,0.5+j*1.0,0.5+k*1.0});
+				Sphere<3,double> sph(p,0.3);
+
+				Box<3,size_t> bx;
+
+				for (int s = 0 ; s < 3 ; s++)
+				{
+					bx.setLow(s,(size_t)((sph.center(s) - 0.31)/grid.spacing(s)));
+					bx.setHigh(s,(size_t)((sph.center(s) + 0.31)/grid.spacing(s)));
+				}
+
+				grid.addPoints([spacing_x,spacing_y,spacing_z,sph] __device__ (int i, int j, int k)
+			        	{
+							Point<3,double> pc({i*spacing_x,j*spacing_y,k*spacing_z});
+
+							// Check if the point is in the domain
+											if (sph.isInside(pc) )
+											{return true;}
+											return false;
+			        	},
+			        	[] __device__ (InsertBlockT & data, int i, int j, int k)
+			        	{
+			        		data.template get<U>() = 1.0;
+			        		data.template get<V>() = 0.0;
+			        	}
+					);
+					
+				grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE);
+				grid.removeUnusedBuffers();
+			}
+		}
+	}
+
+	for (int i = 0 ; i < div[0] ; i++)
+	{
+			for (int j = 0 ; j < div[1] ; j++)
+			{
+				Point<3,double> u({0.0,0.0,1.0});
+				Point<3,double> c({0.5+i,0.5+j,0.0});
+
+				Box<3,size_t> bx;
+
+				bx.setLow(0,(0.4+i)/spacing_x);
+				bx.setHigh(0,(0.6+i)/spacing_x);
+
+				bx.setLow(1,(0.4+j)/spacing_y);
+				bx.setHigh(1,(0.6+j)/spacing_y);
+
+				bx.setLow(2,0);
+				bx.setHigh(2,(size_t)grid.size(2));
+
+				grid.addPoints(bx.getKP1(),bx.getKP2(),[spacing_x,spacing_y,spacing_z,u,c] __device__ (int i, int j, int k)
+									{
+						Point<3,double> pc({i*spacing_x,j*spacing_y,k*spacing_z});
+												Point<3,double> pcs({i*spacing_x,j*spacing_y,k*spacing_z});
+												Point<3,double> vp;
+
+						// shift
+						pc -= c; 
+
+										// calculate the distance from the diagonal
+										vp.get(0) = pc.get(1)*u.get(2) - pc.get(2)*u.get(1);
+										vp.get(1) = pc.get(2)*u.get(0) - pc.get(0)*u.get(2);
+										vp.get(2) = pc.get(0)*u.get(1) - pc.get(1)*u.get(0);
+
+						double distance = vp.norm();
+
+												// Check if the point is in the domain
+												if (distance < 0.1 )
+												{return true;}
+
+												return false;
+									},
+									[] __device__ (InsertBlockT & data, int i, int j, int k)
+									{
+											data.template get<U>() = 1.0;
+											data.template get<V>() = 0.0;
+									}
+								);
+
+				grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE);
+				grid.removeUnusedBuffers();
+			}
+	}
+
+	for (int i = 0 ; i < div[0] ; i++)
+	{
+			for (int k = 0 ; k < div[2] ; k++)
+			{
+				Point<3,double> u({0.0,1.0,0.0});
+				Point<3,double> c({0.5+i,0.0,0.5+k});
+
+				Box<3,size_t> bx;
+
+				bx.setLow(0,(0.4+i)/spacing_x);
+				bx.setHigh(0,(0.6+i)/spacing_x);
+
+				bx.setLow(2,(0.4+k)/spacing_z);
+				bx.setHigh(2,(0.6+k)/spacing_z);
+
+				bx.setLow(1,0);
+				bx.setHigh(1,(size_t)grid.size(1));
+
+				grid.addPoints(bx.getKP1(),bx.getKP2(),[spacing_x,spacing_y,spacing_z,u,c] __device__ (int i, int j, int k)
+									{
+						Point<3,double> pc({i*spacing_x,j*spacing_y,k*spacing_z});
+												Point<3,double> pcs({i*spacing_x,j*spacing_y,k*spacing_z});
+												Point<3,double> vp;
+
+						// shift
+						pc -= c; 
+
+										// calculate the distance from the diagonal
+										vp.get(0) = pc.get(1)*u.get(2) - pc.get(2)*u.get(1);
+										vp.get(1) = pc.get(2)*u.get(0) - pc.get(0)*u.get(2);
+										vp.get(2) = pc.get(0)*u.get(1) - pc.get(1)*u.get(0);
+
+						double distance = vp.norm();
+
+												// Check if the point is in the domain
+												if (distance < 0.1 )
+												{return true;}
+
+												return false;
+									},
+									[] __device__ (InsertBlockT & data, int i, int j, int k)
+									{
+											data.template get<U>() = 1.0;
+											data.template get<V>() = 0.0;
+									}
+								);
+
+				grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE);
+				grid.removeUnusedBuffers();
+			}
+	}
+
+	for (int j = 0 ; j < div[1] ; j++)
+	{
+			for (int k = 0 ; k < div[2] ; k++)
+			{
+				Point<3,double> u({1.0,0.0,0.0});
+				Point<3,double> c({0.0,0.5+j,0.5+k});
+
+				Box<3,size_t> bx;
+
+				bx.setLow(1,(0.4+j)/spacing_y);
+				bx.setHigh(1,(0.6+j)/spacing_y);
+
+				bx.setLow(2,(0.4+k)/spacing_z);
+				bx.setHigh(2,(0.6+k)/spacing_z);
+
+				bx.setLow(0,0);
+				bx.setHigh(0,(size_t)grid.size(0));
+
+				grid.addPoints(bx.getKP1(),bx.getKP2(),[spacing_x,spacing_y,spacing_z,u,c] __device__ (int i, int j, int k)
+									{
+						Point<3,double> pc({i*spacing_x,j*spacing_y,k*spacing_z});
+												Point<3,double> pcs({i*spacing_x,j*spacing_y,k*spacing_z});
+												Point<3,double> vp;
+
+						// shift
+						pc -= c; 
+
+										// calculate the distance from the diagonal
+										vp.get(0) = pc.get(1)*u.get(2) - pc.get(2)*u.get(1);
+										vp.get(1) = pc.get(2)*u.get(0) - pc.get(0)*u.get(2);
+										vp.get(2) = pc.get(0)*u.get(1) - pc.get(1)*u.get(0);
+
+						double distance = vp.norm();
+
+												// Check if the point is in the domain
+												if (distance < 0.1 )
+												{return true;}
+
+												return false;
+									},
+									[] __device__ (InsertBlockT & data, int i, int j, int k)
+									{
+											data.template get<U>() = 1.0;
+											data.template get<V>() = 0.0;
+									}
+								);
+
+				grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE);
+				grid.removeUnusedBuffers();
+			}
+	}
+
+	//! \cond [create points] \endcond
+
+	long int x_start = grid.size(0)*0.4f/domain.getHigh(0);
+	long int y_start = grid.size(1)*0.4f/domain.getHigh(1);
+	long int z_start = grid.size(1)*0.4f/domain.getHigh(2);
+
+	long int x_stop = grid.size(0)*0.6f/domain.getHigh(0);
+	long int y_stop = grid.size(1)*0.6f/domain.getHigh(1);
+	long int z_stop = grid.size(1)*0.6f/domain.getHigh(2);
+
+	//! \cond [create points sub] \endcond
+
+	grid_key_dx<3> start({x_start,y_start,z_start});
+	grid_key_dx<3> stop ({x_stop,y_stop,z_stop});
+
+        grid.addPoints(start,stop,[] __device__ (int i, int j, int k)
+                                {
+                                                return true;
+                                },
+                                [] __device__ (InsertBlockT & data, int i, int j, int k)
+                                {
+                                        data.template get<U>() = 0.5;
+                                        data.template get<V>() = 0.24;
+                                }
+                                );
+
+	grid.template flush<smin_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE);
+
+	//! \cond [create points sub] \endcond
+}
+
+
+int main(int argc, char* argv[])
+{
+	openfpm_init(&argc,&argv);
+
+	// First we check which type of decomposition BoxDistritubion prodice
+	auto & v_cl = create_vcluster();
+	
+	openfpm::vector<int> facts;
+	getPrimeFactors(v_cl.size(),facts);
+
+	size_t div[3];
+
+	for (int i = 0 ; i < 3 ; i++)
+	{div[i] = 1;}
+
+	for (int i = 0 ; i < facts.size() ; i++)
+	{div[i % 3] *= facts.get(i);}
+
+	grid_sm<3,void> gdist(div);
+
+	// domain
+	Box<3,float> domain({0.0,0.0,0.0},{div[0]*1.0f,div[1]*1.0f,div[2]*1.0f});
+	
+	// grid size
+	size_t sz[3] = {64*div[0],64*div[1],64*div[2]};
+
+	// Define periodicity of the grid
+	periodicity<3> bc = {PERIODIC,PERIODIC,PERIODIC};
+	
+	// Ghost in grid unit
+	Ghost<3,long int> g(1);
+	
+	// deltaT
+	float deltaT = 0.25;
+
+	// Diffusion constant for specie U
+	float du = 2*1e-5;
+
+	// Diffusion constant for specie V
+	float dv = 1*1e-5;
+
+	// Number of timesteps
+#ifdef TEST_RUN
+	size_t timeSteps = 300;
+#else
+        size_t timeSteps = 15000;
+#endif
+
+	// K and F (Physical constant in the equation)
+    float K = 0.053;
+    float F = 0.014;
+
+	SparseGridType grid(sz,domain,g,bc,0,gdist);
+
+	// spacing of the grid on x and y
+	float spacing[3] = {grid.spacing(0),grid.spacing(1),grid.spacing(2)};
+
+	init(grid,domain,div);
+
+        grid.deviceToHost<U,V,U_next,V_next>();
+        grid.write("final");
+
+	openfpm_finalize();
+	return 0;
+
+	// sync the ghost
+	grid.template ghost_get<U,V>(RUN_ON_DEVICE);
+
+	// because we assume that spacing[x] == spacing[y] we use formula 2
+	// and we calculate the prefactor of Eq 2
+	float uFactor = deltaT * du/(spacing[x]*spacing[x]);
+	float vFactor = deltaT * dv/(spacing[x]*spacing[x]);
+
+	timer tot_sim;
+	tot_sim.start();
+
+	for (size_t i = 0; i < timeSteps ; ++i)
+	{
+		if (v_cl.rank() == 0)
+		{std::cout << "STEP: " << i << std::endl;}
+/*		if (i % 300 == 0)
+		{
+			std::cout << "STEP: " << i << std::endl;
+			grid.write_frame("out",i,VTK_WRITER);
+		}*/
+
+		//! \cond [stencil get and use] \endcond
+
+		typedef typename GetCpBlockType<decltype(grid),0,1>::type CpBlockType;
+
+		//! \cond [lambda] \endcond
+
+		auto func = [uFactor,vFactor,deltaT,F,K] __device__ (float & u_out, float & v_out,
+				                                   CpBlockType & u, CpBlockType & v,
+				                                   int i, int j, int k){
+
+				float uc = u(i,j,k);
+				float vc = v(i,j,k);
+
+				u_out = uc + uFactor *(u(i-1,j,k) + u(i+1,j,k) +
+                                                       u(i,j-1,k) + u(i,j+1,k) +
+                                                       u(i,j,k-1) + u(i,j,k+1) - 6.0f*uc) - deltaT * uc*vc*vc
+                                                       - deltaT * F * (uc - 1.0f);
+
+
+				v_out = vc + vFactor *(v(i-1,j,k) + v(i+1,j,k) +
+                                                       v(i,j+1,k) + v(i,j-1,k) +
+                                                       v(i,j,k-1) + v(i,j,k+1) - 6.0f*vc) + deltaT * uc*vc*vc
+					               - deltaT * (F+K) * vc;
+				};
+
+		//! \cond [lambda] \endcond
+
+		//! \cond [body] \endcond
+
+		if (i % 2 == 0)
+		{
+			grid.conv2<U,V,U_next,V_next,1>({0,0,0},{(long int)sz[0]-1,(long int)sz[1]-1,(long int)sz[2]-1},func);
+
+			// After copy we synchronize again the ghost part U and V
+
+			grid.ghost_get<U_next,V_next>(RUN_ON_DEVICE | SKIP_LABELLING);
+		}
+		else
+		{
+			grid.conv2<U_next,V_next,U,V,1>({0,0,0},{(long int)sz[0]-1,(long int)sz[1]-1,(long int)sz[2]-1},func);
+
+			// After copy we synchronize again the ghost part U and V
+			grid.ghost_get<U,V>(RUN_ON_DEVICE | SKIP_LABELLING);
+		}
+
+		//! \cond [body] \endcond
+
+		// Every 500 time step we output the configuration for
+		// visualization
+//		if (i % 500 == 0)
+//		{
+//			grid.save("output_" + std::to_string(count));
+//			count++;
+//		}
+	}
+	
+	tot_sim.stop();
+	std::cout << "Total simulation: " << tot_sim.getwct() << std::endl;
+
+	grid.deviceToHost<U,V,U_next,V_next>();
+	grid.write("final");
+
+	//! \cond [time stepping] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse Gray Scott in 3D
+	 *
+	 * ## Finalize ##
+	 *
+	 * Deinitialize the library
+	 *
+	 * \snippet Grid/3_gray_scott/main.cpp finalize
+	 *
+	 */
+
+	//! \cond [finalize] \endcond
+
+	openfpm_finalize();
+
+	//! \cond [finalize] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse Gray Scott in 3D
+	 *
+	 * # Full code # {#code}
+	 *
+	 * \include Grid/3_gray_scott_3d/main.cpp
+	 *
+	 */
+}
+
+#else
+
+int main(int argc, char* argv[])
+{
+        return 0;
+}
+
+#endif
+
diff --git a/example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/Makefile b/example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..15133370060405ef2c77e59de99de4d9ecd6eac5
--- /dev/null
+++ b/example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/Makefile
@@ -0,0 +1,52 @@
+include ../../example.mk
+
+### internally the example disable with the preprocessor its code if not compiled with nvcc 
+
+CUDA_CC=
+CUDA_CC_LINK=
+ifdef CUDA_ON_CPU
+        CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+        INCLUDE_PATH_NVCC=
+        CUDA_CC_LINK=mpic++
+        CUDA_OPTIONS=-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000
+        LIBS_SELECT=$(LIBS_CUDA_ON_CPU)
+else
+        ifeq (, $(shell which nvcc))
+                CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+                INCLUDE_PATH_NVCC=
+                CUDA_CC_LINK=mpic++
+                CUDA_OPTIONS=
+        else
+                CUDA_CC=nvcc -ccbin=mpic++
+                CUDA_CC_LINK=nvcc -ccbin=mpic++
+                CUDA_OPTIONS=-use_fast_math  -arch=sm_61 -lineinfo
+        endif
+        LIBS_SELECT=$(LIBS)
+endif
+
+
+gray_scott_sparse_gpu_test: OPT += -DTEST_RUN
+gray_scott_sparse_gpu_test: gray_scott_sparse_gpu
+
+CC=mpic++
+
+LDIR =
+
+OBJ = main.o
+
+%.o: %.cu
+	$(CUDA_CC) $(OPT) -O3 -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)
+
+gray_scott_sparse_gpu: $(OBJ)
+	$(CUDA_CC_LINK) -o $@ $^ $(LIBS_PATH) $(LIBS_SELECT)
+
+all: gray_scott_sparse_gpu
+
+run: gray_scott_sparse_gpu_test
+	mpirun --oversubscribe -np 4 ./gray_scott_sparse_gpu
+
+.PHONY: clean all run
+
+clean:
+	rm -f *.o *~ core gray_scott_sparse_gpu
+
diff --git a/example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/config.cfg b/example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/config.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..699be429e147cd40187be6ce345ef2f060f59fbc
--- /dev/null
+++ b/example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/config.cfg
@@ -0,0 +1,2 @@
+[pack]
+files = main.cu Makefile
diff --git a/example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/main.cu b/example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/main.cu
new file mode 100644
index 0000000000000000000000000000000000000000..04904f3933a23d4ad7372cc480a98b8759934c26
--- /dev/null
+++ b/example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/main.cu
@@ -0,0 +1,284 @@
+#define SYNC_BEFORE_TAKE_TIME
+#include "Decomposition/Distribution/BoxDistribution.hpp"
+#include "Grid/grid_dist_id.hpp"
+#include "data_type/aggregate.hpp"
+#include "timer.hpp"
+
+/*!
+ *
+ * \page Grid_3_gs_3D_sparse_gpu_cs Gray Scott in 3D using sparse grids on gpu in complex geometry
+ *
+ * [TOC]
+ *
+ * # Solving a gray scott-system in 3D using Sparse grids# {#e3_gs_gray_scott}
+ *
+ * This example show how to solve a Gray-Scott system in 3D using sparse grids on gpu with complex geometry
+ *
+ * In figure is the final solution of the problem
+ *
+ * \htmlonly
+<table border="1" bgcolor="black">
+  <tr>
+    <td>
+      <img src="http://ppmcore.mpi-cbg.de/web/images/examples/1_gray_scott_3d_sparse_cs/gs_3d_sparse_cs_section.png" style="width: 500px;" />
+    </td>
+    <td>
+      <img src="http://ppmcore.mpi-cbg.de/web/images/examples/1_gray_scott_3d_sparse_cs/gs_3d_sparse_cs.png" style="width: 500px;" />
+    </td>
+  </tr>
+</table>
+\endhtmlonly
+ *
+ * More or less this example is the same of \ref e3_gs_gray_scott_cs on gpu using what we learned in \ref e3_gs_gray_scott_gpu
+ *
+ *
+ */
+
+#ifdef __NVCC__
+
+constexpr int U = 0;
+constexpr int V = 1;
+constexpr int U_next = 2;
+constexpr int V_next = 3;
+
+typedef CartDecomposition<3,double, CudaMemory, memory_traits_inte, BoxDistribution<3,double> > Dec;
+
+typedef sgrid_dist_id_gpu<3,double,aggregate<double,double,double,double>, CudaMemory,Dec > sgrid_type;
+
+void draw_oscillation_shock(sgrid_type & grid, Box<3,double> & domain)
+{
+	auto it = grid.getGridIterator();
+	Point<3,double> p({1.25,1.25,1.25});
+
+	
+//	Point<3,double> u({1.0,0.0,0.0});
+//	Box<3,double> channel_box(p3,p1);
+
+	double spacing_x = grid.spacing(0);
+	double spacing_y = grid.spacing(1);
+	double spacing_z = grid.spacing(2);
+
+	typedef typename GetAddBlockType<sgrid_type>::type InsertBlockT;
+
+	// Draw a shock expanding from 0.4 to 0.8 and than contracting from 0.8 to 0.4
+	for (int i = 0 ; i < 100 ; i++)
+	{
+		Sphere<3,double> sph(p,0.2 + (double)i/160.0);
+		Sphere<3,double> sph2(p,0.4 + (double)i/160.0);
+
+		Box<3,size_t> bx;
+
+		for (int j = 0 ; j < 3 ; j++)
+		{
+			bx.setLow(j,(size_t)((sph.center(j) - 0.4 - (double)i/160.0)/grid.spacing(j)));
+			bx.setHigh(j,(size_t)((sph.center(j) + 0.4 + (double)i/160.0)/grid.spacing(j)));
+		}
+
+		timer t_add;
+		t_add.start();
+
+		grid.addPoints(bx.getKP1(),bx.getKP2(),[spacing_x,spacing_y,spacing_z,sph,sph2] __device__ (int i, int j, int k)
+                                {
+                                                Point<3,double> pc({i*spacing_x,j*spacing_y,k*spacing_z});
+
+						// Check if the point is in the domain
+                                		if (sph2.isInside(pc) )
+                                		{
+							if (sph.isInside(pc) == false)
+							{return true;}
+						}
+
+                                                return false;
+                                },
+                                [] __device__ (InsertBlockT & data, int i, int j, int k)
+                                {
+                                        data.template get<U>() = 1.0;
+                                        data.template get<V>() = 0.0;
+                                }
+                                );
+
+		t_add.stop();
+
+		timer t_flush;
+                t_flush.start();
+                grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE);
+                t_flush.stop();
+
+                timer t_ghost;
+                t_ghost.start();
+                grid.template ghost_get<U,V>(RUN_ON_DEVICE);
+                t_ghost.stop();
+                timer t_ghost2;
+                t_ghost2.start();
+                grid.template ghost_get<U,V>(RUN_ON_DEVICE | SKIP_LABELLING);
+                t_ghost2.stop();
+                std::cout << t_ghost.getwct() << std::endl;
+
+                std::cout << "TIME ghost1: " << t_ghost.getwct() << "  ghost2: " << t_ghost2.getwct()  << " flush: " <<  t_flush.getwct() << " " << std::endl;
+
+
+		grid.removeUnusedBuffers();
+
+	}
+
+	std::cout << "Second Pass" <<std::endl;
+
+	for (int i = 0 ; i < 100 ; i++)
+	{
+		Sphere<3,double> sph(p,0.2 + (double)i/160.0);
+		Sphere<3,double> sph2(p,0.4 + (double)i/160.0);
+
+		Box<3,size_t> bx;
+
+		for (int j = 0 ; j < 3 ; j++)
+		{
+			bx.setLow(j,(size_t)((sph.center(j) - 0.4 - (double)i/160.0)/grid.spacing(j)));
+			bx.setHigh(j,(size_t)((sph.center(j) + 0.4 + (double)i/160.0)/grid.spacing(j)));
+		}
+
+		timer t_add;
+		t_add.start();
+
+		grid.addPoints(bx.getKP1(),bx.getKP2(),[spacing_x,spacing_y,spacing_z,sph,sph2] __device__ (int i, int j, int k)
+                                {
+                                                Point<3,double> pc({i*spacing_x,j*spacing_y,k*spacing_z});
+
+						// Check if the point is in the domain
+                                		if (sph2.isInside(pc) )
+                                		{
+							if (sph.isInside(pc) == false)
+							{return true;}
+						}
+
+                                                return false;
+                                },
+                                [] __device__ (InsertBlockT & data, int i, int j, int k)
+                                {
+                                        data.template get<U>() = 1.0;
+                                        data.template get<V>() = 0.0;
+                                }
+                                );
+
+		t_add.stop();
+
+
+		timer t_flush;
+                t_flush.start();
+		grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE);
+		t_flush.stop();
+//		grid.removeUnusedBuffers();
+
+
+		timer t_ghost;
+		t_ghost.start();
+		grid.template ghost_get<U,V>(RUN_ON_DEVICE);
+		t_ghost.stop();
+		timer t_ghost2;
+                t_ghost2.start();
+                grid.template ghost_get<U,V>(RUN_ON_DEVICE | SKIP_LABELLING);
+                t_ghost2.stop();
+
+		std::cout << "TIME ghost1: " << t_ghost.getwct() << "  ghost2: " << t_ghost2.getwct()  << " flush: " <<  t_flush.getwct() << " " << std::endl;
+
+//		if (i % 10 == 0)
+//		{
+//			grid.template deviceToHost<U,V>();
+//        		grid.write_frame("Final",i);
+//		}
+	}
+
+}
+
+
+int main(int argc, char* argv[])
+{
+	openfpm_init(&argc,&argv);
+
+	// domain
+	Box<3,double> domain({0.0,0.0,0.0},{2.5,2.5,2.5});
+	
+	// grid size
+        size_t sz[3] = {384,384,384};
+
+	// Define periodicity of the grid
+	periodicity<3> bc = {NON_PERIODIC,NON_PERIODIC,NON_PERIODIC};
+	
+	// Ghost in grid unit
+	Ghost<3,long int> g(1);
+	
+	// deltaT
+	double deltaT = 0.025;
+
+	// Diffusion constant for specie U
+	double du = 2*1e-5;
+
+	// Diffusion constant for specie V
+	double dv = 1*1e-5;
+
+#ifdef TEST_RUN
+        // Number of timesteps
+        size_t timeSteps = 300;
+#else
+	// Number of timesteps
+        size_t timeSteps = 50000;
+#endif
+
+	// K and F (Physical constant in the equation)
+        double K = 0.053;
+        double F = 0.014;
+
+	grid_sm<3,void> gv({3,1,1});
+
+	sgrid_type grid(sz,domain,g,bc,0,gv);
+
+	grid.template setBackgroundValue<0>(-0.5);
+	grid.template setBackgroundValue<1>(-0.5);
+	grid.template setBackgroundValue<2>(-0.5);
+	grid.template setBackgroundValue<3>(-0.5);
+	
+	// spacing of the grid on x and y
+	double spacing[3] = {grid.spacing(0),grid.spacing(1),grid.spacing(2)};
+
+	draw_oscillation_shock(grid,domain);
+
+	grid.template deviceToHost<U,V>();
+	grid.write("Final");
+
+	//! \cond [time stepping] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse_gpu_cs Gray Scott in 3D using sparse grids on gpu in complex geometry
+	 *
+	 * ## Finalize ##
+	 *
+	 * Deinitialize the library
+	 *
+	 * \snippet  SparseGrid/1_gray_scott_3d_sparse_gpu_cs/main.cu finalize
+	 *
+	 */
+
+	//! \cond [finalize] \endcond
+
+	openfpm_finalize();
+
+	//! \cond [finalize] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse_gpu_cs Gray Scott in 3D using sparse grids on gpu in complex geometry
+	 *
+	 * # Full code # {#code}
+	 *
+	 * \include SparseGrid/1_gray_scott_3d_sparse_gpu_cs/main.cu
+	 *
+	 */
+}
+
+#else
+
+int main(int argc, char* argv[])
+{
+        return 0;
+}
+
+#endif
+
diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_opt/Makefile b/example/SparseGrid/8_filling_benchmark/Makefile
similarity index 89%
rename from example/SparseGrid/1_gray_scott_3d_sparse_opt/Makefile
rename to example/SparseGrid/8_filling_benchmark/Makefile
index 5bcf5675ed96275648c74c8e616194275dd3b968..94629ac16d26f46dee525a56e3efa9159628326c 100644
--- a/example/SparseGrid/1_gray_scott_3d_sparse_opt/Makefile
+++ b/example/SparseGrid/8_filling_benchmark/Makefile
@@ -17,7 +17,7 @@ gray_scott_sparse_opt: $(OBJ)
 all: gray_scott_sparse_opt
 
 run: gray_scott_sparse_opt_test
-	mpirun -np 4 ./gray_scott_sparse_opt
+	mpirun --oversubscribe -np 4 ./gray_scott_sparse_opt
 
 .PHONY: clean all run
 
diff --git a/example/SparseGrid/8_filling_benchmark/config.cfg b/example/SparseGrid/8_filling_benchmark/config.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..1eecbac3577c765edca7f90cf5f61cfb6b9f4880
--- /dev/null
+++ b/example/SparseGrid/8_filling_benchmark/config.cfg
@@ -0,0 +1,2 @@
+[pack]
+files = main.cpp Makefile
diff --git a/example/SparseGrid/8_filling_benchmark/main.cpp b/example/SparseGrid/8_filling_benchmark/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..97e732590e893d51caa25fc2f328c5f01b903f91
--- /dev/null
+++ b/example/SparseGrid/8_filling_benchmark/main.cpp
@@ -0,0 +1,226 @@
+
+#include "util/cuda_launch.hpp"
+#include "Grid/grid_dist_id.hpp"
+#include "data_type/aggregate.hpp"
+#include "timer.hpp"
+
+/*!
+ *
+ * \page Grid_3_gs_3D_sparse_opt Gray Scott in 3D using sparse grids optimized on CPU
+ *
+ * [TOC]
+ *
+ * # Solving a gray scott-system in 3D using sparse grids optimized on CPU # {#e3_gs_gray_scott_opt}
+ *
+ * This example show how to solve a Gray-Scott system in 3D using sparse grids in an optimized way
+ *
+ * In figure is the final solution of the problem
+ *
+ * \htmlonly
+ * <img src="http://ppmcore.mpi-cbg.de/web/images/examples/gray_scott_3d/gs_alpha.png"/>
+ * \endhtmlonly
+ *
+ * More or less this example is the adaptation of the dense example in 3D
+ *
+ * \see \ref Grid_3_gs_3D
+ *
+ * This example is the same as \ref e3_gs_gray_scott_sparse the difference is optimizing for speed.
+ *
+ * Two optimization has been done. The first is to change the layout to struct of arrays defining the grid with
+ *
+ * \snippet SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp grid definition
+ *
+ * The second is using the function **conv_cross2** to calculate the right-hand-side
+ * this function can be used to do a convolution that involve points in a cross stencil like in figure that involve
+ * two properties
+ *
+\verbatim
+
+     *
+     *
+ * * x * *
+     *
+     *
+
+\endverbatim
+ *
+ * The function accept a lambda function where the first 2 arguments are the output in form of Vc::double_v. If we use float
+ * we have to use Vc::float_v or Vc::int_v in case the property is an integer. Vc variables come from the Vc library that is
+ * now integrated in openfpm.
+ *
+ *\htmlonly
+ * <a href="https://github.com/VcDevel/Vc" >Vc Library</a>
+ *\endhtmlonly
+ *
+ * Vc::double_v in general pack 1,2,4 doubles dependently from the fact we choose to activate no-SSE,SSE or AVX at compiler level.
+ * The arguments 3 and 4 contain the properties of two selected properties in the cross pattern given by xm xp ym yp zm zp.
+ * The last arguments is instead the mask. The mask can be accessed to check the number of existing points. For example if
+ * we have a cross stencil in 3D with stencil size = 1 than we expect 6 points. Note that the mask is an array because if Vc::double_v
+ * contain 4 doubles than the mask has 4 elements accessed with the array operator []. The call **cross_conv2** also accept
+ * template parameters the first two indicate the source porperties, the other two are the destination properties. While the
+ * last is the extension of the stencil. In this case we use 1.
+ *
+ * The lambda function is defined as
+ *
+ * \snippet SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp lambda
+ *
+ * and used in the body loop
+ *
+ * \snippet SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp body
+ *
+ * To note that instead of copy we split the properties where we are acting at every iteration
+ *
+ */
+
+constexpr int U = 0;
+constexpr int V = 1;
+
+constexpr int U_next = 2;
+constexpr int V_next = 3;
+
+constexpr int x = 0;
+constexpr int y = 1;
+constexpr int z = 2;
+
+void init(sgrid_dist_soa<3,double,aggregate<double,double,double,double> > & grid, Box<3,double> & domain)
+{
+	for (int i = 0 ; i < 10 ; i++)
+	{
+		timer t;
+		t.start();
+
+		auto it = grid.getGridIterator();
+
+		while (it.isNext())
+		{
+			// Get the local grid key
+			auto key = it.get_dist();
+
+			// Old values U and V
+			grid.template insert<U>(key) = 1.0;
+
+			++it;
+		}
+
+		t.stop();
+		std::cout << "Time populate: " << t.getwct() << std::endl;
+
+		grid.clear();
+
+                timer t2;
+                t2.start();
+
+                auto it2 = grid.getGridIterator();
+
+                while (it2.isNext())
+                {
+                        // Get the local grid key
+                        auto key = it2.get_dist();
+
+                        // Old values U and V
+                        grid.template insert<U>(key) = 5.0;
+         
+                        ++it2;
+                }
+
+		t2.stop();
+                std::cout << "Time populate: " << t2.getwct() << std::endl;
+
+	}
+}
+
+
+int main(int argc, char* argv[])
+{
+	openfpm_init(&argc,&argv);
+
+	// domain
+	Box<3,double> domain({0.0,0.0,0.0},{2.5,2.5,2.5});
+	
+	// grid size
+        size_t sz[3] = {256,256,256};
+
+	// Define periodicity of the grid
+	periodicity<3> bc = {PERIODIC,PERIODIC,PERIODIC};
+	
+	// Ghost in grid unit
+	Ghost<3,long int> g(1);
+	
+	// deltaT
+	double deltaT = 0.25;
+
+	// Diffusion constant for specie U
+	double du = 2*1e-5;
+
+	// Diffusion constant for specie V
+	double dv = 1*1e-5;
+
+	// Number of timesteps
+#ifdef TEST_RUN
+	size_t timeSteps = 200;
+#else
+        size_t timeSteps = 5000;
+#endif
+
+	// K and F (Physical constant in the equation)
+    double K = 0.053;
+    double F = 0.014;
+
+    //! \cond [grid definition] \endcond
+
+	sgrid_dist_soa<3, double, aggregate<double,double,double,double>> grid(sz,domain,g,bc);
+
+    //! \cond [grid definition] \endcond
+
+	// spacing of the grid on x and y
+	double spacing[3] = {grid.spacing(0),grid.spacing(1),grid.spacing(2)};
+
+	init(grid,domain);
+
+	// sync the ghost
+	size_t count = 0;
+	grid.template ghost_get<U,V>();
+
+	// because we assume that spacing[x] == spacing[y] we use formula 2
+	// and we calculate the prefactor of Eq 2
+	double uFactor = deltaT * du/(spacing[x]*spacing[x]);
+	double vFactor = deltaT * dv/(spacing[x]*spacing[x]);
+
+	timer tot_sim;
+	tot_sim.start();
+
+	auto & v_cl = create_vcluster();
+	
+	tot_sim.stop();
+	std::cout << "Total simulation: " << tot_sim.getwct() << std::endl;
+
+	grid.write("final");
+
+	//! \cond [time stepping] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse_opt Gray Scott in 3D using sparse grids optimized on CPU
+	 *
+	 * ## Finalize ##
+	 *
+	 * Deinitialize the library
+	 *
+	 * \snippet SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp finalize
+	 *
+	 */
+
+	//! \cond [finalize] \endcond
+
+	openfpm_finalize();
+
+	//! \cond [finalize] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse_opt Gray Scott in 3D using sparse grids optimized on CPU
+	 *
+	 * # Full code # {#code}
+	 *
+	 * \include SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp
+	 *
+	 */
+}
diff --git a/example/SparseGrid/8_filling_benchmark_gpu/Makefile b/example/SparseGrid/8_filling_benchmark_gpu/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..c6a4b43bbd6b2846da27e6f3b042aa403d726697
--- /dev/null
+++ b/example/SparseGrid/8_filling_benchmark_gpu/Makefile
@@ -0,0 +1,51 @@
+include ../../example.mk
+
+### internally the example disable with the preprocessor its code if not compiled with nvcc 
+CUDA_CC=
+CUDA_CC_LINK=
+ifdef CUDA_ON_CPU
+        CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+        INCLUDE_PATH_NVCC=
+        CUDA_CC_LINK=mpic++
+        CUDA_OPTIONS=-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000
+        LIBS_SELECT=$(LIBS_CUDA_ON_CPU)
+else
+        ifeq (, $(shell which nvcc))
+                CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+                INCLUDE_PATH_NVCC=
+                CUDA_CC_LINK=mpic++
+                CUDA_OPTIONS=
+        else
+                CUDA_CC=nvcc -ccbin=mpic++
+                CUDA_CC_LINK=nvcc -ccbin=mpic++
+                CUDA_OPTIONS=-use_fast_math  -arch=sm_61 -lineinfo
+        endif
+        LIBS_SELECT=$(LIBS)
+endif
+
+
+gray_scott_sparse_gpu_test: OPT += -DTEST_RUN
+gray_scott_sparse_gpu_test: gray_scott_sparse_gpu
+
+CC=mpic++
+
+LDIR =
+
+OBJ = main.o
+
+%.o: %.cu
+	$(CUDA_CC) -use_fast_math  -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)
+
+gray_scott_sparse_gpu: $(OBJ)
+	$(CUDA_CC_LINK) -o $@ $^ $(LIBS_PATH) $(LIBS_SELECT)
+
+all: gray_scott_sparse_gpu
+
+run: gray_scott_sparse_gpu_test
+	mpirun --oversubscribe -np 4 ./gray_scott_sparse_gpu
+
+.PHONY: clean all run
+
+clean:
+	rm -f *.o *~ core gray_scott_sparse_gpu
+
diff --git a/example/SparseGrid/8_filling_benchmark_gpu/config.cfg b/example/SparseGrid/8_filling_benchmark_gpu/config.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..699be429e147cd40187be6ce345ef2f060f59fbc
--- /dev/null
+++ b/example/SparseGrid/8_filling_benchmark_gpu/config.cfg
@@ -0,0 +1,2 @@
+[pack]
+files = main.cu Makefile
diff --git a/example/SparseGrid/8_filling_benchmark_gpu/main.cu b/example/SparseGrid/8_filling_benchmark_gpu/main.cu
new file mode 100644
index 0000000000000000000000000000000000000000..644a4cc1a39c7429e99220915d66091f0e848118
--- /dev/null
+++ b/example/SparseGrid/8_filling_benchmark_gpu/main.cu
@@ -0,0 +1,221 @@
+#define VCLUSTER_PERF_REPORT
+#define SYNC_BEFORE_TAKE_TIME
+#define ENABLE_GRID_DIST_ID_PERF_STATS
+#include "Decomposition/Distribution/BoxDistribution.hpp"
+#include "util/cuda_launch.hpp"
+#include "Grid/grid_dist_id.hpp"
+#include "data_type/aggregate.hpp"
+#include "timer.hpp"
+
+/*!
+ *
+ * \page Grid_3_gs_3D_sparse_gpu Gray Scott in 3D using sparse grids on GPU
+ *
+ * [TOC]
+ *
+ * # Solving a gray scott-system in 3D using Sparse grids on gpu # {#e3_gs_gray_scott_gpu}
+ *
+ * This example show how to solve a Gray-Scott system in 3D using sparse grids on gpu
+ *
+ * In figure is the final solution of the problem
+ *
+ * \htmlonly
+ * <img src="http://ppmcore.mpi-cbg.de/web/images/examples/gray_scott_3d/gs_alpha.png"/>
+ * \endhtmlonly
+ *
+ * More or less this example is the adaptation of the dense example in 3D
+ *
+ * \see \ref Grid_3_gs_3D
+ *
+ * # Initializetion
+ *
+ * On gpu we can add points using the function addPoints this function take 2 lamda functions the first take 3 arguments (in 3D)
+ * i,j,k these are the global coordinates for a point. We can return either true either false. In case of true the point is
+ * created in case of false the point is not inserted. The second lamda is instead used to initialize the point inserted.
+ * The arguments of the second lambda are the data argument we use to initialize the point and the global coordinates i,j,k
+ *
+ * After we add the points we have to flush the added points. This us achieved using the function flush the template parameters
+ * indicate how we have to act on the points. Consider infact we are adding points already exist ... do we have to add it using the max
+ * or the min. **FLUSH_ON_DEVICE** say instead that the operation is performed using the GPU
+ *
+ * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu create points
+ *
+ * The function can also called with a specified range
+ *
+ * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu create points sub
+ *
+ * # Update
+ *
+ * to calculate the right-hand-side we use the function **conv2** this function can be used to do a convolution that involve
+ * two properties
+ *
+ * The function accept a lambda function where the first 2 arguments are the output of the same type of the two property choosen.
+ *
+ * The arguments 3 and 4 contain the properties of two selected properties. while i,j,k are the coordinates we have to calculate the
+ * convolution. The call **conv2** also accept template parameters the first two indicate the source porperties, the other two are the destination properties. While the
+ * last is the extension of the stencil. In this case we use 1.
+ *
+ * The lambda function is defined as
+ *
+ * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu lambda
+ *
+ * and used in the body loop
+ *
+ * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu body
+ *
+ */
+
+#ifdef __NVCC__
+
+constexpr int U = 0;
+constexpr int V = 1;
+
+constexpr int U_next = 2;
+constexpr int V_next = 3;
+
+constexpr int x = 0;
+constexpr int y = 1;
+constexpr int z = 2;
+
+typedef CartDecomposition<3,float, CudaMemory, memory_traits_inte, BoxDistribution<3,float> > Dec;
+
+typedef sgrid_dist_id_gpu<3,float,aggregate<float>,CudaMemory, Dec> SparseGridType;
+
+void init(SparseGridType & grid, Box<3,float> & domain)
+{
+	//! \cond [create points] \endcond
+
+	typedef typename GetAddBlockType<SparseGridType>::type InsertBlockT;
+
+	for (int i = 0 ; i < 10 ; i++)
+	{
+		timer t;
+		t.start();
+
+		grid.addPoints([] __device__ (int i, int j, int k)
+			        {
+						return true;
+			        },
+			        [] __device__ (InsertBlockT & data, int i, int j, int k)
+			        {
+			        	data.template get<U>() = 1.0;
+			        }
+			        );
+
+
+		grid.template flush<smax_<U>>(flush_type::FLUSH_ON_DEVICE);
+
+		t.stop();
+
+		std::cout << "Time populate: " << t.getwct()  << std::endl;
+
+        	timer t2;
+		cudaDeviceSynchronize();
+        	t2.start();
+
+        	grid.addPoints([] __device__ (int i, int j, int k)
+                                {
+                                                return true;
+                                },
+                                [] __device__ (InsertBlockT & data, int i, int j, int k)
+                                {
+                                        data.template get<U>() = 5.0;
+                                }
+                                );
+
+
+        	grid.template flush<sRight_<U>>(flush_type::FLUSH_ON_DEVICE);
+
+		t2.stop();
+
+		std::cout << "Time populate: " << t2.getwct()  << std::endl;
+	}
+}
+
+
+int main(int argc, char* argv[])
+{
+	openfpm_init(&argc,&argv);
+
+	// domain
+	Box<3,float> domain({0.0,0.0,0.0},{2.5,2.5,2.5});
+	
+	// grid size
+        size_t sz[3] = {512,512,512};
+
+	// Define periodicity of the grid
+	periodicity<3> bc = {PERIODIC,PERIODIC,PERIODIC};
+	
+	// Ghost in grid unit
+	Ghost<3,long int> g(1);
+	
+	// deltaT
+	float deltaT = 0.25;
+
+	// Diffusion constant for specie U
+	float du = 2*1e-5;
+
+	// Diffusion constant for specie V
+	float dv = 1*1e-5;
+
+	// Number of timesteps
+#ifdef TEST_RUN
+	size_t timeSteps = 300;
+#else
+        size_t timeSteps = 15000;
+#endif
+
+	// K and F (Physical constant in the equation)
+    float K = 0.053;
+    float F = 0.014;
+
+	SparseGridType grid(sz,domain,g,bc);
+
+	// spacing of the grid on x and y
+	float spacing[3] = {grid.spacing(0),grid.spacing(1),grid.spacing(2)};
+
+	init(grid,domain);
+
+	// sync the ghost
+	grid.deviceToHost<U>();
+	grid.write("final");
+	grid.print_stats();
+
+	//! \cond [time stepping] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse Gray Scott in 3D
+	 *
+	 * ## Finalize ##
+	 *
+	 * Deinitialize the library
+	 *
+	 * \snippet Grid/3_gray_scott/main.cpp finalize
+	 *
+	 */
+
+	//! \cond [finalize] \endcond
+
+	openfpm_finalize();
+
+	//! \cond [finalize] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_sparse Gray Scott in 3D
+	 *
+	 * # Full code # {#code}
+	 *
+	 * \include Grid/3_gray_scott_3d/main.cpp
+	 *
+	 */
+}
+
+#else
+
+int main(int argc, char* argv[])
+{
+        return 0;
+}
+
+#endif
+
diff --git a/example/VCluster/0_simple/Makefile b/example/VCluster/0_simple/Makefile
index 721c8d9f7e9f12f7260d5883ef719e5f8a02305c..66fa2b881f2f1fbc6f76897c138b1414dcaed675 100644
--- a/example/VCluster/0_simple/Makefile
+++ b/example/VCluster/0_simple/Makefile
@@ -7,7 +7,7 @@ LDIR =
 OBJ = main.o
 
 %.o: %.cpp
-	$(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 vcluster: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
@@ -15,7 +15,7 @@ vcluster: $(OBJ)
 all: vcluster
 
 run: all
-	mpirun -np 3 ./vcluster
+	mpirun --oversubscribe -np 3 ./vcluster
 
 .PHONY: clean all run
 
diff --git a/example/VCluster/1_semantic/Makefile b/example/VCluster/1_semantic/Makefile
index 125663e271cdeff1e9819dcaa3b9bb16b157d289..3a21e758b02baf30977fd2af27d232834ac8a5e0 100644
--- a/example/VCluster/1_semantic/Makefile
+++ b/example/VCluster/1_semantic/Makefile
@@ -7,7 +7,7 @@ LDIR =
 OBJ = main.o
 
 %.o: %.cpp
-	$(CC) -O3 -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 vcluster: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
@@ -15,7 +15,7 @@ vcluster: $(OBJ)
 all: vcluster
 
 run: all
-	mpirun -np 3 ./vcluster
+	mpirun --oversubscribe -np 3 ./vcluster
 
 .PHONY: clean all run
 
diff --git a/example/VCluster/2_serial_and_parallel/Makefile b/example/VCluster/2_serial_and_parallel/Makefile
index 455dced4b3fc5888f5ac1aff2e0389d047b69f83..a5cdc3509433cff895e22e7d051b4f516d1e21cc 100644
--- a/example/VCluster/2_serial_and_parallel/Makefile
+++ b/example/VCluster/2_serial_and_parallel/Makefile
@@ -7,7 +7,7 @@ LDIR =
 OBJ = main.o
 
 %.o: %.cpp
-	$(CC) -O3 -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 serial_parallel: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
@@ -15,7 +15,7 @@ serial_parallel: $(OBJ)
 all: serial_parallel
 
 run: all
-	mpirun -np 3 ./serial_parallel
+	mpirun --oversubscribe -np 3 ./serial_parallel
 
 .PHONY: clean all run
 
diff --git a/example/Vector/0_simple/CMakeLists.txt b/example/Vector/0_simple/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..05aa33049234ff83ffe0ed8255bfc81c83787a27
--- /dev/null
+++ b/example/Vector/0_simple/CMakeLists.txt
@@ -0,0 +1,22 @@
+cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
+project(openfpm_pdata LANGUAGES C CXX)
+
+set(CMAKE_SKIP_BUILD_RPATH TRUE)   ###### <--------- This is absolutely necessary if you use linear algebra
+
+find_package(openfpm 3.2.0 REQUIRED)
+find_package(Threads)
+find_package(MPI)
+
+if (openfpm_FOUND)
+	message("OpenFPM found")
+	
+	add_executable(vect  main.cpp)
+	target_link_libraries(vect PUBLIC openfpm::binary_config)
+
+	# or
+
+	#target_include_directories(vect PUBLIC ${OPENFPM_INCLUDES})
+	#target_compile_definitions(vect PUBLIC ${OPENFPM_DEFINITION})
+	#target_link_libraries(vect PUBLIC ${OPENFPM_LIBS})
+endif()
+
diff --git a/example/Vector/0_simple/Makefile b/example/Vector/0_simple/Makefile
index 4c6ce566bc1c60d24c3052a03da29571462db213..f5b5d7c2ba73e2f78428594820275e484e23779e 100644
--- a/example/Vector/0_simple/Makefile
+++ b/example/Vector/0_simple/Makefile
@@ -7,7 +7,7 @@ LDIR =
 OBJ = main.o
 
 %.o: %.cpp
-	$(CC) -O0 -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O0 -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 vect: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
@@ -15,7 +15,10 @@ vect: $(OBJ)
 all: vect
 
 run: all
-	mpirun -np 2 ./vect
+	mpirun --oversubscribe -np 2 ./vect
+
+debug:
+	../../../gdbgui/launch_mpi_debugger 2 ./vect
 
 .PHONY: clean all run
 
diff --git a/example/Vector/0_simple/config.cfg b/example/Vector/0_simple/config.cfg
index 1eecbac3577c765edca7f90cf5f61cfb6b9f4880..db64a949051852e886e7d45036bd501eb5334e00 100644
--- a/example/Vector/0_simple/config.cfg
+++ b/example/Vector/0_simple/config.cfg
@@ -1,2 +1,2 @@
 [pack]
-files = main.cpp Makefile
+files = main.cpp Makefile CMakeLists.txt
diff --git a/example/Vector/1_HDF5_save_load/Makefile b/example/Vector/1_HDF5_save_load/Makefile
index 703912dfbb69414b088bbaa7fa532977f2ba03c5..d6be8ec7a0cd7f75e4fc16d2d27171653c077bb0 100644
--- a/example/Vector/1_HDF5_save_load/Makefile
+++ b/example/Vector/1_HDF5_save_load/Makefile
@@ -11,14 +11,14 @@ all: hdf5
 
 
 %.o: %.cpp
-	$(CC) -O3 $(OPT) -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 hdf5: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
 
 
 run: hdf5
-	mpirun -np 2 ./hdf5
+	mpirun --oversubscribe -np 2 ./hdf5
 
 .PHONY: clean all run
 
diff --git a/example/Vector/1_celllist/Makefile b/example/Vector/1_celllist/Makefile
index c9a76a23b45b50e073b571f48fa3653a60cdbc13..4a314c419846bb9e89c935162497be42abbd179b 100644
--- a/example/Vector/1_celllist/Makefile
+++ b/example/Vector/1_celllist/Makefile
@@ -7,7 +7,7 @@ LDIR =
 OBJ = main.o
 
 %.o: %.cpp
-	$(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 cell: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
@@ -15,7 +15,7 @@ cell: $(OBJ)
 all: cell
 
 run: all
-	mpirun -np 2 ./cell
+	mpirun --oversubscribe -np 2 ./cell
 
 .PHONY: clean all run
 
diff --git a/example/Vector/1_ghost_get_put/Makefile b/example/Vector/1_ghost_get_put/Makefile
index d0683987f533b794fc86798104339b2ee6c58802..b3e3619da3c4c69aad0a845b8ef970ddea69414d 100644
--- a/example/Vector/1_ghost_get_put/Makefile
+++ b/example/Vector/1_ghost_get_put/Makefile
@@ -15,7 +15,7 @@ ghost: $(OBJ)
 all: ghost
 
 run: all
-	mpirun -np 2 ./ghost
+	mpirun --oversubscribe -np 2 ./ghost
 
 .PHONY: clean all run
 
diff --git a/example/Vector/1_gpu_first_step/Makefile b/example/Vector/1_gpu_first_step/Makefile
index a2560310b0853a99448ab9231748e5cf6ba61abf..5234dece565c283ef1a09a3cf15d659bfb5ae508 100644
--- a/example/Vector/1_gpu_first_step/Makefile
+++ b/example/Vector/1_gpu_first_step/Makefile
@@ -1,33 +1,48 @@
 include ../../example.mk
+LIBS_CUDA_ON_CPU=$(LIBS)
 
 CUDA_CC=
-ifeq (, $(shell which nvcc))
-        CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
-        INCLUDE_PATH_NVCC=
+CC=mpic++
+ifdef HIP
+	CUDA_CC=hipcc
+	CUDA_OPTIONS=-D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0
+	LIBS_SELECT=$(LIBS)
+	CC=hipcc
 else
-        CUDA_CC=nvcc -ccbin=mpic++
+	ifdef CUDA_ON_CPU
+		CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+        	INCLUDE_PATH_NVCC=
+        	CUDA_OPTIONS=-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000
+        	LIBS_SELECT=$(LIBS_CUDA_ON_CPU)
+	else
+		ifeq (, $(shell which nvcc))
+        		CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+        		INCLUDE_PATH_NVCC=
+		else
+        		CUDA_CC=nvcc -ccbin=mpic++
+		endif
+		LIBS_SELECT=$(LIBS)
+	endif
 endif
 
 
-CC=mpic++
-
 OBJ = main.o
 
 gpu_fstep:
 
 %.o: %.cu
-	$(CUDA_CC) -O3 -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH_NVCC)
+	$(CUDA_CC) -O3 -g $(CUDA_OPTIONS) -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)
 
 %.o: %.cpp
 	$(CC) -O3 $(OPT) -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
 
 gpu_fstep: $(OBJ)
-	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
+	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT)
 
 all: gpu_fstep
 
 run: gpu_fstep
-	mpirun -np 2 ./gpu_fstep
+	mpirun --oversubscribe -np 2 ./gpu_fstep
 
 .PHONY: clean all run
 
diff --git a/example/Vector/1_gpu_first_step/main.cu b/example/Vector/1_gpu_first_step/main.cu
index 0bb965e8022f7bed4fedc5985b2a84603af0e8dd..30a69fcde617fddf607945c0c0fcce5e786baec9 100644
--- a/example/Vector/1_gpu_first_step/main.cu
+++ b/example/Vector/1_gpu_first_step/main.cu
@@ -99,6 +99,21 @@
  *   \snippet Vector/1_gpu_first_step/main.cu using_openmpi
  *
  * * MPI must be compiled with CUDA support (in general installing OpenFPM with -g should attempt to install OpenMPI with CUDA support)
+ * 
+ * ## Macro CUDA_LAUNCH
+ *
+ * When we want to launch a kernel "my_kernel" on CUDA we in general use the Nvidia CUDA syntax
+ *
+ * my_kernel<<<wthr,thr>>>(arguments ... )
+ *
+ * Where wthr is the number of workgroups and thr is the number of threads in a workgroup and arguments... are the arguments to pass to the kernel.
+ * Equivalently we can launch a kernel with the macro CUDA_LAUNCH_DIM3(my_kernel,wthr,thr,arguments...) or CUDA_LAUNCH(my_kernel,ite,arguments) where
+ * ite has been taken using getDomainIteratorGPU. There are several advantage on using CUDA_LAUNCH. The first advantage in using the macro is enabling SE_CLASS1
+ * all kernel launch become synchronous and an error check is performed before continue to the next kernel making debugging easier. Another feature is the possibility
+ * to run CUDA code on CPU without a GPU. compiling with "CUDA_ON_CPU=1 make" (Note openfpm must be compiled with GPU support (-g)  or with CUDA_ON_CPU support
+ * (-c "... --enable_cuda_on_cpu"). You can compile this example on CPU. You do not have to change a single line of code for this example. (Check the video to see this
+ * feature in action). All the openfpm GPU example and CUDA example can run on CPU if they use CUDA_LAUNCH as macro. We are planning to support
+ * AMD GPUs as well using this system.
  *
  * ## Full code ## {#code_e0_sim}
  *
@@ -211,7 +226,8 @@ int main(int argc, char* argv[])
 	//! \cond [launch_domain_it] \endcond
 
 	auto ite = vd.getDomainIteratorGPU();
-	translate_fill_prop<<<ite.wthr,ite.thr>>>(vd.toKernel());
+	// translate_fill_prop<<<ite.wthr,ite.thr>>>(vd.toKernel());
+	CUDA_LAUNCH(translate_fill_prop,ite,vd.toKernel());
 
 	//! \cond [launch_domain_it] \endcond
 
@@ -230,7 +246,8 @@ int main(int argc, char* argv[])
 	for (int j = 0 ; j < 100 ; j++)
 	{
 		auto ite = vd.getDomainIteratorGPU();
-		translate_fill_prop<<<ite.wthr,ite.thr>>>(vd.toKernel());
+		// translate_fill_prop<<<ite.wthr,ite.thr>>>(vd.toKernel());
+		CUDA_LAUNCH(translate_fill_prop,ite,vd.toKernel());
 
 		vd.map(RUN_ON_DEVICE);
 		vd.template ghost_get<0,1,2>(RUN_ON_DEVICE);
diff --git a/example/Vector/2_expressions/Makefile b/example/Vector/2_expressions/Makefile
index 7ab3093eedd5282dcf9269bd222005e21e2bf357..bffde476e912b72ad60e9472817b2489c9af5c8e 100644
--- a/example/Vector/2_expressions/Makefile
+++ b/example/Vector/2_expressions/Makefile
@@ -7,7 +7,7 @@ LDIR =
 OBJ = main.o
 
 %.o: %.cpp
-	$(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 expr: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
@@ -15,7 +15,7 @@ expr: $(OBJ)
 all: expr
 
 run: all
-	mpirun -np 3 ./expr
+	mpirun --oversubscribe -np 3 ./expr
 
 .PHONY: clean all run
 
diff --git a/example/Vector/3_molecular_dynamic/Makefile b/example/Vector/3_molecular_dynamic/Makefile
index fab9a79d7e8c2b60a6ff5772b6318d2a9b37da0b..e4baa10c28a3bf9b83eef852ee833b6605cdd918 100644
--- a/example/Vector/3_molecular_dynamic/Makefile
+++ b/example/Vector/3_molecular_dynamic/Makefile
@@ -12,7 +12,7 @@ OBJ_EXPR_PAP = main_expr_paper.o
 all: md_dyn md_dyn_expr md_dyn_vl md_dyn_pap
 
 %.o: %.cpp
-	$(CC) -O3 -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 md_dyn: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
@@ -27,7 +27,7 @@ md_dyn_pap: $(OBJ_EXPR_PAP)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
 
 run: all
-	mpirun -np 3 ./md_dyn && mpirun -np 3 ./md_dyn_expr && mpirun -np 3 ./md_dyn_vl;
+	mpirun --oversubscribe -np 3 ./md_dyn && mpirun --oversubscribe -np 3 ./md_dyn_expr && mpirun --oversubscribe -np 3 ./md_dyn_vl;
 
 .PHONY: clean all run
 
diff --git a/example/Vector/3_molecular_dynamic_gpu/Makefile b/example/Vector/3_molecular_dynamic_gpu/Makefile
index 614460f3072c9d4f7797f3645029337d2a4078f5..43a60ef146d87adb9aa889aa82aca5bf64105c18 100644
--- a/example/Vector/3_molecular_dynamic_gpu/Makefile
+++ b/example/Vector/3_molecular_dynamic_gpu/Makefile
@@ -4,13 +4,21 @@ include ../../example.mk
 ### internally the example disable with the preprocessor its code if not compiled with nvcc 
 CUDA_CC=
 CUDA_CC_LINK=
-ifeq (, $(shell which nvcc))
-        CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+ifdef CUDA_ON_CPU
+	CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
         INCLUDE_PATH_NVCC=
         CUDA_CC_LINK=mpic++
+	LIBS_SELECT=$(LIBS_CUDA_ON_CPU)
 else
-        CUDA_CC=nvcc -ccbin=mpic++
-        CUDA_CC_LINK=nvcc -ccbin=mpic++
+	ifeq (, $(shell which nvcc))
+        	CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+        	INCLUDE_PATH_NVCC=
+       		CUDA_CC_LINK=mpic++
+	else
+        	CUDA_CC=nvcc -ccbin=mpic++
+        	CUDA_CC_LINK=nvcc -ccbin=mpic++
+	endif
+	LIBS_SELECT=$(LIBS)
 endif
 
 CC=mpic++
@@ -29,11 +37,11 @@ md_dyn_test: md_dyn
 	$(CUDA_CC) $(OPT) -O3 -g -c  --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)
 
 md_dyn: $(OBJ)
-	$(CUDA_CC_LINK) -o $@ $^ $(LIBS_PATH) $(LIBS)
+	$(CUDA_CC_LINK) -o $@ $^ $(LIBS_PATH) $(LIBS_SELECT)
 
 
 run: md_dyn_test
-	mpirun -np 3 ./md_dyn;
+	mpirun --oversubscribe -np 3 ./md_dyn;
 
 .PHONY: clean all run
 
diff --git a/example/Vector/3_molecular_dynamic_gpu_opt/Makefile b/example/Vector/3_molecular_dynamic_gpu_opt/Makefile
index b34bd0f1fc9f8980106878c106dd6dfdb07be105..9bece43f338db1de8faa0d1bbf058883c6ade48c 100644
--- a/example/Vector/3_molecular_dynamic_gpu_opt/Makefile
+++ b/example/Vector/3_molecular_dynamic_gpu_opt/Makefile
@@ -1,13 +1,23 @@
 include ../../example.mk
 
 CUDA_CC=
-ifeq (, $(shell which nvcc))
+
+ifdef CUDA_ON_CPU
         CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
         INCLUDE_PATH_NVCC=
+        CUDA_CC_LINK=mpic++
 else
-        CUDA_CC=nvcc -ccbin=mpic++
+        ifeq (, $(shell which nvcc))
+                CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+                INCLUDE_PATH_NVCC=
+                CUDA_CC_LINK=mpic++
+        else
+                CUDA_CC=nvcc -ccbin=mpic++
+                CUDA_CC_LINK=nvcc -ccbin=mpic++
+        endif
 endif
 
+
 ifeq ($(PROFILE),ON)
 	CC=scorep --nocompiler  --cuda --mpp=mpi nvcc
 	CC_MPI=mpic++
@@ -29,10 +39,10 @@ md_dyn_gpu_test: OPT += -DTEST_RUN
 md_dyn_gpu_test: all
 
 %.o: %.cu
-	$(CC) $(OPT)  -O3 -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH_NVCC)
+	$(CC) $(OPT)  -O3 -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)
 
 %.o: %.cpp
-	$(CC_MPI) $(OPT) -O3 -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC_MPI) $(OPT) -O3 -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 md_dyn_gpu: $(OBJ_GPU)
 	$(CC_MPI) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
@@ -44,7 +54,7 @@ md_dyn_cpu_best: $(OBJ_CPU_BEST)
 	$(CC_MPI) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
 
 run: md_dyn_gpu_test
-	mpirun -np 3 ./md_dyn_gpu && mpirun -np 3 ./md_dyn_cpu && mpirun -np 3 ./md_dyn_cpu_best;
+	mpirun --oversubscribe -np 3 ./md_dyn_gpu && mpirun --oversubscribe -np 3 ./md_dyn_cpu && mpirun --oversubscribe -np 3 ./md_dyn_cpu_best;
 
 .PHONY: clean all run
 
diff --git a/example/Vector/4_complex_prop/Makefile b/example/Vector/4_complex_prop/Makefile
index 2de12f95f6b02c804e5fa5a0b835d5c78fa9337d..3b8ea179ce3584e6fd2dd710f7e709335864cba2 100644
--- a/example/Vector/4_complex_prop/Makefile
+++ b/example/Vector/4_complex_prop/Makefile
@@ -10,7 +10,7 @@ OBJ_SER = main_ser.o
 all: vect_cp vect_ser
 
 %.o: %.cpp
-	$(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 vect_cp: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
@@ -19,7 +19,7 @@ vect_ser: $(OBJ_SER)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
 
 run: all
-	mpirun -np 2 ./vect_cp && mpirun -np 2 ./vect_ser
+	mpirun --oversubscribe  -np 2 ./vect_cp && mpirun --oversubscribe -np 2 ./vect_ser
 
 .PHONY: clean all run
 
diff --git a/example/Vector/4_multiphase_celllist_verlet/Makefile b/example/Vector/4_multiphase_celllist_verlet/Makefile
index e14eedbb41b3cc4cd79451b34eebcc209de98f2a..502002fc388e046c14050d584cf8810cde53b38c 100644
--- a/example/Vector/4_multiphase_celllist_verlet/Makefile
+++ b/example/Vector/4_multiphase_celllist_verlet/Makefile
@@ -7,7 +7,7 @@ LDIR =
 OBJ = main.o
 
 %.o: %.cpp
-	$(CC) -O3 -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 multip: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
@@ -15,7 +15,7 @@ multip: $(OBJ)
 all: multip
 
 run: all
-	mpirun -np 2 ./multip
+	mpirun --oversubscribe -np 2 ./multip
 
 .PHONY: clean all run
 
diff --git a/example/Vector/4_reorder/Makefile b/example/Vector/4_reorder/Makefile
index 2569b0b64d49609a916adb2b92b36a3d9c566461..e58f415bedc65a21c776e85b4e0365fbb8ed87dd 100644
--- a/example/Vector/4_reorder/Makefile
+++ b/example/Vector/4_reorder/Makefile
@@ -13,7 +13,7 @@ all_test: OPT += -DTEST_RUN
 all_test: md_data_ord_test md_comp_ord_test
 
 %.o: %.cpp
-	$(CC) -O3 -g -c --std=c++11 $(OPT) -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -g -c --std=c++14 $(OPT) -o $@ $< $(INCLUDE_PATH)
 
 md_data_ord: $(OBJ_DORD)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
@@ -28,7 +28,7 @@ md_comp_ord_test: $(OBJ_CORD)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
 
 run: all_test
-	mpirun -np 4 ./md_data_ord_test && mpirun -np 4 ./md_comp_ord_test
+	mpirun --oversubscribe -np 4 ./md_data_ord_test && mpirun --oversubscribe -np 4 ./md_comp_ord_test
 
 .PHONY: clean all run all_test on_test
 
diff --git a/example/Vector/5_molecular_dynamic_sym/Makefile b/example/Vector/5_molecular_dynamic_sym/Makefile
index cf0391fab74d27ace055d5904d8df2c180b2266d..80bef9b502d62d355b77f7d0850aab280949aae8 100644
--- a/example/Vector/5_molecular_dynamic_sym/Makefile
+++ b/example/Vector/5_molecular_dynamic_sym/Makefile
@@ -9,13 +9,13 @@ OBJ_DORD = main.o
 all: md_sym
 
 %.o: %.cpp
-	$(CC) -O3 -g -c --std=c++11 $(OPT) -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -g -c --std=c++14 $(OPT) -o $@ $< $(INCLUDE_PATH)
 
 md_sym: $(OBJ_DORD)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
 
 run: md_sym
-	mpirun -np 3 ./md_sym
+	mpirun --oversubscribe -np 3 ./md_sym
 
 .PHONY: clean all run
 
diff --git a/example/Vector/5_molecular_dynamic_sym_crs/Makefile b/example/Vector/5_molecular_dynamic_sym_crs/Makefile
index cf0391fab74d27ace055d5904d8df2c180b2266d..80bef9b502d62d355b77f7d0850aab280949aae8 100644
--- a/example/Vector/5_molecular_dynamic_sym_crs/Makefile
+++ b/example/Vector/5_molecular_dynamic_sym_crs/Makefile
@@ -9,13 +9,13 @@ OBJ_DORD = main.o
 all: md_sym
 
 %.o: %.cpp
-	$(CC) -O3 -g -c --std=c++11 $(OPT) -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -g -c --std=c++14 $(OPT) -o $@ $< $(INCLUDE_PATH)
 
 md_sym: $(OBJ_DORD)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
 
 run: md_sym
-	mpirun -np 3 ./md_sym
+	mpirun --oversubscribe -np 3 ./md_sym
 
 .PHONY: clean all run
 
diff --git a/example/Vector/6_complex_usage/Makefile b/example/Vector/6_complex_usage/Makefile
index 31385b0c5b98849b22cab8969db58365498defc6..a1fb90fd199e06caff755ef62c8184e9bc25d551 100644
--- a/example/Vector/6_complex_usage/Makefile
+++ b/example/Vector/6_complex_usage/Makefile
@@ -9,13 +9,13 @@ OBJ_DORD = main.o
 all: complex_use
 
 %.o: %.cpp
-	$(CC) -O3 -g -c --std=c++11 $(OPT) -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -g -c --std=c++14 $(OPT) -o $@ $< $(INCLUDE_PATH)
 
 complex_use: $(OBJ_DORD)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
 
 run: all
-	mpirun -np 3 ./complex_use
+	mpirun --oversubscribe -np 3 ./complex_use
 
 .PHONY: clean all run all_test on_test
 
diff --git a/example/Vector/7_SPH_dlb/CMakeLists.txt b/example/Vector/7_SPH_dlb/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..286edddbc4f5e2df68949c96dc6bb6c4a360ba3c
--- /dev/null
+++ b/example/Vector/7_SPH_dlb/CMakeLists.txt
@@ -0,0 +1,23 @@
+cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
+project(openfpm_pdata LANGUAGES C CXX)
+
+set(CMAKE_SKIP_BUILD_RPATH TRUE)   ###### <--------- This is absolutely necessary if you use linear algebra
+
+find_package(openfpm 3.2.0 REQUIRED)
+find_package(Threads)
+find_package(MPI)
+
+if (openfpm_FOUND)
+	message("OpenFPM found")
+	
+	add_executable(sph_dlb  main.cpp)
+	target_link_libraries(sph_dlb PUBLIC openfpm::binary_config)
+
+	# or
+
+	#target_include_directories(sph_dlb PUBLIC ${OPENFPM_INCLUDES})
+	#target_compile_definitions(sph_dlb PUBLIC ${OPENFPM_DEFINITION})
+	#target_link_libraries(sph_dlb PUBLIC ${OPENFPM_LIBS})
+	#target_compile_options(sph_dlb PUBLIC ${OPENFPM_COMPILE_OPTIONS})
+endif()
+
diff --git a/example/Vector/7_SPH_dlb/Makefile b/example/Vector/7_SPH_dlb/Makefile
index 0fc85bdd4ed108063c48e4fe7ccb8bd12039be95..a039eef590d8a60363ec39ef2b7663f5d61579de 100644
--- a/example/Vector/7_SPH_dlb/Makefile
+++ b/example/Vector/7_SPH_dlb/Makefile
@@ -12,7 +12,7 @@ sph_dlb_test: OPT += -DTEST_RUN
 sph_dlb_test: sph_dlb
 
 %.o: %.cpp
-	$(CC) -O3 $(OPT) -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -g  $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 sph_dlb: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
@@ -20,7 +20,7 @@ sph_dlb: $(OBJ)
 all: sph_dlb
 
 run: sph_dlb_test
-	mpirun -np 2 ./sph_dlb
+	mpirun --oversubscribe -np 2 ./sph_dlb
 
 .PHONY: clean all run
 
diff --git a/example/Vector/7_SPH_dlb/config.cfg b/example/Vector/7_SPH_dlb/config.cfg
index 1eecbac3577c765edca7f90cf5f61cfb6b9f4880..db64a949051852e886e7d45036bd501eb5334e00 100644
--- a/example/Vector/7_SPH_dlb/config.cfg
+++ b/example/Vector/7_SPH_dlb/config.cfg
@@ -1,2 +1,2 @@
 [pack]
-files = main.cpp Makefile
+files = main.cpp Makefile CMakeLists.txt
diff --git a/example/Vector/7_SPH_dlb_gpu/CMakeLists.txt b/example/Vector/7_SPH_dlb_gpu/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..343f24fd0612e8cafdbb80d4acfc35d718d1e33e
--- /dev/null
+++ b/example/Vector/7_SPH_dlb_gpu/CMakeLists.txt
@@ -0,0 +1,24 @@
+cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
+project(openfpm_pdata LANGUAGES C CXX)
+
+set(CMAKE_SKIP_BUILD_RPATH TRUE)   ###### <--------- This is absolutely necessary if you use linear algebra
+
+find_package(openfpm 3.2.0 REQUIRED)
+find_package(Threads)
+
+enable_language(CUDA)
+
+if (openfpm_FOUND)
+	message("OpenFPM found")
+	
+	add_executable(sph_dlb  main.cu)
+	target_link_libraries(sph_dlb PUBLIC openfpm::binary_config)	
+
+	#or
+
+	#target_include_directories(sph_dlb PUBLIC ${OPENFPM_INCLUDES})
+	#target_compile_definitions(sph_dlb PUBLIC ${OPENFPM_DEFINITION})
+	#target_link_libraries(sph_dlb PUBLIC ${OPENFPM_LIBS})
+	#target_compile_options(sph_dlb PUBLIC ${OPENFPM_COMPILE_OPTIONS})
+endif()
+
diff --git a/example/Vector/7_SPH_dlb_gpu/Makefile b/example/Vector/7_SPH_dlb_gpu/Makefile
index 059eb412ccfa0246aea7c770c73e582916f332ad..8813b2db0ecd3b2b8764579fea94295107af65ea 100644
--- a/example/Vector/7_SPH_dlb_gpu/Makefile
+++ b/example/Vector/7_SPH_dlb_gpu/Makefile
@@ -4,13 +4,32 @@ include ../../example.mk
 ### internally the example disable with the preprocessor its code if not compiled with nvcc 
 CUDA_CC=
 CUDA_CC_LINK=
-ifeq (, $(shell which nvcc))
-        CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
-        INCLUDE_PATH_NVCC=
-	CUDA_CC_LINK=mpic++
+
+CC=mpic++
+ifdef HIP
+        CUDA_CC=hipcc
+        CUDA_OPTIONS=-D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0
+        LIBS_SELECT=$(LIBS)
+        CC=hipcc
+	CUDA_CC_LINK=hipcc
 else
-        CUDA_CC=nvcc -ccbin=mpic++
-	CUDA_CC_LINK=nvcc -ccbin=mpic++
+	ifdef CUDA_ON_CPU
+        	CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+        	INCLUDE_PATH_NVCC=
+        	CUDA_CC_LINK=mpic++
+        	CUDA_OPTIONS=-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000
+        	LIBS_SELECT=$(LIBS_CUDA_ON_CPU)
+	else
+        	ifeq (, $(shell which nvcc))
+                	CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+                	INCLUDE_PATH_NVCC=
+                	CUDA_CC_LINK=mpic++
+        	else
+                	CUDA_CC=nvcc -ccbin=mpic++
+                	CUDA_CC_LINK=nvcc -ccbin=mpic++
+        	endif
+		LIBS_SELECT=$(LIBS)
+	endif
 endif
 
 CC=mpic++
@@ -25,18 +44,18 @@ sph_dlb_test: OPT += -DTEST_RUN
 sph_dlb_test: sph_dlb
 
 %.o: %.cu
-	$(CUDA_CC) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)
+	$(CUDA_CC) $(CUDA_OPTIONS) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)
 
 %.o: %.cpp
 	$(CC) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 sph_dlb: $(OBJ)
-	$(CUDA_CC_LINK) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
+	$(CUDA_CC_LINK) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT)
 
 all: sph_dlb
 
 run: sph_dlb_test
-	mpirun -np 2 ./sph_dlb
+	mpirun --oversubscribe -np 2 ./sph_dlb
 
 .PHONY: clean all run
 
diff --git a/example/Vector/7_SPH_dlb_gpu/config.cfg b/example/Vector/7_SPH_dlb_gpu/config.cfg
index 699be429e147cd40187be6ce345ef2f060f59fbc..be08dcf5eba6ca028a142535d42b254cba7a353b 100644
--- a/example/Vector/7_SPH_dlb_gpu/config.cfg
+++ b/example/Vector/7_SPH_dlb_gpu/config.cfg
@@ -1,2 +1,2 @@
 [pack]
-files = main.cu Makefile
+files = main.cu Makefile CMakeLists.txt
diff --git a/example/Vector/7_SPH_dlb_gpu/main.cu b/example/Vector/7_SPH_dlb_gpu/main.cu
index 08d91fbe2e6f5e0492b96b04a7f6a7efb875d3a0..701a0623c3b601afcb77a7bdf5e5987367b8d9b8 100644
--- a/example/Vector/7_SPH_dlb_gpu/main.cu
+++ b/example/Vector/7_SPH_dlb_gpu/main.cu
@@ -39,6 +39,21 @@
  *
  * \snippet Vector/7_SPH_dlb_gpu/main.cu mark_to_remove_kernel
  *
+ * ## Macro CUDA_LAUNCH
+ *
+ * When we want to launch a kernel "my_kernel" on CUDA we in general use the Nvidia CUDA syntax
+ *
+ * my_kernel<<<wthr,thr>>>(arguments ... )
+ *
+ * Where wthr is the number of workgroups and thr is the number of threads in a workgroup and arguments... are the arguments to pass to the kernel. 
+ * Equivalently we can launch a kernel with the macro CUDA_LAUNCH_DIM3(my_kernel,wthr,thr,arguments...) or CUDA_LAUNCH(my_kernel,ite,arguments) where
+ * ite has been taken using getDomainIteratorGPU. There are several advantage on using CUDA_LAUNCH. The first advantage in using the macro is enabling SE_CLASS1
+ * all kernel launch become synchronous and an error check is performed before continue to the next kernel making debugging easier. Another feature is the possibility
+ * to run CUDA code on CPU without a GPU. compiling with "CUDA_ON_CPU=1 make" (Note openfpm must be compiled with GPU support (-g)  or with CUDA_ON_CPU support 
+ * (-c "... --enable_cuda_on_cpu"). You can compile this example on CPU. You do not have to change a single line of code for this example. (Check the video to see this 
+ * feature in action). All the openfpm GPU example and CUDA example can run on CPU if they use CUDA_LAUNCH as macro. We are planning to support
+ * AMD GPUs as well using this system.
+ *
  * \include Vector/7_SPH_dlb_gpu_opt/main.cu
  *
  */
@@ -195,7 +210,10 @@ inline void EqState(particles & vd)
 {
 	auto it = vd.getDomainIteratorGPU();
 
-	EqState_gpu<<<it.wthr,it.thr>>>(vd.toKernel(),B);
+	// You can use standard CUDA kernel launch or the macro CUDA_LAUNCH
+
+	//EqState_gpuning<<<it.wthr,it.thr>>>(vd.toKernel(),B);
+	CUDA_LAUNCH(EqState_gpu,it,vd.toKernel(),B)
 }
 
 
@@ -301,16 +319,16 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap
 	Point<3,real_number> xa = vd.getPos(a);
 
 	// Take the mass of the particle dependently if it is FLUID or BOUNDARY
-	real_number massa = (vd.getProp<type>(a) == FLUID)?MassFluid:MassBound;
+	real_number massa = (vd.template getProp<type>(a) == FLUID)?MassFluid:MassBound;
 
 	// Get the density of the of the particle a
-	real_number rhoa = vd.getProp<rho>(a);
+	real_number rhoa = vd.template getProp<rho>(a);
 
 	// Get the pressure of the particle a
-	real_number Pa = vd.getProp<Pressure>(a);
+	real_number Pa = vd.template getProp<Pressure>(a);
 
 	// Get the Velocity of the particle a
-	Point<3,real_number> va = vd.getProp<velocity>(a);
+	Point<3,real_number> va = vd.template getProp<velocity>(a);
 
 	// Reset the force counter (- gravity on zeta direction)
 	vd.template getProp<force>(a)[0] = 0.0;
@@ -319,7 +337,7 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap
 	vd.template getProp<drho>(a) = 0.0;
 
 	// We threat FLUID particle differently from BOUNDARY PARTICLES ...
-	if (vd.getProp<type>(a) != FLUID)
+	if (vd.template getProp<type>(a) != FLUID)
 	{
 
 		// If it is a boundary particle calculate the delta rho based on equation 2
@@ -339,14 +357,14 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap
 			if (a == b)	{++Np; continue;};
 
 			// get the mass of the particle
-			real_number massb = (vd.getProp<type>(b) == FLUID)?MassFluid:MassBound;
+			real_number massb = (vd.template getProp<type>(b) == FLUID)?MassFluid:MassBound;
 
 			// Get the velocity of the particle b
-			Point<3,real_number> vb = vd.getProp<velocity>(b);
+			Point<3,real_number> vb = vd.template getProp<velocity>(b);
 
 			// Get the pressure and density of particle b
-			real_number Pb = vd.getProp<Pressure>(b);
-			real_number rhob = vd.getProp<rho>(b);
+			real_number Pb = vd.template getProp<Pressure>(b);
+			real_number rhob = vd.template getProp<rho>(b);
 
 			// Get the distance between p and q
 			Point<3,real_number> dr = xa - xb;
@@ -374,7 +392,7 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap
 			++Np;
 		}
 
-		vd.getProp<red>(a) = max_visc;
+		vd.template getProp<red>(a) = max_visc;
 	}
 	else
 	{
@@ -395,10 +413,10 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap
 			// if (p == q) skip this particle
 			if (a == b)	{++Np; continue;};
 
-			real_number massb = (vd.getProp<type>(b) == FLUID)?MassFluid:MassBound;
-			Point<3,real_number> vb = vd.getProp<velocity>(b);
-			real_number Pb = vd.getProp<Pressure>(b);
-			real_number rhob = vd.getProp<rho>(b);
+			real_number massb = (vd.template getProp<type>(b) == FLUID)?MassFluid:MassBound;
+			Point<3,real_number> vb = vd.template getProp<velocity>(b);
+			real_number Pb = vd.template getProp<Pressure>(b);
+			real_number rhob = vd.template getProp<rho>(b);
 
 			// Get the distance between p and q
 			Point<3,real_number> dr = xa - xb;
@@ -415,7 +433,7 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap
 				Point<3,real_number> DW;
 				DWab(dr,DW,r,false);
 
-				real_number factor = - massb*((vd.getProp<Pressure>(a) + vd.getProp<Pressure>(b)) / (rhoa * rhob) + Tensile(r,rhoa,rhob,Pa,Pb,W_dap) + Pi(dr,r2,v_rel,rhoa,rhob,massb,cbar,max_visc));
+				real_number factor = - massb*((vd.template getProp<Pressure>(a) + vd.template getProp<Pressure>(b)) / (rhoa * rhob) + Tensile(r,rhoa,rhob,Pa,Pb,W_dap) + Pi(dr,r2,v_rel,rhoa,rhob,massb,cbar,max_visc));
 
 				vd.template getProp<force>(a)[0] += factor * DW.get(0);
 				vd.template getProp<force>(a)[1] += factor * DW.get(1);
@@ -427,7 +445,7 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap
 			++Np;
 		}
 
-		vd.getProp<red>(a) = max_visc;
+		vd.template getProp<red>(a) = max_visc;
 	}
 }
 
@@ -438,7 +456,8 @@ template<typename CellList> inline void calc_forces(particles & vd, CellList & N
 	// Update the cell-list
 	vd.updateCellList(NN);
 
-	calc_forces_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),NN.toKernel(),W_dap,cbar);
+	//calc_forces_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),NN.toKernel(),W_dap,cbar);
+	CUDA_LAUNCH(calc_forces_gpu,part,vd.toKernel(),NN.toKernel(),W_dap,cbar)
 
 	max_visc = reduce_local<red,_max_>(vd);
 }
@@ -448,11 +467,11 @@ __global__ void max_acceleration_and_velocity_gpu(vector_type vd)
 {
 	auto a = GET_PARTICLE(vd);
 
-	Point<3,real_number> acc(vd.getProp<force>(a));
-	vd.getProp<red>(a) = norm(acc);
+	Point<3,real_number> acc(vd.template getProp<force>(a));
+	vd.template getProp<red>(a) = norm(acc);
 
-	Point<3,real_number> vel(vd.getProp<velocity>(a));
-	vd.getProp<red2>(a) = norm(vel);
+	Point<3,real_number> vel(vd.template getProp<velocity>(a));
+	vd.template getProp<red2>(a) = norm(vel);
 }
 
 void max_acceleration_and_velocity(particles & vd, real_number & max_acc, real_number & max_vel)
@@ -460,7 +479,8 @@ void max_acceleration_and_velocity(particles & vd, real_number & max_acc, real_n
 	// Calculate the maximum acceleration
 	auto part = vd.getDomainIteratorGPU();
 
-	max_acceleration_and_velocity_gpu<<<part.wthr,part.thr>>>(vd.toKernel());
+	// max_acceleration_and_velocity_gpu<<<part.wthr,part.thr>>>(vd.toKernel());
+	CUDA_LAUNCH(max_acceleration_and_velocity_gpu,part,vd.toKernel());
 
 	max_acc = reduce_local<red,_max_>(vd);
 	max_vel = reduce_local<red2,_max_>(vd);
@@ -566,7 +586,8 @@ void verlet_int(particles & vd, real_number dt)
 	real_number dt205 = dt*dt*0.5;
 	real_number dt2 = dt*2.0;
 
-	verlet_int_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),dt,dt2,dt205);
+	// verlet_int_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),dt,dt2,dt205);
+	CUDA_LAUNCH(verlet_int_gpu,part,vd.toKernel(),dt,dt2,dt205);
 
 	//! \cond [remove_marked_part] \endcond
 
@@ -646,7 +667,8 @@ void euler_int(particles & vd, real_number dt)
 
 	real_number dt205 = dt*dt*0.5;
 
-	euler_int_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),dt,dt205);
+	// euler_int_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),dt,dt205);
+	CUDA_LAUNCH(euler_int_gpu,part,vd.toKernel(),dt,dt205);
 
 	// remove the particles
 	remove_marked<red>(vd);
@@ -722,7 +744,8 @@ inline void sensor_pressure(Vector & vd,
         // if the probe is inside the processor domain
 		if (vd.getDecomposition().isLocal(probes.get(i)) == true)
 		{
-			sensor_pressure_gpu<<<1,1>>>(vd.toKernel(),NN.toKernel(),probes.get(i),(real_number *)press_tmp_.toKernel());
+			// sensor_pressure_gpu<<<1,1>>>(vd.toKernel(),NN.toKernel(),probes.get(i),(real_number *)press_tmp_.toKernel());
+			CUDA_LAUNCH_DIM3(sensor_pressure_gpu,1,1,vd.toKernel(),NN.toKernel(),probes.get(i),(real_number *)press_tmp_.toKernel());
 
 			// move calculated pressure on
 			press_tmp_.deviceToHost();
@@ -771,7 +794,7 @@ int main(int argc, char* argv[])
 
 	// You can ignore all these dp/2.0 is a trick to reach the same initialization
 	// of Dual-SPH that use a different criteria to draw particles
-	Box<3,real_number> fluid_box({dp/2.0,dp/2.0,dp/2.0},{0.4+dp/2.0,0.67-dp/2.0,0.3+dp/2.0});
+	Box<3,real_number> fluid_box({dp/2.0f,dp/2.0f,dp/2.0f},{0.4f+dp/2.0f,0.67f-dp/2.0f,0.3f+dp/2.0f});
 
 	// return an iterator to the fluid particles to add to vd
 	auto fluid_it = DrawParticles::DrawBox(vd,sz,domain,fluid_box);
@@ -820,12 +843,12 @@ int main(int argc, char* argv[])
 	}
 
 	// Recipient
-	Box<3,real_number> recipient1({0.0,0.0,0.0},{1.6+dp/2.0,0.67+dp/2.0,0.4+dp/2.0});
-	Box<3,real_number> recipient2({dp,dp,dp},{1.6-dp/2.0,0.67-dp/2.0,0.4+dp/2.0});
+	Box<3,real_number> recipient1({0.0f,0.0f,0.0f},{1.6f+dp/2.0f,0.67f+dp/2.0f,0.4f+dp/2.0f});
+	Box<3,real_number> recipient2({dp,dp,dp},{1.6f-dp/2.0f,0.67f-dp/2.0f,0.4f+dp/2.0f});
 
-	Box<3,real_number> obstacle1({0.9,0.24-dp/2.0,0.0},{1.02+dp/2.0,0.36,0.45+dp/2.0});
-	Box<3,real_number> obstacle2({0.9+dp,0.24+dp/2.0,0.0},{1.02-dp/2.0,0.36-dp,0.45-dp/2.0});
-	Box<3,real_number> obstacle3({0.9+dp,0.24,0.0},{1.02,0.36,0.45});
+	Box<3,real_number> obstacle1({0.9f,0.24f-dp/2.0f,0.0f},{1.02f+dp/2.0f,0.36f,0.45f+dp/2.0f});
+	Box<3,real_number> obstacle2({0.9f+dp,0.24f+dp/2.0f,0.0f},{1.02f-dp/2.0f,0.36f-dp,0.45f-dp/2.0f});
+	Box<3,real_number> obstacle3({0.9f+dp,0.24f,0.0f},{1.02f,0.36f,0.45f});
 
 	openfpm::vector<Box<3,real_number>> holes;
 	holes.add(recipient2);
@@ -877,7 +900,7 @@ int main(int argc, char* argv[])
 
 		++obstacle_box;
 	}
-
+	
 	vd.map();
 
 	// Now that we fill the vector with particles
@@ -886,13 +909,13 @@ int main(int argc, char* argv[])
 	vd.addComputationCosts(md);
 	vd.getDecomposition().decompose();
 	vd.map();
-
+    
 	///////////////////////////
 
 	// Ok the initialization is done on CPU on GPU we are doing the main loop, so first we offload all properties on GPU
 
 	vd.hostToDevicePos();
-	vd.template hostToDeviceProp<type,rho,rho_prev,Pressure,velocity>();
+	vd.template hostToDeviceProp<type,rho,rho_prev,Pressure,velocity,velocity_prev>();
 
 	vd.ghost_get<type,rho,Pressure,velocity>(RUN_ON_DEVICE);
 
diff --git a/example/Vector/7_SPH_dlb_gpu_more_opt/Makefile b/example/Vector/7_SPH_dlb_gpu_more_opt/Makefile
index ef2f64a0929810b20de0696874c7bb0a34bc9c7f..e91b646cd03080625765b240d615eb8eb022403b 100644
--- a/example/Vector/7_SPH_dlb_gpu_more_opt/Makefile
+++ b/example/Vector/7_SPH_dlb_gpu_more_opt/Makefile
@@ -4,15 +4,22 @@ include ../../example.mk
 ### internally the example disable with the preprocessor its code if not compiled with nvcc 
 CUDA_CC=
 CUDA_CC_LINK=
-ifeq (, $(shell which nvcc))
-        CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+ifdef CUDA_ON_CPU
+	CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
         INCLUDE_PATH_NVCC=
         CUDA_CC_LINK=mpic++
-	CUDA_OPTIONS=
+        CUDA_OPTIONS=
 else
-        CUDA_CC=nvcc -ccbin=mpic++
-        CUDA_CC_LINK=nvcc -ccbin=mpic++
-	CUDA_OPTIONS=-use_fast_math  -arch=sm_61 -lineinfo
+	ifeq (, $(shell which nvcc))
+        	CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+        	INCLUDE_PATH_NVCC=
+        	CUDA_CC_LINK=mpic++
+		CUDA_OPTIONS=
+	else
+        	CUDA_CC=nvcc -ccbin=mpic++
+        	CUDA_CC_LINK=nvcc -ccbin=mpic++
+		CUDA_OPTIONS=-use_fast_math  -arch=sm_61 -lineinfo
+	endif
 endif
 
 ifeq ($(PROFILE),ON)
@@ -47,7 +54,7 @@ sph_dlb2: $(OBJ)
 all: sph_dlb
 
 run: sph_dlb_test
-	mpirun -np 2 ./sph_dlb
+	mpirun --oversubscribe -np 2 ./sph_dlb
 
 .PHONY: clean all run
 
diff --git a/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu b/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu
index 2d823d7f4178b32fdf5aa3e3e74896ea73074fb3..4535301905776407eb225b9398c7240a1a1b4b0c 100644
--- a/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu
+++ b/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu
@@ -53,6 +53,8 @@
 #define PRINT_STACKTRACE
 #define STOP_ON_ERROR
 #define OPENMPI
+#define SCAN_WITH_CUB
+#define SORT_WITH_CUB
 //#define SE_CLASS1
 
 //#define USE_LOW_REGISTER_ITERATOR
@@ -277,7 +279,7 @@ inline __device__ __host__  real_number Tensile(real_number r, real_number rhoa,
 	    real_number wqq2=qq*qq;
 	    real_number wqq3=wqq2*qq;
 
-	    wab+=a2*(1.0f-1.5f*wqq2+0.75f*wqq3);
+	    wab=a2*(1.0f-1.5f*wqq2+0.75f*wqq3);
 	}
 
 	//-Tensile correction.
diff --git a/example/Vector/7_SPH_dlb_gpu_opt/Makefile b/example/Vector/7_SPH_dlb_gpu_opt/Makefile
index ef2f64a0929810b20de0696874c7bb0a34bc9c7f..e5b464ed3b5d5bbd477deb304f7cd3d689f06d10 100644
--- a/example/Vector/7_SPH_dlb_gpu_opt/Makefile
+++ b/example/Vector/7_SPH_dlb_gpu_opt/Makefile
@@ -4,17 +4,35 @@ include ../../example.mk
 ### internally the example disable with the preprocessor its code if not compiled with nvcc 
 CUDA_CC=
 CUDA_CC_LINK=
-ifeq (, $(shell which nvcc))
-        CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
-        INCLUDE_PATH_NVCC=
-        CUDA_CC_LINK=mpic++
-	CUDA_OPTIONS=
+ifdef HIP
+        CUDA_CC=hipcc
+        CUDA_OPTIONS=-D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0
+        LIBS_SELECT=$(LIBS)
+        CC=hipcc
+        CUDA_CC_LINK=hipcc
 else
-        CUDA_CC=nvcc -ccbin=mpic++
-        CUDA_CC_LINK=nvcc -ccbin=mpic++
-	CUDA_OPTIONS=-use_fast_math  -arch=sm_61 -lineinfo
+	ifdef CUDA_ON_CPU
+        	CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+        	INCLUDE_PATH_NVCC=
+        	CUDA_CC_LINK=mpic++
+		CUDA_OPTIONS=-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000
+		LIBS_SELECT=$(LIBS_CUDA_ON_CPU)
+	else
+        	ifeq (, $(shell which nvcc))
+                	CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+                	INCLUDE_PATH_NVCC=
+                	CUDA_CC_LINK=mpic++
+			CUDA_OPTIONS=
+        	else
+                	CUDA_CC=nvcc -ccbin=mpic++
+                	CUDA_CC_LINK=nvcc -ccbin=mpic++
+			CUDA_OPTIONS=-use_fast_math  -arch=sm_61 -lineinfo
+        	endif
+		LIBS_SELECT=$(LIBS)
+	endif
 endif
 
+
 ifeq ($(PROFILE),ON)
         CUDA_CC=scorep --nocompiler  --cuda --mpp=mpi nvcc -ccbin=mpic++
         CUDA_CC_LINK=scorep --nocompiler  --cuda --mpp=mpi nvcc -ccbin=mpic++
@@ -39,15 +57,15 @@ sph_dlb_test: sph_dlb
 	$(CC) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 sph_dlb: $(OBJ)
-	$(CUDA_CC_LINK) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
+	$(CUDA_CC_LINK) -o  $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT)
 
 sph_dlb2: $(OBJ)
-	$(CUDA_CC_LINK) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
+	$(CUDA_CC_LINK) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT)
 
 all: sph_dlb
 
 run: sph_dlb_test
-	mpirun -np 2 ./sph_dlb
+	mpirun --oversubscribe -np 2 ./sph_dlb
 
 .PHONY: clean all run
 
diff --git a/example/Vector/7_SPH_dlb_gpu_opt/main.cu b/example/Vector/7_SPH_dlb_gpu_opt/main.cu
index 17f6f7f3d70a6113a315b34f6e333085798fe91c..8dc98d604da3e2dad8e08321b164a8e93fa10f44 100644
--- a/example/Vector/7_SPH_dlb_gpu_opt/main.cu
+++ b/example/Vector/7_SPH_dlb_gpu_opt/main.cu
@@ -45,13 +45,13 @@
 //#define SE_CLASS1
 
 //#define USE_LOW_REGISTER_ITERATOR
-//#define SCAN_WITH_CUB <------ In case you want to use CUB for scan operations
+#define SCAN_WITH_CUB //<------ In case you want to use CUB for scan operations
+#define SORT_WITH_CUB
 //#define EXTERNAL_SET_GPU <----- In case you want to distribute the GPUs differently from the default
 
 #include "Vector/vector_dist.hpp"
 #include <math.h>
 #include "Draw/DrawParticles.hpp"
-#include <cuda_profiler_api.h>
 
 
 
@@ -250,6 +250,7 @@ inline __device__ __host__ void DWab(Point<3,real_number> & dx, Point<3,real_num
     DW.get(2) = factor * dx.get(2);
 }
 
+
 // Tensile correction
 inline __device__ __host__  real_number Tensile(real_number r, real_number rhoa, real_number rhob, real_number prs1, real_number prs2, real_number W_dap)
 {
@@ -268,7 +269,7 @@ inline __device__ __host__  real_number Tensile(real_number r, real_number rhoa,
 	    real_number wqq2=qq*qq;
 	    real_number wqq3=wqq2*qq;
 
-	    wab+=a2*(1.0f-1.5f*wqq2+0.75f*wqq3);
+	    wab=a2*(1.0f-1.5f*wqq2+0.75f*wqq3);
 	}
 
 	//-Tensile correction.
@@ -313,19 +314,19 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap
 	Point<3,real_number> xa = vd.getPos(a);
 
 	// Type of the particle
-	unsigned int typea = vd.getProp<type>(a);
+	unsigned int typea = vd.template getProp<type>(a);
 
 	// Take the mass of the particle dependently if it is FLUID or BOUNDARY
 	//real_number massa = (typea == FLUID)?MassFluid:MassBound;
 
 	// Get the density of the of the particle a
-	real_number rhoa = vd.getProp<rho>(a);
+	real_number rhoa = vd.template getProp<rho>(a);
 
 	// Get the pressure of the particle a
-	real_number Pa = vd.getProp<Pressure>(a);
+	real_number Pa = vd.template getProp<Pressure>(a);
 
 	// Get the Velocity of the particle a
-	Point<3,real_number> va = vd.getProp<velocity>(a);
+	Point<3,real_number> va = vd.template getProp<velocity>(a);
 
 	Point<3,real_number> force_;
 	force_.get(0) = 0.0f;
@@ -348,12 +349,12 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap
 		// if (p == q) skip this particle this condition should be done in the r^2 = 0
 		if (a == b)	{++Np; continue;};
 
-        unsigned int typeb = vd.getProp<type>(b);
+        	unsigned int typeb = vd.template getProp<type>(b);
 
-        real_number massb = (typeb == FLUID)?MassFluid:MassBound;
-        Point<3,real_number> vb = vd.getProp<velocity>(b);
-        real_number Pb = vd.getProp<Pressure>(b);
-        real_number rhob = vd.getProp<rho>(b);
+        	real_number massb = (typeb == FLUID)?MassFluid:MassBound;
+        	Point<3,real_number> vb = vd.template getProp<velocity>(b);
+        	real_number Pb = vd.template getProp<Pressure>(b);
+        	real_number rhob = vd.template getProp<rho>(b);
 
 		// Get the distance between p and q
 		Point<3,real_number> dr = xa - xb;
@@ -387,7 +388,7 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap
 		++Np;
 	}
 
-	vd.getProp<red>(a) = max_visc;
+	vd.template getProp<red>(a) = max_visc;
 
 	vd.template getProp<force>(a)[0] = force_.get(0);
 	vd.template getProp<force>(a)[1] = force_.get(1);
@@ -414,11 +415,11 @@ __global__ void max_acceleration_and_velocity_gpu(vector_type vd)
 {
 	auto a = GET_PARTICLE(vd);
 
-	Point<3,real_number> acc(vd.getProp<force>(a));
-	vd.getProp<red>(a) = norm(acc);
+	Point<3,real_number> acc(vd.template getProp<force>(a));
+	vd.template getProp<red>(a) = norm(acc);
 
-	Point<3,real_number> vel(vd.getProp<velocity>(a));
-	vd.getProp<red2>(a) = norm(vel);
+	Point<3,real_number> vel(vd.template getProp<velocity>(a));
+	vd.template getProp<red2>(a) = norm(vel);
 }
 
 void max_acceleration_and_velocity(particles & vd, real_number & max_acc, real_number & max_vel)
@@ -426,7 +427,7 @@ void max_acceleration_and_velocity(particles & vd, real_number & max_acc, real_n
 	// Calculate the maximum acceleration
 	auto part = vd.getDomainIteratorGPU();
 
-	max_acceleration_and_velocity_gpu<<<part.wthr,part.thr>>>(vd.toKernel());
+	CUDA_LAUNCH(max_acceleration_and_velocity_gpu,part,vd.toKernel());
 
 	max_acc = reduce_local<red,_max_>(vd);
 	max_vel = reduce_local<red2,_max_>(vd);
@@ -529,7 +530,7 @@ void verlet_int(particles & vd, real_number dt)
 	real_number dt205 = dt*dt*0.5;
 	real_number dt2 = dt*2.0;
 
-	verlet_int_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),dt,dt2,dt205);
+	CUDA_LAUNCH(verlet_int_gpu,part,vd.toKernel(),dt,dt2,dt205);
 
 	// remove the particles marked
 	remove_marked<red>(vd);
@@ -605,7 +606,7 @@ void euler_int(particles & vd, real_number dt)
 
 	real_number dt205 = dt*dt*0.5;
 
-	euler_int_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),dt,dt205);
+	CUDA_LAUNCH(euler_int_gpu,part,vd.toKernel(),dt,dt205);
 
 	// remove the particles
 	remove_marked<red>(vd);
@@ -681,7 +682,7 @@ inline void sensor_pressure(Vector & vd,
         // if the probe is inside the processor domain
 		if (vd.getDecomposition().isLocal(probes.get(i)) == true)
 		{
-			sensor_pressure_gpu<<<1,1>>>(vd.toKernel_sorted(),NN.toKernel(),probes.get(i),(real_number *)press_tmp_.toKernel());
+			CUDA_LAUNCH_DIM3(sensor_pressure_gpu,1,1,vd.toKernel_sorted(),NN.toKernel(),probes.get(i),(real_number *)press_tmp_.toKernel());
 
 			vd.merge<Pressure>(NN);
 
@@ -720,17 +721,19 @@ int main(int argc, char* argv[])
     // initialize the library
 	openfpm_init(&argc,&argv);
 
+#if !defined(CUDA_ON_CPU) && !defined(__HIP__)
 	cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
+#endif
 
 	// It contain for each time-step the value detected by the probes
 	openfpm::vector<openfpm::vector<real_number>> press_t;
 	openfpm::vector<Point<3,real_number>> probes;
 
-	probes.add({0.8779,0.3,0.02});
-	probes.add({0.754,0.31,0.02});
+	probes.add({0.8779f,0.3f,0.02f});
+	probes.add({0.754f,0.31f,0.02f});
 
 	// Here we define our domain a 2D box with internals from 0 to 1.0 for x and y
-	Box<3,real_number> domain({-0.05,-0.05,-0.05},{1.7010,0.7065,0.511});
+	Box<3,real_number> domain({-0.05f,-0.05f,-0.05f},{1.7010f,0.7065f,0.511f});
 	size_t sz[3] = {413,179,133};
 
 	// Fill W_dap
@@ -748,7 +751,7 @@ int main(int argc, char* argv[])
 
 	// You can ignore all these dp/2.0 is a trick to reach the same initialization
 	// of Dual-SPH that use a different criteria to draw particles
-	Box<3,real_number> fluid_box({dp/2.0,dp/2.0,dp/2.0},{0.4+dp/2.0,0.67-dp/2.0,0.3+dp/2.0});
+	Box<3,real_number> fluid_box({dp/2.0f,dp/2.0f,dp/2.0f},{0.4f+dp/2.0f,0.67f-dp/2.0f,0.3f+dp/2.0f});
 
 	// return an iterator to the fluid particles to add to vd
 	auto fluid_it = DrawParticles::DrawBox(vd,sz,domain,fluid_box);
@@ -797,12 +800,12 @@ int main(int argc, char* argv[])
 	}
 
 	// Recipient
-	Box<3,real_number> recipient1({0.0,0.0,0.0},{1.6+dp/2.0,0.67+dp/2.0,0.4+dp/2.0});
-	Box<3,real_number> recipient2({dp,dp,dp},{1.6-dp/2.0,0.67-dp/2.0,0.4+dp/2.0});
+	Box<3,real_number> recipient1({0.0f,0.0f,0.0f},{1.6f+dp/2.0f,0.67f+dp/2.0f,0.4f+dp/2.0f});
+	Box<3,real_number> recipient2({dp,dp,dp},{1.6f-dp/2.0f,0.67f-dp/2.0f,0.4f+dp/2.0f});
 
-	Box<3,real_number> obstacle1({0.9,0.24-dp/2.0,0.0},{1.02+dp/2.0,0.36,0.45+dp/2.0});
-	Box<3,real_number> obstacle2({0.9+dp,0.24+dp/2.0,0.0},{1.02-dp/2.0,0.36-dp,0.45-dp/2.0});
-	Box<3,real_number> obstacle3({0.9+dp,0.24,0.0},{1.02,0.36,0.45});
+	Box<3,real_number> obstacle1({0.9f,0.24f-dp/2.0f,0.0f},{1.02f+dp/2.0f,0.36f,0.45f+dp/2.0f});
+	Box<3,real_number> obstacle2({0.9f+dp,0.24f+dp/2.0f,0.0f},{1.02f-dp/2.0f,0.36f-dp,0.45f-dp/2.0f});
+	Box<3,real_number> obstacle3({0.9f+dp,0.24f,0.0f},{1.02f,0.36f,0.45f});
 
 	openfpm::vector<Box<3,real_number>> holes;
 	holes.add(recipient2);
@@ -869,8 +872,7 @@ int main(int argc, char* argv[])
 	// Ok the initialization is done on CPU on GPU we are doing the main loop, so first we offload all properties on GPU
 
 	vd.hostToDevicePos();
-	vd.template hostToDeviceProp<type,rho,rho_prev,Pressure,velocity>();
-
+	vd.template hostToDeviceProp<type,rho,rho_prev,Pressure,velocity,velocity_prev>();
 
 	vd.ghost_get<type,rho,Pressure,velocity>(RUN_ON_DEVICE);
 
@@ -889,7 +891,6 @@ int main(int argc, char* argv[])
 		Vcluster<> & v_cl = create_vcluster();
 		timer it_time;
 
-
 		////// Do rebalancing every 200 timesteps
 		it_reb++;
 		if (it_reb == 300)
@@ -913,7 +914,7 @@ int main(int argc, char* argv[])
 
 		// it sort the vector (doesn not seem to produce some advantage)
 		// note force calculation is anyway sorted calculation
-		vd.make_sort(NN);
+		//vd.make_sort(NN);
 
 		// Calculate pressure from the density
 		EqState(vd);
@@ -922,10 +923,10 @@ int main(int argc, char* argv[])
 
 		vd.ghost_get<type,rho,Pressure,velocity>(RUN_ON_DEVICE);
 
-
 		// Calc forces
 		calc_forces(vd,NN,max_visc,cnt);
 
+
 		// Get the maximum viscosity term across processors
 		v_cl.max(max_visc);
 		v_cl.execute();
diff --git a/example/Vector/7_SPH_dlb_opt/Makefile b/example/Vector/7_SPH_dlb_opt/Makefile
index 4fe69bac6969ee9b0ac435a3a7c2d1874ae77fe4..f11bc9dfcdfef0570796ec719e7aed00afff356c 100644
--- a/example/Vector/7_SPH_dlb_opt/Makefile
+++ b/example/Vector/7_SPH_dlb_opt/Makefile
@@ -13,14 +13,14 @@ sph_dlb_test: OPT += -DTEST_RUN
 sph_dlb_test: sph_dlb
 
 %.o: %.cpp
-	$(CC) -O3 $(OPT) -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 sph_dlb: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
 
 
 run: sph_dlb_test
-	mpirun -np 2 ./sph_dlb
+	mpirun --oversubscribe -np 2 ./sph_dlb
 
 .PHONY: clean all run
 
diff --git a/example/Vector/8_DEM/Makefile b/example/Vector/8_DEM/Makefile
index c72e331b7126b8344d8b0856a6154bfd0fa04613..180412c2047b05c85a3b2cfc23c9cd50ada472c9 100644
--- a/example/Vector/8_DEM/Makefile
+++ b/example/Vector/8_DEM/Makefile
@@ -14,14 +14,14 @@ dem_test: OPT += -DTEST_RUN
 dem_test: all
 
 %.o: %.cpp
-	$(CC) -O3 $(OPT) -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 dem: $(OBJ)
 	$(CC) -o $@ $^ $(OPT) $(CFLAGS) $(LIBS_PATH) $(LIBS)
 
 
 run: dem_test
-	mpirun -np 2 ./dem
+	mpirun --oversubscribe -np 2 ./dem
 
 .PHONY: clean all run
 
diff --git a/example/Vector/9_gpu_cuda_interop/Makefile b/example/Vector/9_gpu_cuda_interop/Makefile
index 696f22c3ce1ffb6410279337197559632cbb1610..0f90d34d705ce4698615cc1b528762eb1163cd13 100644
--- a/example/Vector/9_gpu_cuda_interop/Makefile
+++ b/example/Vector/9_gpu_cuda_interop/Makefile
@@ -1,26 +1,31 @@
 include ../../example.mk
-
-ifeq (, $(shell which nvcc))
-        CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+ifdef CUDA_ON_CPU
+	CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
         INCLUDE_PATH_NVCC=
         CUDA_CC_LINK=mpic++
         CUDA_OPTIONS=
 else
-        CUDA_CC=nvcc -ccbin=mpic++
-        CUDA_CC_LINK=nvcc -ccbin=mpic++
-        CUDA_OPTIONS=-use_fast_math  -arch=sm_61 -lineinfo
+	ifeq (, $(shell which nvcc))
+        	CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+        	INCLUDE_PATH_NVCC=
+        	CUDA_CC_LINK=mpic++
+        	CUDA_OPTIONS=
+	else
+        	CUDA_CC=nvcc -ccbin=mpic++
+        	CUDA_CC_LINK=nvcc -ccbin=mpic++
+        	CUDA_OPTIONS=-use_fast_math  -arch=sm_61 -lineinfo
+	endif
 endif
 
-
 OBJ = main.o
 
 gpu_interop:
 
 %.o: %.cu
-	$(CUDA_CC) -O3 -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH_NVCC)
+	$(CUDA_CC) -O3 -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)
 
 %.o: %.cpp
-	$(CUDA_CC) -O3 $(OPT) -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CUDA_CC) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
 gpu_interop: $(OBJ)
 	$(CUDA_CC_LINK) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
@@ -28,7 +33,7 @@ gpu_interop: $(OBJ)
 all: gpu_interop
 
 run: gpu_interop
-	mpirun -np 2 ./gpu_interop
+	mpirun --oversubscribe -np 2 ./gpu_interop
 
 .PHONY: clean all run
 
diff --git a/gdbgui b/gdbgui
new file mode 160000
index 0000000000000000000000000000000000000000..d39f0e88c48e33fc3fc5aa9f98171d4eae4914cb
--- /dev/null
+++ b/gdbgui
@@ -0,0 +1 @@
+Subproject commit d39f0e88c48e33fc3fc5aa9f98171d4eae4914cb
diff --git a/gpl-3.0.txt b/gpl-3.0.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f288702d2fa16d3cdf0035b15a9fcbc552cd88e7
--- /dev/null
+++ b/gpl-3.0.txt
@@ -0,0 +1,674 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+
+  "This License" refers to version 3 of the GNU General Public License.
+
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+  1. Source Code.
+
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+  2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+  4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+  5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+  6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+  7. Additional Terms.
+
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+  8. Termination.
+
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+  9. Acceptance Not Required for Having Copies.
+
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+  10. Automatic Licensing of Downstream Recipients.
+
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+  11. Patents.
+
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+  12. No Surrender of Others' Freedom.
+
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+  13. Use with the GNU Affero General Public License.
+
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+  14. Revised Versions of this License.
+
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+  If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+  15. Disclaimer of Warranty.
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<https://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/install b/install
index 89ba02f1301c46a09fc3136d6976557df8f7441c..fff9a319fedad10cbe57393abada4183b8ca4b91 100755
--- a/install
+++ b/install
@@ -1,4 +1,5 @@
 #! /bin/bash
+
 source script/help
 source script/discover_os
 source script/show_solutions
@@ -8,6 +9,8 @@ source script/set_mpi
 source script/conf_PETSC
 source script/update_openfpm
 source script/solve_python
+source script/install_gdbgui
+
 
 ### switch to the branch
 #b_switch=`git rev-parse --abbrev-ref HEAD`
@@ -23,7 +26,7 @@ fi
 ## Check that your home is not empty
 
 if [ x"$HOME" == x"" ]; then
-	 echo -e "Your $HOME folder is empty \033[91;5;1m FAILED \033[0m"
+	 echo -e "Your \$HOME folder is empty \033[91;5;1m FAILED \033[0m"
 	 exit 1
 fi
 
@@ -34,8 +37,9 @@ nomake=0
 update_openfpm=0
 upgrade_openfpm=0
 gpu_support=0
+no_lin=0
 
-while getopts di:smghc:nu FLAG; do
+while getopts di:v:smghc:nul FLAG; do
 	case $FLAG in
 	    d)
 	      echo "Disable depencencies installation"
@@ -67,8 +71,16 @@ while getopts di:smghc:nu FLAG; do
 	      ;;
 	    n)
           echo "Upgrading openfpm"
-	      upgrade_openfpm=1
+              upgrade_openfpm=1
+              ;;
+            l)
+	      echo "No linear algebra"
+	      no_lin=1
 	      ;;
+            v)
+              echo "Relocating openfpm_vars file $OPTARG"
+              VARS_FILE_LOCATION=$OPTARG
+              ;;
 	    h)  #show help
       	  HELP
           ;;
@@ -216,7 +228,11 @@ if [ ! -d "$i_dir/PETSC" -o ! -f "$i_dir/PETSC/include/petsc.h" -o ! -d "$i_dir/
     if [ $sq -eq 0 ]; then
       read inst_lin_alg
     else
-      inst_lin_alg="y"
+      if [ $no_lin -eq 1 ]; then
+	  inst_lin_alg="n"
+      else
+          inst_lin_alg="y"
+      fi
     fi
 
     ### PETSC require Python
@@ -376,7 +392,7 @@ else
                 echo "Error the installation of VCDEVEL failed"
                 exit 1
             fi
-            ./script/install_VCDEVEL.sh $i_dir $ncore
+            ./script/install_VCDEVEL.sh $i_dir $ncore $CC $CXX
             configure_options=" $configure_options --with-vcdevel=$i_dir/VCDEVEL "
             VCDEVEL_installed=1
         elif [ $conf_err -ne 0 ]; then
@@ -400,25 +416,35 @@ fi
 install_base=$(cat install_dir)
 openmp_flags="$(cat openmp_flags)"
 cuda_include_dirs=$(cat cuda_include)
-mpi_include_dirs=$(cat mpi_include)
-mpi_libs=$(cat mpi_libs)
+hip_enabled=$(cat hip_enabled)
+if [ x"$hip_enabled" == x"1" ]; then
+	mpi_include_dirs=$(cat mpi_include)
+	mpi_libs=$(cat mpi_libs)
+fi
+cuda_on_cpu=$(cat cuda_on_cpu)
+optional_boost=$(cat optional_boost_libs)
 
 if [ -d "$i_dir/HDF5/lib" ]; then
   hdf5_lib=$i_dir/HDF5/lib
+  hdf5_lib_dir=-L$i_dir/HDF5/lib
 elif [ -d "$i_dir/HDF5/lib64" ]; then
   hdf5_lib=$i_dir/HDF5/lib64
+  hdf5_lib_dir=-L$i_dir/HDF5/lib64
 fi
 
-echo "INCLUDE_PATH=$cuda_include_dirs -Wno-deprecated-declarations $openmp_flags  -I.  -I$install_base/openfpm_numerics/include -I$install_base/openfpm_pdata/include/config -I$install_base/openfpm_pdata/include -I$install_base/openfpm_data/include -I$install_base/openfpm_vcluster/include -I$install_base/openfpm_io/include -I$install_base/openfpm_devices/include -I$i_dir/VCDEVEL/include  -I$i_dir/METIS/include -I$i_dir/PARMETIS/include -I$i_dir/BOOST/include -I$i_dir/HDF5/include -I$i_dir/LIBHILBERT/include  $lin_alg_inc" > example.mk
-echo "LIBS_PATH=$openmp_flags -L$install_base/openfpm_devices/lib -L$install_base/openfpm_pdata/lib  -L$install_base/openfpm_vcluster/lib -L$i_dir/VCDEVEL/lib  -L$i_dir/METIS/lib -L$i_dir/PARMETIS/lib  -L$i_dir/BOOST/lib -L$hdf5_lib -L$i_dir/LIBHILBERT/lib  $lin_alg_dir " >> example.mk
+echo "INCLUDE_PATH=$mpi_include_dirs $cuda_include_dirs -Wno-deprecated-declarations $openmp_flags  -I.  -I$install_base/openfpm_numerics/include -I$install_base/openfpm_pdata/include/config -I$install_base/openfpm_pdata/include -I$install_base/openfpm_data/include -I$install_base/openfpm_vcluster/include -I$install_base/openfpm_io/include -I$install_base/openfpm_devices/include -I$i_dir/VCDEVEL/include  -I$i_dir/METIS/include -I$i_dir/PARMETIS/include -I$i_dir/BOOST/include -I$i_dir/HDF5/include -I$i_dir/LIBHILBERT/include  $lin_alg_inc" > example.mk
+echo "LIBS_PATH=$mpi_libs $openmp_flags -L$install_base/openfpm_devices/lib -L$install_base/openfpm_pdata/lib  -L$install_base/openfpm_vcluster/lib -L$i_dir/VCDEVEL/lib  -L$i_dir/METIS/lib -L$i_dir/PARMETIS/lib  -L$i_dir/BOOST/lib $hdf5_lib_dir -L$i_dir/LIBHILBERT/lib  $lin_alg_dir " >> example.mk
+if [ x"$cuda_on_cpu" == x"YES" ]; then
+   echo "CUDA_ON_CPU=YES" >> example.mk
+fi
 if [ x"$gpu_support" == x"1" ]; then
-    echo "LIBS=-lvcluster -lofpm_pdata -lofpmmemory -lparmetis -lmetis -lboost_iostreams -lboost_program_options -lhdf5 -llibhilbert -lVc  $(cat cuda_lib) $lin_alg_lib -ldl -lboost_filesystem -lboost_system" >> example.mk
-    echo "LIBS_SE2=-lvcluster -lofpmmemory_se2 -lparmetis -lmetis -lboost_iostreams -lboost_program_options -lhdf5 -llibhilbert -lVc  $(cat cuda_lib) $lin_alg_lib -lboost_filesystem -lboost_system" >> example.mk
+    echo "LIBS=$mpi_libs -lvcluster -lofpm_pdata -lofpmmemory -lparmetis -lmetis -lboost_iostreams -lboost_program_options -lhdf5 -llibhilbert -lVc  $(cat cuda_lib) $lin_alg_lib -ldl -lboost_filesystem -lboost_system" >> example.mk
+    echo "LIBS_CUDA_ON_CPU=-lvcluster_cuda_on_cpu -lofpmmemory_cuda_on_cpu -lparmetis -lmetis -lboost_iostreams -lboost_program_options -lhdf5 -llibhilbert -lVc  $(cat cuda_lib) $lin_alg_lib -lboost_filesystem -lboost_system -lboost_context" >> example.mk
 else
-    echo "LIBS=-lvcluster -lofpm_pdata -lofpmmemory -lparmetis -lmetis -lboost_iostreams -lboost_program_options -lhdf5 -llibhilbert -lVc  $lin_alg_lib -ldl -lboost_filesystem -lboost_system" >> example.mk
-    echo "LIBS_SE2=-lvcluster -lofpmmemory_se2 -lparmetis -lmetis -lboost_iostreams -lboost_program_options -lhdf5 -llibhilbert -lVc  $lin_alg_lib -lboost_filesystem -lboost_system" >> example.mk
+    echo "LIBS=-lvcluster -lofpm_pdata -lofpmmemory -lparmetis -lmetis -lboost_iostreams -lboost_program_options -lhdf5 -llibhilbert -lVc  $lin_alg_lib -ldl -lboost_filesystem -lboost_system $optional_boost" >> example.mk
+    echo "LIBS_CUDA_ON_CPU=\$(LIBS)" >> example.mk
 fi
-echo "INCLUDE_PATH_NVCC=-Xcompiler="-Wno-deprecated-declarations" $(cat openmp_flags) "$(cat cuda_options)" -I. -I$install_base/openfpm_numerics/include -I$install_base/openfpm_pdata/include/config -I$install_base/openfpm_pdata/include -I$install_base/openfpm_data/include -I$install_base/openfpm_vcluster/include -I$install_base/openfpm_io/include -I$install_base/openfpm_devices/include -I$i_dir/METIS/include -I$i_dir/PARMETIS/include -I$i_dir/BOOST/include -I$i_dir/HDF5/include -I$i_dir/LIBHILBERT/include  $lin_alg_inc" >> example.mk
+echo "INCLUDE_PATH_NVCC=-Xcompiler="-Wno-deprecated-declarations" $(cat openmp_flags) "$(cat cuda_options)" $mpi_include_dirs -I. -I$install_base/openfpm_numerics/include -I$install_base/openfpm_pdata/include/config -I$install_base/openfpm_pdata/include -I$install_base/openfpm_data/include -I$install_base/openfpm_vcluster/include -I$install_base/openfpm_io/include -I$install_base/openfpm_devices/include -I$i_dir/METIS/include -I$i_dir/PARMETIS/include -I$i_dir/BOOST/include -I$i_dir/HDF5/include -I$i_dir/LIBHILBERT/include  $lin_alg_inc" >> example.mk
 cp example.mk src/example.mk
 cp example.mk example/example.mk
 
@@ -431,6 +457,13 @@ if [ $? -ne 0 ]; then
   conf_err=1
 fi
 
+################ Parallel debugger #######################
+
+echo "Installing parallel debugger"
+install_gdbgui $i_dir
+
+###########################################################
+
 echo "Command used to configure"
 echo ""
 echo -e "\033[1m ./configure $options $configure_options \033[0m "
@@ -444,6 +477,7 @@ fi
 
 bash_path="export PATH=\""
 
+
 echo -e "\033[1;34;5m ---------------------------------------  \033[0m"
 echo -e "\033[1;34;5m --------- INSTALLATION REPORT ---------  \033[0m"
 echo ""
@@ -532,6 +566,7 @@ if [ x"$platform" == x"cygwin" ]; then
 	bash_path="$bash_path:$i_dir/BOOST/bin:$i_dir/HDF5/bin"
 fi
 
+bash_path="$bash_path:$install_base/gdbgui/bin:"
 bash_path="$bash_path:\$PATH\""
 bash_library="$bash_library\""
 
@@ -539,11 +574,16 @@ bash_library="$bash_library\""
 
 # in cygwin we have to add to PATH additional directories
 
-echo "$bash_path" > $HOME/openfpm_vars
-echo "$bash_library" >> $HOME/openfpm_vars
+if [ x"$VARS_FILE_LOCATION" == x"" ]; then
+	VARS_FILE_LOCATION=$HOME
+fi
+
+echo "$bash_path" > $VARS_FILE_LOCATION/openfpm_vars
+echo "$bash_library" >> $VARS_FILE_LOCATION/openfpm_vars
+echo "export PURE_PYTHON=1" >> $VARS_FILE_LOCATION/openfpm_vars
 
 if [ x"$platform" = x"osx" ]; then
-    echo "TMPDIR=/tmp/" >> $HOME/openfpm_vars
+    echo "TMPDIR=/tmp/" >> $VARS_FILE_LOCATION/openfpm_vars
 fi
 
 echo -e "$installation_report"
@@ -562,12 +602,12 @@ fi
 
 echo ""
 if [ x"$platform" = x"linux" ]; then
-  echo "Before run any openfpm program you must execute the following command. A simple way would be to append this line at the end of your $HOME/.bashrc"
+  echo "Before run any openfpm program you must execute the following command. A simple way would be to append this line at the end of your $VARS_FILE_LOCATION/.bashrc"
 else
-  echo "Before run any openfpm program you must execute the following command. A simple way would be to append this line at the end of your $HOME/.bash_profile"
+  echo "Before run any openfpm program you must execute the following command. A simple way would be to append this line at the end of your $VARS_FILE_LOCATION/.bash_profile"
 fi
 echo ""
-echo -e "\033[1m source $HOME/openfpm_vars \033[0m"
+echo -e "\033[1m source $VARS_FILE_LOCATION/openfpm_vars \033[0m"
 echo ""
 echo ""
 echo -e "Remember to do: \033[91;5;1m make install \033[0m"
diff --git a/install_MPI_mpich.sh b/install_MPI_mpich.sh
index d346eb6fa90ce6970834fd451438b2329cdeb7ff..a8ce42181005dc020be834e220e144a2f34c83fe 100755
--- a/install_MPI_mpich.sh
+++ b/install_MPI_mpich.sh
@@ -3,9 +3,32 @@
 # check if the directory $1/MPI exist
 
 if [ -d "$1/MPI" ]; then
-  echo "MPI already installed"
-  exit 0
+        version=$(cat $1/MPI/version)
+        if [ x"$version" != x"10"  ]; then
+            echo -e "\033[1;34;5m  -------------------------------------------------------------------------------------- \033[0m"
+            echo -e "\033[1;34;5m  MPICH has been updated to version 3.3.0, the component will be updated automatically      \033[0m"
+            echo -e "\033[1;34;5m  -------------------------------------------------------------------------------------- \033[0m"
+            sleep 5
+            rm -rf $1/MPI/include
+            rm -rf $1/MPI/lib
+            rm -rf $1/MPI/bin
+            rm -rf $1/MPI/etc
+            rm -rf $1/MPI/share
+            rm -rf $1/MPI
+            rm -rf $1/HDF5
+            rm -rf $1/ZLIB
+            rm -rf $1/PARMETIS
+            rm -rf $1/PETSC
+            rm -rf $1/TRILINOS
+            rm -rf $1/HYPRE
+            rm -rf $1/MUMPS
+            rm -rf $1/SUPERLU_DIST
+        else
+                echo "MPI already installed"
+                exit 0
+        fi
 fi
+
 rm -rf mpich-3.3
 rm mpich-3.3.tar.gz
 wget http://ppmcore.mpi-cbg.de/upload/mpich-3.3.tar.gz
@@ -37,5 +60,5 @@ make -j $2
 make install
 
 # Mark the installation
-echo 6 > $1/MPI/version
+echo 10 > $1/MPI/version
 
diff --git a/openfpm_data b/openfpm_data
index 42396dea71e4df42c7aa95ca2c394db9317927c1..d78de3919144686d1e88a2c46a88ad3fa2a79043 160000
--- a/openfpm_data
+++ b/openfpm_data
@@ -1 +1 @@
-Subproject commit 42396dea71e4df42c7aa95ca2c394db9317927c1
+Subproject commit d78de3919144686d1e88a2c46a88ad3fa2a79043
diff --git a/openfpm_devices b/openfpm_devices
index aa00b03d64ff91d34060ae4d8b01a5b19896f5a7..a7c955b6720dd482b2d2258b27dd4e6148230792 160000
--- a/openfpm_devices
+++ b/openfpm_devices
@@ -1 +1 @@
-Subproject commit aa00b03d64ff91d34060ae4d8b01a5b19896f5a7
+Subproject commit a7c955b6720dd482b2d2258b27dd4e6148230792
diff --git a/openfpm_io b/openfpm_io
index 5e408377640e8bd56b7d5e1905aca49c34655656..88095d3038172a62770f6320cfce84f82f25fed0 160000
--- a/openfpm_io
+++ b/openfpm_io
@@ -1 +1 @@
-Subproject commit 5e408377640e8bd56b7d5e1905aca49c34655656
+Subproject commit 88095d3038172a62770f6320cfce84f82f25fed0
diff --git a/openfpm_pdata.doc b/openfpm_pdata.doc
index 1ecffa9ccc680a2e2aafe5aa285dbff1a449d816..c29086dba9ff93b6146095ab3273bb9128d9b357 100644
--- a/openfpm_pdata.doc
+++ b/openfpm_pdata.doc
@@ -38,7 +38,7 @@ PROJECT_NAME           = "OpenFPM_pdata"
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER         = 3.1.0
+PROJECT_NUMBER         = 3.3.0
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
diff --git a/openfpm_vcluster b/openfpm_vcluster
index 98c601ed8d8b6f17c349ea479aa98ba22b90a069..974a494a7cac5d47ae04921393d09f57fd3039f8 160000
--- a/openfpm_vcluster
+++ b/openfpm_vcluster
@@ -1 +1 @@
-Subproject commit 98c601ed8d8b6f17c349ea479aa98ba22b90a069
+Subproject commit 974a494a7cac5d47ae04921393d09f57fd3039f8
diff --git a/script/detect_gcc b/script/detect_gcc
index 24a5a5347a15799f827eca81ccbc4c82faefa6b2..0cbcf16b8e7531f342245f6b5388a56992d381ac 100755
--- a/script/detect_gcc
+++ b/script/detect_gcc
@@ -218,7 +218,14 @@ function detect_compiler()
         echo "Two different valid compilers has been found please choose one"
         commands[0]="icpc"
         commands[1]="g++"
-        possible_solutions "${commands[@]}"
+
+	choice=$(cat default_choice_compiler)
+	if [ -f default_choice_compiler ]; then
+		possible_solutions_command=$(cat default_choice_compiler)
+		echo "Choosen: $possible_solutions_command"
+	else
+		possible_solutions "${commands[@]}"
+	fi
         if [ x"$possible_solutions_command" == x"icpc" ]; then
           CXX=icpc
           CC=icc
diff --git a/script/download_MPI.sh b/script/download_MPI.sh
index cd8113605a9055d900406a0d9420b1dd0ae286b9..87c15208e4449bf7ca1a8b0a74bcf82f1c7a880f 100755
--- a/script/download_MPI.sh
+++ b/script/download_MPI.sh
@@ -2,8 +2,8 @@
 
 # check if the directory $1/MPI exist
 
-rm -rf openmpi-4.0.4
-rm openmpi-4.0.4.tar.gz
-wget http://ppmcore.mpi-cbg.de/upload/openmpi-4.0.4.tar.gz
-tar -xvf openmpi-4.0.4.tar.gz
+rm -rf openmpi-4.1.1
+rm openmpi-4.1.1.tar.gz
+wget http://ppmcore.mpi-cbg.de/upload/openmpi-4.1.1.tar.gz
+tar -xvf openmpi-4.1.1.tar.gz
 
diff --git a/script/help b/script/help
index d6492daf9208d3447c4ab4059a8248f1a61941e9..8ef53c443a0d832020cc14c62d1e0b50b92c0739 100755
--- a/script/help
+++ b/script/help
@@ -14,6 +14,7 @@ function HELP {
   echo -e "    \033[1;34m-c\033[0m foward this options to configure"
   echo -e "    \033[1;34m-h\033[0m Displays this help message"\\n
   echo -e "    \033[1;34m-m\033[0m Skip compilation"\\n
+  echo -e "    \033[1;34m-l\033[0m do not install linear algebra"\\n
   echo -e " \033[1mExample:\033[0m ./install -i /dependencies/here -s -c\"some_options someother_option\""\\n
   exit 1
 }
diff --git a/script/install_BOOST.sh b/script/install_BOOST.sh
index 723da2c4458ce197211398b6687db9e0bad50675..f401534545f4d58be883d31bc027b75a976b7b82 100755
--- a/script/install_BOOST.sh
+++ b/script/install_BOOST.sh
@@ -7,9 +7,10 @@ if [ -d "$1/BOOST" ]; then
   exit 0
 fi
 
-wget http://ppmcore.mpi-cbg.de/upload/boost_1_72_0.tar.bz2
-tar -xvf boost_1_72_0.tar.bz2
-cd boost_1_72_0
+rm boost_1_75_0.tar.bz2
+wget http://ppmcore.mpi-cbg.de/upload/boost_1_75_0.tar.bz2
+tar -xvf boost_1_75_0.tar.bz2
+cd boost_1_75_0
 if [ x"$4" != x"" ]; then
 	if [ -f $HOME/user-config.jam ]; then
 		mv $HOME/user-config.jam $HOME/user-config.jam_bck
@@ -23,10 +24,10 @@ fi
 ./bootstrap.sh --with-toolset=$3
 mkdir $1/BOOST
 ./b2 -j $2 install --prefix=$1/BOOST
-rm -rf boost_1_72_0
+rm -rf boost_1_75_0
 
 if [ -f $HOME/user-config.jam_bck ]; then
 	mv $HOME/user-config.jam_bck $HOME/user-config.jam
 fi
-rm -rf boost_1_72_0.tar.bz2
+rm -rf boost_1_75_0.tar.bz2
 
diff --git a/script/install_EIGEN.sh b/script/install_EIGEN.sh
index b42a5ce9de72621f09bd991b7cc4384b44cf8841..342ee3426be9eb86afcd1ca99f70669915fb0dfd 100755
--- a/script/install_EIGEN.sh
+++ b/script/install_EIGEN.sh
@@ -32,4 +32,4 @@ rm -rf eigen-eigen-b3f3d4950030/
 touch $1/EIGEN/signature_of_eigen3_matrix_library
 
 # Mark the installation
-echo 2 > $1/EIGEN/version
+echo 2 > $1/EIGEN/version_eigen_lib
diff --git a/script/install_HDF5.sh b/script/install_HDF5.sh
index f44ac3df67fe6ff48e140fe16cd0577bf2d48fcb..6477e1869f69f6904cff24b7229249e0a5ff2725 100755
--- a/script/install_HDF5.sh
+++ b/script/install_HDF5.sh
@@ -24,7 +24,7 @@ if [ ! -d "$1/ZLIB"  -a x"$platform" != x"cygwin" ]; then
 
   CC=mpicc ./configure --prefix=$1/ZLIB
   make -j $2
-
+  cd ..
   if [ $? -eq 0 ]; then
     make check install
   else
@@ -37,9 +37,9 @@ else
 fi
 
 ### 1.8.19 does not compile on CYGWIN
-wget http://ppmcore.mpi-cbg.de/upload/hdf5-1.10.6.tar.gz
-tar -xf hdf5-1.10.6.tar.gz
-cd hdf5-1.10.6
+wget http://ppmcore.mpi-cbg.de/upload/hdf5-1.10.7.tar.gz
+tar -xf hdf5-1.10.7.tar.gz
+cd hdf5-1.10.7
 
 if [ x"$platform" != x"cygwin" ]; then
         CC=mpicc ./configure --with-zlib=$1/ZLIB --enable-parallel --prefix=$1/HDF5
@@ -54,4 +54,4 @@ if [ $? -ne 0 ]; then
     echo "HDF5 error installing"
     exit 0
 fi
-echo 2 > $1/HDF5/version
+echo 3 > $1/HDF5/version
diff --git a/script/install_MPI.sh b/script/install_MPI.sh
index 57ae2b2c7ee86c3580273dda3637ee7782c7017e..66fa52740360c63d247543d9664dc41d8006b642 100755
--- a/script/install_MPI.sh
+++ b/script/install_MPI.sh
@@ -8,7 +8,7 @@ if [ -d "$1/MPI" ]; then
 fi
 
 ./script/download_MPI.sh
-cd openmpi-4.0.4
+cd openmpi-4.1.1
 
 if [ -f ../mpi_add_options ]; then
 	mpi_options=$(cat ../mpi_add_options)
@@ -21,14 +21,14 @@ if [ x"$3" == x"1" ]; then
    # Detect where is nvcc
    cuda_location=$(dirname $(dirname $(which nvcc)) )
 
-   ./configure $mpi_options --with-cuda=$cuda_location --prefix=$1/MPI --enable-mpi-fortran=yes CC=$4 CXX=$5 F77=$6 FC=$7 $8
+   ./configure --with-hwloc=internal --with-libevent=internal $mpi_options --with-cuda=$cuda_location --prefix=$1/MPI --enable-mpi-fortran=yes CC=$4 CXX=$5 F77=$6 FC=$7 $8
 else
    echo "Installing MPI without GPU support"
-   ./configure $mpi_options --prefix=$1/MPI --enable-mpi-fortran=yes CC=$4 CXX=$5 F77=$6 FC=$7 $8
+   ./configure --with-hwloc=internal --with-libevent=internal $mpi_options --prefix=$1/MPI --enable-mpi-fortran=yes CC=$4 CXX=$5 F77=$6 FC=$7 $8
 fi
 make -j $2
 make install
 
 # Mark the installation
-echo 6 > $1/MPI/version
+echo 10 > $1/MPI/version
 
diff --git a/script/install_OPENBLAS.sh b/script/install_OPENBLAS.sh
index 59ba51656e322e61b7c52098c334e60b206f0c81..0b93190d753dafddaa576e3a94e49690be8a34b1 100755
--- a/script/install_OPENBLAS.sh
+++ b/script/install_OPENBLAS.sh
@@ -1,5 +1,9 @@
 #! /bin/bash
 
+source script/discover_os
+
+discover_os
+
 # check if the directory $1/OPENBLAS exist
 
 if [ -d "$1/OPENBLAS" ]; then
@@ -7,10 +11,19 @@ if [ -d "$1/OPENBLAS" ]; then
   exit 0
 fi
 
-rm -rf OpenBLAS-0.3.10.tar.gz
-wget http://ppmcore.mpi-cbg.de/upload/OpenBLAS-0.3.10.tar.gz
-tar -xf OpenBLAS-0.3.10.tar.gz
-cd OpenBLAS-0.3.10
+if [ x"$platform" == x"darwin" ]; then
+  rm -rf OpenBLAS-0.3.10
+  rm -rf OpenBLAS-0.3.10.tar.gz
+  wget http://ppmcore.mpi-cbg.de/upload/OpenBLAS-0.3.10.tar.gz
+  tar -xf OpenBLAS-0.3.10.tar.gz
+  cd OpenBLAS-0.3.10
+else
+  rm -rf OpenBLAS-0.3.13
+  rm -rf OpenBLAS-0.3.13.tar.gz
+  wget http://ppmcore.mpi-cbg.de/upload/OpenBLAS-0.3.13.tar.gz
+  tar -xf OpenBLAS-0.3.13.tar.gz
+  cd OpenBLAS-0.3.13
+fi
 
 #wget http://ppmcore.mpi-cbg.de/upload/openblas.diff
 #patch -p1 < openblas.diff
@@ -26,8 +39,9 @@ make install PREFIX=$1/OPENBLAS
 if [ ! "$(ls -A $1/OPENBLAS)" ]; then
    rm -rf $1/OPENBLAS
 else
+   rm -rf OpenBLAS-0.3.13
    rm -rf OpenBLAS-0.3.10
-   echo 2 > $1/OPENBLAS/version
+   echo 3 > $1/OPENBLAS/version
    exit 0
 fi
 
diff --git a/script/install_PETSC.sh b/script/install_PETSC.sh
index 6f98250c9714eb419784ec1e8b960d268862584b..12079ff67f90b1f8e8299405ba9316b1d600f8b3 100755
--- a/script/install_PETSC.sh
+++ b/script/install_PETSC.sh
@@ -19,7 +19,7 @@ source script/solve_python
 discover_os
 
 function test_configure_options() {
-  cd petsc-3.13.3
+  cd petsc-3.14.5
   $python_command ./configure COPTFLAGS="-O3 -g" CXXOPTFLAGS="-O3 -g" FOPTFLAGS="-O3 -g" $ldflags_petsc  --with-cxx-dialect=C++11 $petsc_openmp --with-mpi-dir=$mpi_dir $configure_options2 --with-debugging=0
   error=$?
   cd ..
@@ -49,14 +49,14 @@ fi
 
 #### Download and uncompress petsc
 
-rm petsc-lite-3.13.3.tar.gz
-rm -rf petsc-3.13.3
-wget http://ppmcore.mpi-cbg.de/upload/petsc-lite-3.13.3.tar.gz
+rm petsc-lite-3.14.5.tar.gz
+rm -rf petsc-3.14.5
+wget http://ppmcore.mpi-cbg.de/upload/petsc-lite-3.14.5.tar.gz
 if [ $? -ne 0 ]; then
   echo -e "\033[91;5;1m FAILED! Installation requires an Internet connection \033[0m"
   exit 1
 fi
-tar -xf petsc-lite-3.13.3.tar.gz
+tar -xf petsc-lite-3.14.5.tar.gz
 
 ####
 
@@ -143,15 +143,15 @@ if [ $error -eq 0 ]; then
 fi
 
 
-rm petsc-lite-3.13.3.tar.gz
-rm -rf petsc-3.13.3
-wget http://ppmcore.mpi-cbg.de/upload/petsc-lite-3.13.3.tar.gz
+rm petsc-lite-3.14.5.tar.gz
+rm -rf petsc-3.14.5
+wget http://ppmcore.mpi-cbg.de/upload/petsc-lite-3.14.5.tar.gz
 if [ $? -ne 0 ]; then
   echo -e "\033[91;5;1m FAILED! Installation requires an Internet connection \033[0m"
   exit 1
 fi
-tar -xf petsc-lite-3.13.3.tar.gz
-cd petsc-3.13.3
+tar -xf petsc-lite-3.14.5.tar.gz
+cd petsc-3.14.5
 
 if [ x"$CXX" != x"icpc" ]; then
 
@@ -176,7 +176,6 @@ else
   }
 
   $python_command ./configure COPTFLAGS="-O3 -g" CXXOPTFLAGS="-O3 -g" FOPTFLAGS="-O3 -g" $ldflags_petsc  --with-cxx-dialect=C++11 $petsc_openmp --with-mpi-dir=$mpi_dir $configure_options --prefix=$1/PETSC --with-debugging=0
-
 fi
 
 make all
@@ -187,7 +186,7 @@ if [ ! "$(ls -A $1/PETSC)" ]; then
    rm -rf $1/PETSC
 else
    #Mark the installation
-   echo 4 > $1/PETSC/version
+   echo 6 > $1/PETSC/version
    exit 0
 fi
 
diff --git a/script/install_SUITESPARSE.sh b/script/install_SUITESPARSE.sh
index d2ceafc1561aff46c37b7c2cae23cd0171e9c8ca..3aa4bedd8ed917a21ca460ddec73de0e27865fb5 100755
--- a/script/install_SUITESPARSE.sh
+++ b/script/install_SUITESPARSE.sh
@@ -6,7 +6,7 @@ source script/discover_os
 discover_os
 
 # check if the directory $1/SUITESPARSE exist
-
+rm -rf SuiteSparse-5.7.2
 if [ -d "$1/SUITESPARSE"  -a -f "$1/SUITESPARSE/include/umfpack.h" ]; then
   echo "SUITESPARSE is already installed"
   exit 0
@@ -35,16 +35,15 @@ if [ x"$platform" == x"cygwin" ]; then
 fi
 
 echo "Compiling SuiteSparse without CUDA (old variable $CUDA)"
-LDLIBS="$STS_LIB -lm" LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$1/OPENBLAS/lib"  make library -j $2 "CUDA=no" "BLAS=-L$1/OPENBLAS/lib -lopenblas -pthread" "LAPACK=-lopenblas"
+LDLIBS="$STS_LIB -lm" LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$1/OPENBLAS/lib"  make library -j $2 "CC=$CC" "CXX=$CXX" "CUDA=no" "BLAS=-L$1/OPENBLAS/lib -lopenblas -pthread" "LAPACK=-lopenblas"
 if [ $? != 0 ]; then
   echo "Failed to compile SuiteSparse"
   exit 1
 fi
 echo "Making library"
-make library "CUDA=no" "INSTALL=$1/SUITESPARSE" "INSTALL_LIB=$1/SUITESPARSE/lib" "INSTALL_INCLUDE=$1/SUITESPARSE/include" "BLAS=-L$1/OPENBLAS/lib -lopenblas -pthread" "LAPACK="
+make library "CC=$CC" "CXX=$CXX" "CUDA=no" "INSTALL=$1/SUITESPARSE" "INSTALL_LIB=$1/SUITESPARSE/lib" "INSTALL_INCLUDE=$1/SUITESPARSE/include" "BLAS=-L$1/OPENBLAS/lib -lopenblas -pthread" "LAPACK="
 echo "Making install"
-make install "CUDA=no" "INSTALL=$1/SUITESPARSE" "INSTALL_LIB=$1/SUITESPARSE/lib" "INSTALL_INCLUDE=$1/SUITESPARSE/include" "BLAS=-L$1/OPENBLAS/lib -lopenblas -pthread" "LAPACK="
+make install "CC=$CC" "CXX=$CXX" "CUDA=no" "INSTALL=$1/SUITESPARSE" "INSTALL_LIB=$1/SUITESPARSE/lib" "INSTALL_INCLUDE=$1/SUITESPARSE/include" "BLAS=-L$1/OPENBLAS/lib -lopenblas -pthread" "LAPACK="
 # Mark the installation
 echo 2 > $1/SUITESPARSE/version
-rm -rf SuiteSparse
 rm SuiteSparse-5.7.2.tar.gz
diff --git a/script/install_VCDEVEL.sh b/script/install_VCDEVEL.sh
index 177e622e095d35a58a5bf93cb840baa80e04d030..28ae094732ee7c9ef27dda04c4290c6f14cae9d3 100755
--- a/script/install_VCDEVEL.sh
+++ b/script/install_VCDEVEL.sh
@@ -13,7 +13,7 @@ tar -xf Vc-1.4.1.tar.gz
 cd Vc-1.4.1
 mkdir build
 cd build
-cmake -DCMAKE_INSTALL_PREFIX:PATH=$1/VCDEVEL ..
+cmake -DCMAKE_INSTALL_PREFIX:PATH=$1/VCDEVEL -DCMAKE_C_COMPILER=$3 -DCMAKE_CXX_COMPILER=$4 ..
 make
 make install
 
diff --git a/script/install_gdbgui b/script/install_gdbgui
new file mode 100755
index 0000000000000000000000000000000000000000..63d5098805dd320acba9bbeb871460ea7d282b48
--- /dev/null
+++ b/script/install_gdbgui
@@ -0,0 +1,162 @@
+#! /bin/bash
+
+function install_gdbgui() {
+source script/solve_pip3
+source script/solve_pip3_nox
+source script/solve_npm
+source script/solve_npm_yarn
+source script/show_solutions
+source script/solve_gdbserver
+source script/discover_package_manager
+source script/solve_nodejs
+source script/solve_python_devel
+source script/solve_llvm
+
+node_js_inst=0
+
+if [ x"$platform" == x"osx" ]; then
+	solve_llvm $platform
+
+	echo "OSX installing lldb-mi"
+	rm lldb-mi-master.tar.gz
+	wget http://ppmcore.mpi-cbg.de/upload/lldb-mi-master.tar.gz
+	tar -xf lldb-mi-master.tar.gz
+	cd lldb-mi-master
+	mkdir build
+	cd build
+	cmake ../. -DCMAKE_PREFIX_PATH=$1/LLDB-MI
+	make 
+fi
+
+solve_python_devel $platform
+
+if haveProg node; then
+
+  dgc_major=$(node --version | grep v | sed 's/v\([0-9][0-9]*\)\.\([0-9][0-9]*\)\.\([0-9][0-9]*\)/\1/g')
+  dgc_middle=$(node --version | grep v | sed 's/v\([0-9][0-9]*\)\.\([0-9][0-9]*\)\.\([0-9][0-9]*\)/\2/g')
+  dgc_minor=$(node --version | grep v | sed 's/v\([0-9][0-9]*\)\.\([0-9][0-9]*\)\.\([0-9][0-9]*\)/\3/g')
+
+  if [[ ( $dgc_major -ge 10 ) && ( $dgc_middle -ge 14 ) ]]; then
+    echo -e "node\033[92;1m SUCCESS \033[0m"
+    node_js_inst=1
+  fi
+else
+  echo >&2
+  echo -e "node\033[91;5;1m FAILED \033[0m"
+  node_js_inst=0
+fi
+
+if [ x"$node_js_inst" = x"0" ]; then
+  echo "OpenFPM require node.js but it's not installed, searching a solution... "
+  solve_nodejs $platform
+
+  if haveProg node; then
+
+    dgc_major=$(node --version | grep v | sed 's/v\([0-9][0-9]*\)\.\([0-9][0-9]*\)\.\([0-9][0-9]*\)/\1/g')
+    dgc_middle=$(node --version | grep v | sed 's/v\([0-9][0-9]*\)\.\([0-9][0-9]*\)\.\([0-9][0-9]*\)/\2/g')
+    dgc_minor=$(node --version | grep v | sed 's/v\([0-9][0-9]*\)\.\([0-9][0-9]*\)\.\([0-9][0-9]*\)/\3/g')
+
+    if [[ ( $dgc_major -ge 10 ) && ( $dgc_middle -ge 14 ) ]]; then
+      echo -e "node\033[92;1m SUCCESS \033[0m"
+    fi
+  else
+    echo >&2
+    echo -e "node\033[91;5;1m FAILED \033[0m"
+  fi
+
+fi
+
+if haveProg gdbserver; then
+  echo -e "gdbserver\033[92;1m SUCCESS \033[0m"
+else
+  echo >&2
+  echo -e "gdbserver\033[91;5;1m FAILED \033[0m"
+  echo "OpenFPM require gdb but it's not installed, searching a solution... "
+  solve_gdbserver $platform
+  if haveProg gdbserver; then
+    echo -e "gdbserver\033[92;1m SUCCESS \033[0m"
+  else
+    echo -e "gdbserver\033[91;5;1m FAILED \033[0m"
+    echo -e "Installing parallel debugger\033[91;5;1m FAILED \033[0m"
+    return
+  fi
+fi
+
+if haveProg pip3; then
+  echo -e "pip3\033[92;1m SUCCESS \033[0m"
+else
+  echo >&2
+  echo -e "pip3\033[91;5;1m FAILED \033[0m"
+  echo "OpenFPM require pip3 but it's not installed, searching a solution... "
+  solve_pip3 $platform
+  if haveProg pip3; then
+    echo -e "pip3\033[92;1m SUCCESS \033[0m"
+  else
+    echo -e "pip3\033[91;5;1m FAILED \033[0m"
+    echo -e "Installing parallel debugger\033[91;5;1m FAILED \033[0m"
+    return
+  fi
+fi
+
+if haveProg nox; then
+  echo -e "nox\033[92;1m SUCCESS \033[0m"
+else
+  echo >&2
+  echo -e "nox\033[91;5;1m FAILED \033[0m"
+  echo "OpenFPM require nox but it's not installed, searching a solution... "
+  solve_pip3_nox $platform
+  if haveProg nox; then
+    echo -e "nox\033[92;1m SUCCESS \033[0m"
+  else
+    echo -e "nox\033[91;5;1m FAILED \033[0m"
+    echo -e "Installing parallel debugger\033[91;5;1m FAILED \033[0m"
+    return
+  fi
+fi
+
+if haveProg npm; then
+  echo -e "npm\033[92;1m SUCCESS \033[0m"
+else
+  echo >&2
+  echo -e "npm\033[91;5;1m FAILED \033[0m"
+  echo "OpenFPM require nox but it's not installed, searching a solution... "
+  solve_npm $platform
+  if haveProg npm; then
+    echo -e "npm\033[92;1m SUCCESS \033[0m"
+  else
+    echo -e "npm\033[91;5;1m FAILED \033[0m"
+    echo -e "Installing parallel debugger\033[91;5;1m FAILED \033[0m"
+    return
+  fi
+fi
+
+
+if haveProg yarn; then
+  echo -e "yarn\033[92;1m SUCCESS \033[0m"
+else
+  echo >&2
+  echo -e "yarn\033[91;5;1m FAILED \033[0m"
+  echo "OpenFPM require nox but it's not installed, searching a solution... "
+  solve_npm_yarn $platform
+  if haveProg yarn; then
+    echo -e "yarn\033[92;1m SUCCESS \033[0m"
+  else
+    echo -e "yarn\033[91;5;1m FAILED \033[0m"
+    echo -e "Installing parallel debugger\033[91;5;1m FAILED \033[0m"
+    return
+  fi
+fi
+
+cd gdbgui
+cd gdbgui-mpi
+./compile.sh
+cd ..
+cd ..
+
+cd gdbgui
+nox -s build_executable_current_platform
+cd ..
+
+}
+
+
diff --git a/script/install_parallel_debugger b/script/install_parallel_debugger
new file mode 100755
index 0000000000000000000000000000000000000000..13c7409566d1816d86f106121631668f7c8d1b83
--- /dev/null
+++ b/script/install_parallel_debugger
@@ -0,0 +1,17 @@
+#! /bin/bash
+source script/discover_os
+discover_os
+
+if [ x"$platform" == x"$osx" ]; then
+	exit 0
+fi
+
+if [ -f gdbgui/executable/linux/gdbgui_0.14.0.2 ]; then
+	install_d=$(cat install_dir)
+	mkdir -p "$install_d/gdbgui/bin"
+	cp gdbgui/executable/linux/gdbgui_0.14.0.2 "$install_d/gdbgui/bin/gdbgui"
+	cp gdbgui/gdbgui-mpi/print_nodes "$install_d/gdbgui/bin"
+	cp gdbgui/gdbgui-mpi/launch_mpi_debugger "$install_d/gdbgui/bin"
+	cp gdbgui/gdbgui-mpi/launch_gdb_server "$install_d/gdbgui/bin"
+fi
+
diff --git a/script/pre_req b/script/pre_req
index 755cec9e36ba5c6916090811a6f0d7801beca53b..f5be531abb13a93122e0ae0e1fa9f7445203902b 100755
--- a/script/pre_req
+++ b/script/pre_req
@@ -220,6 +220,9 @@ if haveProg mpirun; then
 		if [ x"$possible_solutions_ret" == x"0" ]; then
 			get_openmpi_compilation_options
 			call_test_working_mpi_options=1
+                elif [ x"$possible_solutions_ret" == x"1" ]; then
+                        call_test_working_mpi_options=1
+
 		fi
 	else
 		echo "OpenMPI is CUDA aware"
diff --git a/script/remove_old b/script/remove_old
index 25e6e1ef114eaeeae0c311a7c4e37e90548156bf..ce0b9a91d8eea2654be6c6f9839960010eb95319 100755
--- a/script/remove_old
+++ b/script/remove_old
@@ -76,10 +76,10 @@ function remove_old()
     ## Check the installed version of the dependencies
 
     if [ -d $1/BOOST ]; then
-    	is_update=$(cat $1/BOOST/include/boost/version.hpp | grep "#define BOOST_VERSION 107200")
+    	is_update=$(cat $1/BOOST/include/boost/version.hpp | grep "#define BOOST_VERSION 107500")
     	if [ x"$is_update" == x"" ]; then
             echo -e "\033[1;34;5m  --------------------------------------------------------------------------- \033[0m"
-            echo -e "\033[1;34;5m  Boost has been updated to 1.72, the component will be updated automatically \033[0m"
+            echo -e "\033[1;34;5m  Boost has been updated to 1.75, the component will be updated automatically \033[0m"
             echo -e "\033[1;34;5m  --------------------------------------------------------------------------- \033[0m"
             sleep 5
             rm -rf $1/BOOST/include
@@ -143,9 +143,9 @@ function remove_old()
 
     if [ -d $1/PETSC ]; then
         version=$(cat $1/PETSC/version)
-        if [ x"$version" != x"4"  ]; then
+        if [ x"$version" != x"6"  ]; then
             echo -e "\033[1;34;5m  -------------------------------------------------------------------------------------- \033[0m"
-            echo -e "\033[1;34;5m  PETSC has been updated to version 3.13.3, the component will be updated automatically      \033[0m"
+            echo -e "\033[1;34;5m  PETSC has been updated to version 3.14.5, the component will be updated automatically      \033[0m"
             echo -e "\033[1;34;5m  -------------------------------------------------------------------------------------- \033[0m"
             sleep 5
             rm -rf $1/PETSC
@@ -158,9 +158,9 @@ function remove_old()
 
     if [ -d $1/HDF5 ]; then
         version=$(cat $1/HDF5/version)
-        if [ x"$version" != x"2"  ]; then
+        if [ x"$version" != x"3"  ]; then
             echo -e "\033[1;34;5m  -------------------------------------------------------------------------------------- \033[0m"
-            echo -e "\033[1;34;5m  HDF5 has been updated to version 1.10.6, the component will be updated automatically   \033[0m"
+            echo -e "\033[1;34;5m  HDF5 has been updated to version 1.10.7, the component will be updated automatically   \033[0m"
             echo -e "\033[1;34;5m  -------------------------------------------------------------------------------------- \033[0m"
             sleep 5
 	    rm -rf $1/HDF5
@@ -169,9 +169,9 @@ function remove_old()
 
     if [ -d $1/MPI ]; then
         version=$(cat $1/MPI/version)
-        if [ x"$version" != x"6"  ]; then
+        if [ x"$version" != x"10"  ]; then
             echo -e "\033[1;34;5m  -------------------------------------------------------------------------------------- \033[0m"
-            echo -e "\033[1;34;5m  MPI has been updated to version 4.0.4, the component will be updated automatically      \033[0m"
+            echo -e "\033[1;34;5m  MPI has been updated to version 4.1.1, the component will be updated automatically      \033[0m"
             echo -e "\033[1;34;5m  -------------------------------------------------------------------------------------- \033[0m"
             sleep 5
             rm -rf $1/MPI/include
@@ -192,7 +192,7 @@ function remove_old()
     fi
 
     if [ -d $1/EIGEN ]; then
-        version=$(cat $1/EIGEN/version)
+        version=$(cat $1/EIGEN/version_eigen_lib)
         if [ x"$version" != x"2"  ]; then
             echo -e "\033[1;34;5m  -------------------------------------------------------------------------------- \033[0m"
             echo -e "\033[1;34;5m  EIGEN has been updated to 3.3.5 , the component will be updated automatically    \033[0m"
@@ -207,7 +207,7 @@ function remove_old()
 
     if [ -d $1/OPENBLAS ]; then
         version=$(cat $1/OPENBLAS/version)
-        if [ x"$version" != x"2"  ]; then
+        if [ x"$version" != x"3"  ]; then
                 echo -e "\033[1;34;5m  ---------------------------------------------------------------------- \033[0m"
                 echo -e "\033[1;34;5m  OPENBLAS has been updated, the component will be updated automatically    \033[0m"
                 echo -e "\033[1;34;5m  ---------------------------------------------------------------------- \033[0m"
diff --git a/script/set_mpi b/script/set_mpi
index 024faf548033f4952dd569beaaf2a068214f4ea3..ef06cede4b23f3ebeba4546bff627ce039d175bf 100755
--- a/script/set_mpi
+++ b/script/set_mpi
@@ -4,9 +4,9 @@ function set_mpi()
 {
   if [ x"$MPI_valid" == x"yes" ]; then
 	if [ $is_mpi_openmpi -eq 1 ]; then
-		configure_options="$configure_options CXX=mpic++ --with-mpivendor=openmpi"
+		configure_options="$configure_options --with-mpivendor=openmpi"
 	else
-        	configure_options="$configure_options CXX=mpic++ "
+        	configure_options="$configure_options "
 	fi
   else
         if [ x"$platform" == x"cygwin" ]; then
@@ -19,7 +19,7 @@ function set_mpi()
         fi
         MPI_installed=1
         export PATH="$1/MPI/bin:$PATH"
-        configure_options="$configure_options CXX=mpic++ --with-mpivendor=openmpi"
+        configure_options="$configure_options --with-mpivendor=openmpi"
   fi
 
 }
@@ -43,52 +43,4 @@ function get_openmpi_compilation_options()
 	done
 }
 
-function test_working_mpi_options()
-{
-	script/download_MPI.sh
-	cd openmpi-3.1.3
-	openmpi_working_options=()
-        for opt in ${openmpi_compilation_options[@]}; do
-	       # prefix and --with-cuda must be avoided
-
-		if [[ $opt == --with-cuda* ]]; then
-			continue;
-		fi
-
-                if [[ $opt == --prefix* ]]; then
-                        continue;
-                fi
-
-		if [[ $opt == --enable-mpi-fortran* ]]; then
-			continue;
-		fi
-
-                if [[ $opt == CC* ]]; then
-                        continue;
-                fi
-
-                if [[ $opt == CXX* ]]; then
-                        continue;
-                fi
-
-                if [[ $opt == FC* ]]; then
-                        continue;
-                fi
-
-                if [[ $opt == F77* ]]; then
-                        continue;
-                fi
-
-		echo "Testing ./configure --prefix=$1/MPI --enable-mpi-fortran=yes CC=$4 CXX=$5 F77=$6 FC=$7 $openmpi_working_options"
-                ./configure --prefix=$1/MPI --enable-mpi-fortran=yes CC=$4 CXX=$5 F77=$6 FC=$7 $openmpi_working_options $opt
-
-	       if [ $? -eq 0 ]; then
-	           openmpi_working_options="$openmpi_working_options $opt"
-	       fi
-	done
-
-	echo "OpenMPI working options: $openmpi_working_options"
-
-	cd ..
-}
 
diff --git a/script/show_solutions b/script/show_solutions
index d398e6525bc918b1df4e751bbfe79d8ad9921169..bb2fd0ff178083bf37a96984eceb375e5408392b 100755
--- a/script/show_solutions
+++ b/script/show_solutions
@@ -2,10 +2,6 @@
 
 function possible_solutions {
 
- if [ $sq -eq 1 ]; then
-   return
- fi
-
  sol=1
  echo "Possible solutions:"
  for com in "$@"
@@ -18,7 +14,16 @@ function possible_solutions {
  sol=`expr $sol + 1`
  echo "$sol exit the script"
  echo -e "\033[1;34;5mChoose the solution:\033[0m"
- read choose
+ if [ $sq -eq 1 ]; then
+   if [ -f default_choice ]; then
+     choose=$(cat default_choice)
+   else
+     echo "No solution choosen, continue ... "
+     return
+   fi
+ else
+   read choose
+ fi
  if [ x"$choose" = x"$sol" ]; then
   exit 1
  fi
diff --git a/script/solve_gdbserver b/script/solve_gdbserver
new file mode 100755
index 0000000000000000000000000000000000000000..a92893f127b4d5dac842896619333271b05fca70
--- /dev/null
+++ b/script/solve_gdbserver
@@ -0,0 +1,33 @@
+#! /bin/bash 
+
+function solve_gdbserver() {
+source script/show_solutions
+source script/discover_package_manager
+discover_package_manager $1
+pcman=$discover_package_manager_ret
+
+if [ x"$pcman" = x"" ]; then
+        exit 1
+fi
+
+if [ x"$1" = x"osx" ]; then
+        commands[0]="$pcman install gdbserver"
+        possible_solutions "${commands[@]}"
+elif [ x"$1" = x"linux"  ]; then
+	if [ x"$pcman" = x"zypper -n" ]; then
+		package_name=gdbserver
+	elif [ x"$pcman" = x"pacman" ]; then
+		package_name=gdb
+	elif [ x"$pcman" = x"apt-get" ]; then
+		package_name=gdbsever
+	else
+		package_name=gdb-gdbserver
+	fi
+
+        pc_install_command "$pcman"
+        commands[0]="su -c \"$pc_install_command_ret $package_name\""
+        commands[1]="sudo $pc_install_command_ret $package_name"
+        possible_solutions "${commands[@]}"
+fi
+}
+
diff --git a/script/solve_llvm b/script/solve_llvm
new file mode 100755
index 0000000000000000000000000000000000000000..794daa6aeed22d4db5c15470edb11fffc4a90688
--- /dev/null
+++ b/script/solve_llvm
@@ -0,0 +1,25 @@
+#! /bin/bash
+
+function solve_llvm() {
+source script/show_solutions
+source script/discover_package_manager
+discover_package_manager $1
+pcman=$discover_package_manager_ret
+
+if [ x"$pcman" = x"" ]; then
+        exit 1
+fi
+
+
+if [ x"$pcman" == x"brew" ]; then
+        base_llvm_pkg="llvm"
+fi
+
+if [ x"$1" = x"osx"  ]; then
+        pc_install_command "$pcman"
+        commands[0]="su -c \"$pc_install_command_ret $base_python_pkg \""
+        commands[1]="sudo $pc_install_command_ret $base_python_pkg "
+        possible_solutions "${commands[@]}"
+fi
+}
+
diff --git a/script/solve_nodejs b/script/solve_nodejs
new file mode 100755
index 0000000000000000000000000000000000000000..1f58a2e96ee411ad7cd7f8055940489b741f935f
--- /dev/null
+++ b/script/solve_nodejs
@@ -0,0 +1,26 @@
+#! /bin/bash
+
+function solve_nodejs() {
+source script/show_solutions
+source script/discover_package_manager
+discover_package_manager $1
+pcman=$discover_package_manager_ret
+
+if [ x"$pcman" = x"" ]; then
+        exit 1
+fi
+
+if [ x"$pcman" == x"zypper -n" ]; then
+        base_pkg=nodejs10
+else
+        base_pkg=nodejs
+fi
+
+if [ x"$1" = x"linux"  ]; then
+        pc_install_command "$pcman"
+        commands[0]="su -c \"$pc_install_command_ret $base_pkg\""
+        commands[1]="sudo $pc_install_command_ret $base_pkg"
+        possible_solutions "${commands[@]}"
+fi
+}
+
diff --git a/script/solve_npm b/script/solve_npm
new file mode 100755
index 0000000000000000000000000000000000000000..80c466c6a5e7909b5d8ebf0b849a8ded81a041ff
--- /dev/null
+++ b/script/solve_npm
@@ -0,0 +1,24 @@
+
+#! /bin/bash 
+
+function solve_npm() {
+source script/show_solutions
+source script/discover_package_manager
+discover_package_manager $1
+pcman=$discover_package_manager_ret
+
+if [ x"$pcman" = x"" ]; then
+        exit 1
+fi
+
+if [ x"$1" = x"osx" ]; then 
+        commands[0]="$pcman install npm"
+        possible_solutions "${commands[@]}"
+elif [ x"$1" = x"linux"  ]; then
+	pc_install_command "$pcman"
+        commands[0]="su -c \"$pc_install_command_ret npm\""
+        commands[1]="sudo $pc_install_command_ret npm"
+        possible_solutions "${commands[@]}"
+fi 
+}
+
diff --git a/script/solve_npm_yarn b/script/solve_npm_yarn
new file mode 100755
index 0000000000000000000000000000000000000000..126d3423ac3d1fe88ca9e5b30290197141121a90
--- /dev/null
+++ b/script/solve_npm_yarn
@@ -0,0 +1,12 @@
+
+#! /bin/bash 
+
+function solve_npm_yarn() {
+source script/show_solutions
+
+pc_install_command "$pcman"
+commands[0]="su -c \"npm install yarn -g\""
+commands[1]="sudo npm install yarn -g"
+possible_solutions "${commands[@]}"
+}
+
diff --git a/script/solve_pip3 b/script/solve_pip3
new file mode 100755
index 0000000000000000000000000000000000000000..b3e67bcfdf7c0720f08a0784fa1f06a08f1dcf03
--- /dev/null
+++ b/script/solve_pip3
@@ -0,0 +1,29 @@
+
+#! /bin/bash 
+
+function solve_pip3() {
+source script/show_solutions
+source script/discover_package_manager
+discover_package_manager $1
+pcman=$discover_package_manager_ret
+
+if [ x"$pcman" = x"" ]; then
+        exit 1
+fi
+
+if [ x"$1" = x"osx" ]; then 
+        commands[0]="$pcman install python3-pip"
+        possible_solutions "${commands[@]}"
+elif [ x"$1" = x"linux"  ]; then
+	if [ x"$pcman" == x"pacman" ]; then
+		package_name=python-pip
+	else
+		package_name=python3-pip
+	fi
+	pc_install_command "$pcman"
+        commands[0]="su -c \"$pc_install_command_ret $package_name\""
+        commands[1]="sudo $pc_install_command_ret $package_name"
+        possible_solutions "${commands[@]}"
+fi 
+}
+
diff --git a/script/solve_pip3_nox b/script/solve_pip3_nox
new file mode 100755
index 0000000000000000000000000000000000000000..69a6faf88ec7b9190a6b4d2901beb9c4838bc653
--- /dev/null
+++ b/script/solve_pip3_nox
@@ -0,0 +1,12 @@
+
+#! /bin/bash 
+
+function solve_pip3_nox() {        
+source script/show_solutions
+
+
+commands[0]="su -c \"pip3 install nox\""
+commands[1]="sudo pip3 install nox"
+possible_solutions "${commands[@]}"
+}
+
diff --git a/script/solve_python b/script/solve_python
index 59afdc4c88a26360d3827938f18944a959a76ffd..1093519b73e8d3d3ff468ae3a948989b3aed836c 100755
--- a/script/solve_python
+++ b/script/solve_python
@@ -10,11 +10,12 @@ if [ x"$pcman" = x"" ]; then
         exit 1
 fi
 
+
 if [ x"$pcman" == x"apt-get" ]; then
-	apt-cache show python-is-python3 | grep -q "python-is-python3"
-	if [ $? -eq 0 ]; then
-		additional_python_pkg="python-is-python3"
-	fi
+        apt-cache show python-is-python3 | grep -q "python-is-python3"
+        if [ $? -eq 0 ]; then
+                additional_python_pkg="python-is-python3"
+        fi
         apt-cache show python | grep -q "python"
         if [ $? -eq 0 ]; then
                 base_python_pkg="python"
@@ -23,10 +24,15 @@ if [ x"$pcman" == x"apt-get" ]; then
         if [ $? -eq 0 ]; then
                 base_python_pkg="python2 $base_python_pkg"
         fi
+elif [ x"$pcman" == x"yum" ]; then
+        base_python_pkg="python"
+elif [ x"$pcman" == x"zypper -n" ]; then
+        base_python_pkg=python2
+        additional_python_pkg=python3
 fi
 
 if [ x"$1" = x"linux"  ]; then
-	pc_install_command "$pcman"
+        pc_install_command "$pcman"
         commands[0]="su -c \"$pc_install_command_ret $base_python_pkg $additional_python_pkg\""
         commands[1]="sudo $pc_install_command_ret $base_python_pkg $additional_python_pkg"
         possible_solutions "${commands[@]}"
diff --git a/script/solve_python_devel b/script/solve_python_devel
new file mode 100755
index 0000000000000000000000000000000000000000..29e9c7ab702902103fbaddd5e2edee262fe7329d
--- /dev/null
+++ b/script/solve_python_devel
@@ -0,0 +1,31 @@
+#! /bin/bash
+
+function solve_python_devel() {
+source script/show_solutions
+source script/discover_package_manager
+discover_package_manager $1
+pcman=$discover_package_manager_ret
+
+if [ x"$pcman" = x"" ]; then
+        exit 1
+fi
+
+if [ x"$pcman" == x"zypper -n" ]; then
+        if [ ! -f /usr/include/python3.6m/pyconfig.h ]; then
+                base_python_pkg=python2-devel
+                additional_python_pkg=python3-devel
+        else
+                return
+        fi
+else
+	return
+fi
+
+if [ x"$1" = x"linux"  ]; then
+        pc_install_command "$pcman"
+        commands[0]="su -c \"$pc_install_command_ret $base_python_pkg $additional_python_pkg\""
+        commands[1]="sudo $pc_install_command_ret $base_python_pkg $additional_python_pkg"
+        possible_solutions "${commands[@]}"
+fi
+}
+
diff --git a/src/Amr/grid_dist_amr.hpp b/src/Amr/grid_dist_amr.hpp
index 49e9173039a4b0d3a7316ad13aa0f42c5b9ce409..7c171bdc415092949eedc40c26fcd199d1bb1e17 100644
--- a/src/Amr/grid_dist_amr.hpp
+++ b/src/Amr/grid_dist_amr.hpp
@@ -145,7 +145,6 @@ class grid_dist_amr<dim,St,T,AMR_IMPL_TRIVIAL,Decomposition,Memory,device_grid>
 	//
 	openfpm::vector<grid_dist_id<dim,St,T,Decomposition,Memory,device_grid>,
 								 HeapMemory,
-								 typename memory_traits_lin<grid_dist_id<dim,St,T,Decomposition,Memory,device_grid>>::type,
 								 memory_traits_lin,
 								 openfpm::grow_policy_identity,STD_VECTOR> gd_array;
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 7b72bc4d25a17149f8eb6184b2d84e3569874dce..3148669a74322677e37a8342ffde69d6b35562d7 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -4,45 +4,107 @@ add_definitions(-DSCAN_WITH_CUB)
 
 ########################### Executables
 
-if(CUDA_FOUND)
+
+if(CUDA_FOUND OR CUDA_ON_CPU OR HIP_FOUND)
 	set(CUDA_SOURCES 
-				Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
-				Vector/cuda/vector_dist_gpu_MP_tests.cu 
-					 Vector/cuda/vector_dist_cuda_func_test.cu
-					 Decomposition/cuda/decomposition_cuda_tests.cu
-					 Vector/cuda/vector_dist_gpu_unit_tests.cu
-					 ../openfpm_devices/src/memory/CudaMemory.cu
-					 Decomposition/cuda/Domain_icells_cart_unit_test.cu
-					 Amr/tests/amr_base_gpu_unit_tests.cu)
+	    Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
+	    Vector/cuda/vector_dist_gpu_MP_tests.cu
+	    Vector/cuda/vector_dist_cuda_func_test.cu
+	    Decomposition/cuda/decomposition_cuda_tests.cu
+	    Vector/cuda/vector_dist_gpu_unit_tests.cu
+	    Decomposition/cuda/Domain_icells_cart_unit_test.cu
+	    Amr/tests/amr_base_gpu_unit_tests.cu)
+endif()
+
+if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
+        add_definitions("-DBOOST_MPL_CFG_HAS_TYPEOF")
+endif()
+
+if (CUDA_ON_CPU)
+        add_definitions(-DCUDA_ON_CPU)
+        set_source_files_properties(${CUDA_SOURCES} PROPERTIES LANGUAGE CXX)
+	set_source_files_properties(${CUDA_SOURCES} PROPERTIES COMPILE_FLAGS "-D__NVCC__ -DCUDART_VERSION=11000")
+        if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
+                add_definitions("-x c++")
+        endif()
+endif()
+
+if ( HIP_ENABLE AND HIP_FOUND )
+
+        list(APPEND HIP_HIPCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
+
+        if (CMAKE_BUILD_TYPE STREQUAL "Debug")
+                list(APPEND HIP_HIPCC_FLAGS -O0)
+        endif()
+
+	list(APPEND HIP_HIPCC_FLAGS -D__NVCC__ -D__HIP__  -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0)
+        set_source_files_properties(${CUDA_SOURCES} PROPERTIES LANGUAGE CXX)
+
+        set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+
+        hip_add_executable(pdata ${CUDA_SOURCES} ${OPENFPM_INIT_FILE} main.cpp
+							  Amr/grid_dist_amr_unit_tests.cpp
+                                                          Amr/tests/amr_base_unit_tests.cpp
+							  Debug/debug_test.cpp
+							  Grid/tests/grid_dist_id_HDF5_chckpnt_restart_test.cpp
+							  Grid/tests/grid_dist_id_unit_test.cpp
+							  Grid/tests/sgrid_dist_id_unit_tests.cpp
+							  Grid/tests/grid_dist_id_dlb_unit_test.cpp
+							  Grid/tests/staggered_grid_dist_unit_test.cpp
+							  Vector/tests/vector_dist_cell_list_tests.cpp
+							  Vector/tests/vector_dist_complex_prp_unit_test.cpp
+							  Vector/tests/vector_dist_HDF5_chckpnt_restart_test.cpp
+							  Vector/tests/vector_dist_MP_unit_tests.cpp
+							  Vector/tests/vector_dist_NN_tests.cpp
+							  Vector/tests/vector_dist_unit_test.cpp
+							  pdata_performance.cpp
+							  Decomposition/tests/CartDecomposition_unit_test.cpp
+							  Decomposition/tests/shift_vect_converter_tests.cpp
+							  Vector/performance/vector_dist_performance_util.cpp
+							  lib/pdata.cpp
+							  test_multiple_o.cpp
+							  )
+
+
+	hip_add_library(ofpm_pdata STATIC lib/pdata.cpp)
+
 else()
-	set(CUDA_SOURCES Vector/vector_dist_subset.hpp)
+
+	add_executable(pdata ${OPENFPM_INIT_FILE} ${CUDA_SOURCES} main.cpp
+							  Amr/grid_dist_amr_unit_tests.cpp
+							  Amr/tests/amr_base_unit_tests.cpp
+						  	  Debug/debug_test.cpp
+							  Grid/tests/grid_dist_id_HDF5_chckpnt_restart_test.cpp
+							  Grid/tests/grid_dist_id_unit_test.cpp
+							  Grid/tests/sgrid_dist_id_unit_tests.cpp
+							  Grid/tests/grid_dist_id_dlb_unit_test.cpp
+							  Grid/tests/staggered_grid_dist_unit_test.cpp
+							  Vector/tests/vector_dist_cell_list_tests.cpp
+							  Vector/tests/vector_dist_complex_prp_unit_test.cpp
+							  Vector/tests/vector_dist_HDF5_chckpnt_restart_test.cpp
+							  Vector/tests/vector_dist_MP_unit_tests.cpp
+							  Vector/tests/vector_dist_NN_tests.cpp
+							  Vector/tests/vector_dist_unit_test.cpp
+							  pdata_performance.cpp
+							  Decomposition/tests/CartDecomposition_unit_test.cpp
+							  Decomposition/tests/shift_vect_converter_tests.cpp
+							  Vector/performance/vector_dist_performance_util.cpp
+							  lib/pdata.cpp test_multiple_o.cpp)
+
+	add_library(ofpm_pdata STATIC lib/pdata.cpp)
+
 endif()
 
-add_executable(pdata ${OPENFPM_INIT_FILE} ${CUDA_SOURCES} main.cpp
-							   							  Debug/debug_test.cpp
-														  Amr/grid_dist_amr_unit_tests.cpp
-														  Amr/tests/amr_base_unit_tests.cpp
-														  Debug/debug_test.cpp
-														  Grid/tests/grid_dist_id_HDF5_chckpnt_restart_test.cpp
-														  Grid/tests/grid_dist_id_unit_test.cpp
-														  Grid/tests/sgrid_dist_id_unit_tests.cpp
-														  Grid/tests/grid_dist_id_dlb_unit_test.cpp
-														  Grid/tests/staggered_grid_dist_unit_test.cpp
-														  Vector/tests/vector_dist_cell_list_tests.cpp
-														  Vector/tests/vector_dist_complex_prp_unit_test.cpp
-														  Vector/tests/vector_dist_HDF5_chckpnt_restart_test.cpp 
-														  Vector/tests/vector_dist_MP_unit_tests.cpp 
-														  Vector/tests/vector_dist_NN_tests.cpp 
-														  Vector/tests/vector_dist_unit_test.cpp  
-														  pdata_performance.cpp 
-														  Decomposition/tests/CartDecomposition_unit_test.cpp 
-														  Decomposition/tests/shift_vect_converter_tests.cpp 
-														  Vector/performance/vector_dist_performance_util.cpp  
-														  lib/pdata.cpp test_multiple_o.cpp  
-														  ../openfpm_devices/src/memory/HeapMemory.cpp 
-														  ../openfpm_devices/src/memory/PtrMemory.cpp 
-														  ../openfpm_vcluster/src/VCluster/VCluster.cpp 
-														  )
+add_dependencies(pdata ofpmmemory)
+add_dependencies(pdata vcluster)
+
+#add_executable(isolation_pdata ${OPENFPM_INIT_FILE} isolation.cu
+#														  lib/pdata.cpp
+#														  ../openfpm_devices/src/memory/HeapMemory.cpp
+#														  ../openfpm_devices/src/memory/CudaMemory.cu
+#														  ../openfpm_devices/src/memory/PtrMemory.cpp
+#														  ../openfpm_vcluster/src/VCluster/VCluster.cpp
+#														  )
 
 if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
 	target_compile_options(pdata PRIVATE "-Wno-undefined-var-template")
@@ -59,13 +121,17 @@ if ( CMAKE_COMPILER_IS_GNUCC )
     endif()
 endif()
 
+if (CMAKE_CUDA_COMPILER_ID STREQUAL "Clang")
+        add_definitions(-D__STRICT_ANSI__)
+endif()
+
 if (ENABLE_ASAN)
     target_compile_options(pdata PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: -Xcompiler "-fsanitize=address -fno-optimize-sibling-calls -fsanitize-address-use-after-scope -fno-omit-frame-pointer -g" >)
     target_compile_options(pdata PRIVATE $<$<COMPILE_LANGUAGE:CXX>: -fsanitize=address -fno-optimize-sibling-calls -fsanitize-address-use-after-scope -fno-omit-frame-pointer -g >)
     set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=address -fno-optimize-sibling-calls -fsanitize-address-use-after-scope -fno-omit-frame-pointer")
+	add_definitions(-DENABLE_ASAN)
 endif()
 
-add_library(ofpm_pdata STATIC lib/pdata.cpp)
 
 add_test(NAME pdata_3_proc COMMAND mpirun -np 3 ./pdata)
 add_test(NAME pdata_4_proc COMMAND mpirun -np 4 ./pdata)
@@ -73,14 +139,10 @@ add_test(NAME pdata_4_proc COMMAND mpirun -np 4 ./pdata)
 ###########################
 
 if (CUDA_FOUND)
-	target_compile_options(pdata PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} >)
 	target_include_directories (pdata PUBLIC ${MPI_C_INCLUDE_DIRS})
         if (TEST_COVERAGE)
 		target_compile_options(pdata PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: -Xcompiler "-fprofile-arcs -ftest-coverage" >)
         endif()
-	if (CMAKE_BUILD_TYPE STREQUAL "Debug")
-		#		target_compile_options(pdata PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: -G>)
-	endif()
 endif()
 
 if(TEST_PERFORMANCE)
@@ -88,22 +150,36 @@ if(TEST_PERFORMANCE)
 endif()
 target_include_directories (pdata PUBLIC ${PARMETIS_ROOT}/include)
 target_include_directories (pdata PUBLIC ${METIS_ROOT}/include)
+target_include_directories (pdata PUBLIC ${HDF5_ROOT}/include)
 target_include_directories (pdata PUBLIC ${CUDA_INCLUDE_DIRS})
-target_include_directories (pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-target_include_directories (pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_devices/src/)
-target_include_directories (pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_vcluster/src/)
-target_include_directories (pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_data/src/)
-target_include_directories (pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_io/src/)
-target_include_directories (pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/config)
+target_include_directories (pdata PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
+target_include_directories (pdata PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_devices/src/)
+target_include_directories (pdata PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_vcluster/src/)
+target_include_directories (pdata PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_data/src/)
+target_include_directories (pdata PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_io/src/)
+target_include_directories (pdata PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/config)
 target_include_directories (pdata PUBLIC ${PETSC_INCLUDES})
-target_include_directories (pdata PUBLIC ${HDF5_ROOT}/include)
 target_include_directories (pdata PUBLIC ${LIBHILBERT_INCLUDE_DIRS})
-if(EIGEN3_FOUND)
-	target_include_directories (pdata PUBLIC ${EIGEN3_INCLUDE_DIR})
-endif()
-
-target_include_directories(pdata PUBLIC ${Vc_INCLUDE_DIR})
+target_include_directories (pdata PUBLIC ${ALPAKA_ROOT}/include)
+target_include_directories (pdata PUBLIC ${Vc_INCLUDE_DIR})
 target_include_directories (pdata PUBLIC ${Boost_INCLUDE_DIRS})
+target_include_directories (pdata PUBLIC ${MPI_C_INCLUDE_DIRS})
+
+#target_include_directories (isolation_pdata PUBLIC ${PARMETIS_ROOT}/include)
+#target_include_directories (isolation_pdata PUBLIC ${METIS_ROOT}/include)
+#target_include_directories (isolation_pdata PUBLIC ${CUDA_INCLUDE_DIRS})
+#target_include_directories (isolation_pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+#target_include_directories (isolation_pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_devices/src/)
+#target_include_directories (isolation_pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_vcluster/src/)
+#target_include_directories (isolation_pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_data/src/)
+#target_include_directories (isolation_pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_io/src/)
+#target_include_directories (isolation_pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/config)
+#target_include_directories (isolation_pdata PUBLIC ${PETSC_INCLUDES})
+#target_include_directories (isolation_pdata PUBLIC ${HDF5_ROOT}/include)
+#target_include_directories (isolation_pdata PUBLIC ${LIBHILBERT_INCLUDE_DIRS})
+#target_include_directories (isolation_pdata PUBLIC ${Vc_INCLUDE_DIR})
+#target_include_directories (isolation_pdata PUBLIC ${Boost_INCLUDE_DIRS})
+
 
 target_link_libraries(pdata ${Boost_LIBRARIES})
 target_link_libraries(pdata ${PARMETIS_LIBRARIES})
@@ -112,6 +188,19 @@ target_link_libraries(pdata ${HDF5_LIBRARIES})
 target_link_libraries(pdata -L${LIBHILBERT_LIBRARY_DIRS} ${LIBHILBERT_LIBRARIES})
 target_link_libraries(pdata ${PETSC_LIBRARIES})
 target_link_libraries(pdata ${Vc_LIBRARIES})
+target_link_libraries(pdata ${alpaka_LIBRARIES})
+target_link_libraries(pdata ${MPI_C_LIBRARIES})
+target_link_libraries(pdata ${MPI_CXX_LIBRARIES})
+target_link_libraries(pdata vcluster)
+target_link_libraries(pdata ofpmmemory)
+
+#target_link_libraries(isolation_pdata ${Boost_LIBRARIES})
+#target_link_libraries(isolation_pdata ${PARMETIS_LIBRARIES})
+#target_link_libraries(isolation_pdata -L${METIS_ROOT}/lib metis)
+#target_link_libraries(isolation_pdata ${HDF5_LIBRARIES})
+#target_link_libraries(isolation_pdata -L${LIBHILBERT_LIBRARY_DIRS} ${LIBHILBERT_LIBRARIES})
+#target_link_libraries(isolation_pdata ${PETSC_LIBRARIES})
+#target_link_libraries(isolation_pdata ${Vc_LIBRARIES})
 
 if (TEST_PERFORMANCE)
 	target_link_libraries(pdata  ${Boost_FILESYSTEM_LIBRARY})
@@ -125,19 +214,13 @@ if (TEST_COVERAGE)
     target_link_libraries(pdata -lgcov --coverage)
 endif()
 
-# Debug!
-# Hack found at https://github.com/LLNL/scr/issues/130#issuecomment-402815952
-IF(MPI_CXX_FOUND)
-	INCLUDE_DIRECTORIES(${MPI_CXX_INCLUDE_PATH})
-#	LIST(APPEND SCR_EXTERNAL_LIBS ${MPI_CXX_LIBRARIES})
-	target_link_libraries(pdata ${MPI_CXX_LIBRARIES})
-ENDIF(MPI_CXX_FOUND)
+
 
 target_include_directories (ofpm_pdata PUBLIC ${CUDA_INCLUDE_DIRS})
-target_include_directories (ofpm_pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-target_include_directories (ofpm_pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_data/src/)
-target_include_directories (ofpm_pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/config)
-target_include_directories (ofpm_pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_devices/src/)
+target_include_directories (ofpm_pdata PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
+target_include_directories (ofpm_pdata PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_data/src/)
+target_include_directories (ofpm_pdata PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/config)
+target_include_directories (ofpm_pdata PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_devices/src/)
 target_include_directories (ofpm_pdata PUBLIC ${Boost_INCLUDE_DIRS})
 
 target_compile_definitions(pdata PRIVATE ${MPI_VENDOR})
@@ -151,7 +234,7 @@ endif()
 # Request that particles be built with -std=c++11
 # As this is a public compile feature anything that links to particles
 # will also build with -std=c++11
-target_compile_features(pdata PUBLIC cxx_std_11)
+target_compile_features(pdata PUBLIC cxx_std_14)
 target_link_libraries(pdata ${MPI_C_LIBRARIES})
 target_link_libraries(pdata m)
 target_link_libraries(pdata c)
@@ -159,6 +242,7 @@ if (NOT APPLE)
     target_link_libraries(pdata rt)
 endif ()
 
+
 install(FILES Decomposition/CartDecomposition.hpp
        	      Decomposition/Domain_icells_cart.hpp	
 	      Decomposition/shift_vect_converter.hpp 
@@ -170,7 +254,8 @@ install(FILES Decomposition/CartDecomposition.hpp
 	      Decomposition/nn_processor.hpp Decomposition/ie_loc_ghost.hpp 
 	      Decomposition/ORB.hpp
 	      Decomposition/dec_optimizer.hpp
-	      DESTINATION openfpm_pdata/include/Decomposition/ )
+	      DESTINATION openfpm_pdata/include/Decomposition/
+	      COMPONENT OpenFPM)
 
 install(FILES Decomposition/Distribution/metis_util.hpp 
 	      Decomposition/Distribution/SpaceDistribution.hpp 
@@ -178,13 +263,15 @@ install(FILES Decomposition/Distribution/metis_util.hpp
 	      Decomposition/Distribution/parmetis_util.hpp 
 	      Decomposition/Distribution/MetisDistribution.hpp 
 	      Decomposition/Distribution/ParMetisDistribution.hpp 
-	      Decomposition/Distribution/DistParMetisDistribution.hpp  
-	      DESTINATION openfpm_pdata/include/Decomposition/Distribution )
+	      Decomposition/Distribution/DistParMetisDistribution.hpp
+	      Decomposition/Distribution/BoxDistribution.hpp
+	      DESTINATION openfpm_pdata/include/Decomposition/Distribution
+	      COMPONENT OpenFPM)
 
 install(FILES Decomposition/cuda/ie_ghost_gpu.cuh
 	      Decomposition/cuda/CartDecomposition_gpu.cuh
-	      DESTINATION openfpm_pdata/include/Decomposition/cuda )
-
+	      DESTINATION openfpm_pdata/include/Decomposition/cuda
+	      COMPONENT OpenFPM)
 
 install(FILES Grid/grid_dist_id.hpp 
 	      Grid/grid_dist_id_comm.hpp
@@ -193,23 +280,27 @@ install(FILES Grid/grid_dist_id.hpp
 	      Grid/staggered_dist_grid.hpp 
 	      Grid/staggered_dist_grid_util.hpp 
 	      Grid/staggered_dist_grid_copy.hpp
-	      DESTINATION openfpm_pdata/include/Grid/ )
+	      DESTINATION openfpm_pdata/include/Grid/
+	      COMPONENT OpenFPM)
 
 install(FILES Grid/cuda/grid_dist_id_kernels.cuh
 	      Grid/cuda/grid_dist_id_iterator_gpu.cuh
-	DESTINATION openfpm_pdata/include/Grid/cuda/ )
+	DESTINATION openfpm_pdata/include/Grid/cuda/
+	COMPONENT OpenFPM)
 
 install(FILES Amr/grid_dist_amr_key_iterator.hpp 
 	      Amr/grid_dist_amr_key.hpp
 	      Amr/grid_dist_amr.hpp
-	      DESTINATION openfpm_pdata/include/Amr/ )
+	      DESTINATION openfpm_pdata/include/Amr/
+	      COMPONENT OpenFPM)
 
 install(FILES Grid/Iterators/grid_dist_id_iterator_util.hpp
               Grid/Iterators/grid_dist_id_iterator_dec.hpp
               Grid/Iterators/grid_dist_id_iterator_dec_skin.hpp
               Grid/Iterators/grid_dist_id_iterator_sub.hpp
 	      Grid/Iterators/grid_dist_id_iterator.hpp
-	      DESTINATION openfpm_pdata/include/Grid/Iterators )
+	      DESTINATION openfpm_pdata/include/Grid/Iterators
+	      COMPONENT OpenFPM)
 
 
 install(FILES Vector/se_class3_vector.hpp  
@@ -219,43 +310,134 @@ install(FILES Vector/se_class3_vector.hpp
 	      Vector/vector_dist_key.hpp
 	      Vector/vector_dist_kernel.hpp
 		  Vector/vector_dist_subset.hpp
-		DESTINATION openfpm_pdata/include/Vector )
+		DESTINATION openfpm_pdata/include/Vector
+		COMPONENT OpenFPM)
 
 install(FILES util/common_pdata.hpp
-	      DESTINATION openfpm_pdata/include/util)
+	      DESTINATION openfpm_pdata/include/util
+	      COMPONENT OpenFPM)
 
 install(FILES Vector/Iterators/vector_dist_iterator.hpp
-	      DESTINATION openfpm_pdata/include/Vector/Iterators/ )
+	      DESTINATION openfpm_pdata/include/Vector/Iterators/
+	      COMPONENT OpenFPM)
 
 install(FILES Vector/util/vector_dist_funcs.hpp
-	      DESTINATION openfpm_pdata/include/Vector/util )
+	      DESTINATION openfpm_pdata/include/Vector/util
+	      COMPONENT OpenFPM)
 
 install(FILES Vector/cuda/vector_dist_comm_util_funcs.cuh
 	      Vector/cuda/vector_dist_cuda_funcs.cuh
 	      Vector/cuda/vector_dist_operators_list_ker.hpp
-	DESTINATION openfpm_pdata/include/Vector/cuda )
+	DESTINATION openfpm_pdata/include/Vector/cuda
+	COMPONENT OpenFPM)
 
 install(FILES Graph/ids.hpp Graph/dist_map_graph.hpp 
 	      Graph/DistGraphFactory.hpp
-              DESTINATION openfpm_pdata/include/Graph )
+              DESTINATION openfpm_pdata/include/Graph
+	      COMPONENT OpenFPM)
 
 install(FILES example.mk
 	      SubdomainGraphNodes.hpp
               DESTINATION openfpm_pdata/include/ )
 
 install(FILES DLB/DLB.hpp DLB/LB_Model.hpp
-	DESTINATION openfpm_pdata/include/DLB )
+	DESTINATION openfpm_pdata/include/DLB
+	COMPONENT OpenFPM)
 
 install(FILES config/config.h
-        DESTINATION openfpm_pdata/include/config )
+        DESTINATION openfpm_pdata/include/config
+	COMPONENT OpenFPM)
 
 install(FILES lib/pdata.hpp
-        DESTINATION openfpm_pdata/include/lib )
+        DESTINATION openfpm_pdata/include/lib
+	COMPONENT OpenFPM)
 
 install(FILES Debug/debug.hpp
-	DESTINATION openfpm_pdata/include/Debug )
+	DESTINATION openfpm_pdata/include/Debug
+	COMPONENT OpenFPM)
+
+install(TARGETS ofpm_pdata EXPORT ofpm_pdata_config  DESTINATION openfpm_pdata/lib COMPONENT OpenFPM)
+
+########## Create openfpmConfig.cmake + openfpmConfigVersion.cmake
+
+add_library(binary_config INTERFACE)
+
+target_include_directories(
+  binary_config
+  INTERFACE
+  ${CMAKE_INSTALL_PREFIX}/openfpm_pdata/include
+  ${CMAKE_INSTALL_PREFIX}/openfpm_data/include
+  ${CMAKE_INSTALL_PREFIX}/openfpm_pdata/include/config
+  ${CMAKE_INSTALL_PREFIX}/openfpm_io/include
+  ${CMAKE_INSTALL_PREFIX}/openfpm_vcluster/include
+  ${CMAKE_INSTALL_PREFIX}/openfpm_devices/include
+  ${CMAKE_INSTALL_PREFIX}/openfpm_numerics/include
+  ${PARMETIS_ROOT}/include
+  ${METIS_ROOT}/include
+  ${CUDA_INCLUDE_DIRS}
+  ${PETSC_INCLUDES}
+  ${HDF5_ROOT}/include
+  ${LIBHILBERT_INCLUDE_DIRS}
+  ${Vc_INCLUDE_DIR}
+  ${Boost_INCLUDE_DIRS}
+  )
+
+if (CUDA_ON_CPU)
+	target_compile_definitions(binary_config INTERFACE CUDA_ON_CPU)
+endif()
 
-install(TARGETS ofpm_pdata DESTINATION openfpm_pdata/lib)
+target_compile_options(binary_config INTERFACE $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} >)
+
+target_link_libraries(binary_config INTERFACE ${Boost_LIBRARIES})
+target_link_libraries(binary_config INTERFACE ${PARMETIS_LIBRARIES})
+target_link_libraries(binary_config INTERFACE -L${METIS_ROOT}/lib metis)
+target_link_libraries(binary_config INTERFACE ${HDF5_LIBRARIES})
+target_link_libraries(binary_config INTERFACE -L${LIBHILBERT_LIBRARY_DIRS} ${LIBHILBERT_LIBRARIES})
+target_link_libraries(binary_config INTERFACE ${PETSC_LIBRARIES})
+target_link_libraries(binary_config INTERFACE ${Vc_LIBRARIES})
+target_link_libraries(binary_config INTERFACE ${alpaka_LIBRARIES})
+target_link_libraries(binary_config INTERFACE ${MPI_C_LIBRARIES})
+
+# Not OK before CMake 3.13
+#target_link_libraries(binary_config INTERFACE $<INSTALL_PREFIX>/openfpm_vcluster/lib/$<TARGET_FILE_NAME:openfpm::vcluster> )
+#target_link_libraries(binary_config INTERFACE $<INSTALL_PREFIX>/openfpm_devices/lib/$<TARGET_FILE_NAME:openfpm::ofpmmemory> )
+target_link_libraries(binary_config INTERFACE ${CMAKE_INSTALL_PREFIX}/openfpm_vcluster/lib/libvcluster.a )
+target_link_libraries(binary_config INTERFACE ${CMAKE_INSTALL_PREFIX}/openfpm_devices/lib/libofpmmemory.a )
+target_link_libraries(binary_config INTERFACE ${CMAKE_INSTALL_PREFIX}/openfpm_pdata/lib/libofpm_pdata.a )
+target_link_libraries(binary_config INTERFACE ${CUDA_LIBRARIES} )
+
+# Does not work before Cmake 3.13
+#install(TARGETS binary_config vcluster ofpmmemory EXPORT openfpm_config  CONFIGURATIONS)
+install(TARGETS binary_config EXPORT openfpm_config  CONFIGURATIONS)
+
+include(CMakePackageConfigHelpers)
+write_basic_package_version_file(
+  "${CMAKE_CURRENT_BINARY_DIR}/openfpm_cmake/openfpmConfigVersion.cmake"
+  VERSION ${openfpm_VERSION}
+  COMPATIBILITY AnyNewerVersion
+)
+
+
+install(EXPORT openfpm_config
+        DESTINATION cmake/
+        NAMESPACE openfpm::
+        FILE openfpmConfig.cmake)
+
+install(
+  FILES
+    "${CMAKE_CURRENT_BINARY_DIR}/openfpm_cmake/openfpmConfigVersion.cmake"
+  DESTINATION
+    cmake/
+)
+
+install(
+  FILES
+    "${CMAKE_CURRENT_SOURCE_DIR}/cmake/openfpmConfigVars-configure.cmake"
+  DESTINATION
+    cmake/
+)
+
+#####################################################################
 
 #if(BUILD_TESTING)
 
diff --git a/src/DLB/DLB.hpp b/src/DLB/DLB.hpp
index fa04bc538957d865c13ed23ce73f66878917d193..cb72e0c203b5185d6a13985497284228af983b2d 100644
--- a/src/DLB/DLB.hpp
+++ b/src/DLB/DLB.hpp
@@ -9,7 +9,7 @@
 #define SRC_DECOMPOSITION_DLB_HPP_
 
 //! Time structure for statistical purposes
-typedef struct
+struct Times
 {
 	//! starting time of the simulation (0)
 	size_t simulationStartTime = 0;
@@ -27,7 +27,7 @@ typedef struct
 
 	//! End time
 	size_t iterationEndTime;
-} Times;
+};
 
 /*! Class that implements the two heuristics to determine when a re-balance of the distribution is needed.
  *
diff --git a/src/Decomposition/CartDecomposition.hpp b/src/Decomposition/CartDecomposition.hpp
index c43d2a416bfce37f31d624f08443fdec2e59fcc5..7c52034ff6e7b6af27fa65cd5be945b8938d8e04 100755
--- a/src/Decomposition/CartDecomposition.hpp
+++ b/src/Decomposition/CartDecomposition.hpp
@@ -43,6 +43,12 @@
 
 #define CARTDEC_ERROR 2000lu
 
+enum dec_options
+{
+	DEC_NONE = 0,
+	DEC_SKIP_ICELL = 1
+};
+
 /*! \brief It spread the sub-sub-domain on a regular cartesian grid of size dim
  *
  * \warning this function only guarantee that the division on each direction is
@@ -163,16 +169,15 @@ protected:
 	//! acc_key is size_t
 	typedef typename openfpm::vector<SpaceBox<dim, T>,
 			Memory,
-			typename memory_traits_lin<SpaceBox<dim, T>>::type,
 			memory_traits_lin,
 			openfpm::vector_grow_policy_default,
 			openfpm::vect_isel<SpaceBox<dim, T>>::value>::access_key acc_key;
 
 	//! the set of all local sub-domain as vector
-	openfpm::vector<SpaceBox<dim, T>,Memory,typename layout_base<SpaceBox<dim, T>>::type,layout_base> sub_domains;
+	openfpm::vector<SpaceBox<dim, T>,Memory,layout_base> sub_domains;
 
 	//! the remote set of all sub-domains as vector of 'sub_domains' vectors
-	mutable openfpm::vector<Box_map<dim, T>,Memory,typename layout_base<Box_map<dim, T>>::type,layout_base> sub_domains_global;
+	mutable openfpm::vector<Box_map<dim, T>,Memory,layout_base> sub_domains_global;
 
 	//! for each sub-domain, contain the list of the neighborhood processors
 	openfpm::vector<openfpm::vector<long unsigned int> > box_nn_processor;
@@ -277,14 +282,14 @@ protected:
 		return sub_d;
 	}
 
-	void collect_all_sub_domains(openfpm::vector<Box_map<dim,T>,Memory,typename layout_base<Box_map<dim, T>>::type,layout_base> & sub_domains_global)
+	void collect_all_sub_domains(openfpm::vector<Box_map<dim,T>,Memory,layout_base> & sub_domains_global)
 	{
 #ifdef SE_CLASS2
 		check_valid(this,8);
 #endif
 
 		sub_domains_global.clear();
-		openfpm::vector<Box_map<dim,T>,Memory,typename layout_base<Box_map<dim, T>>::type,layout_base> bm;
+		openfpm::vector<Box_map<dim,T>,Memory,layout_base> bm;
 
 		for (size_t i = 0 ; i < sub_domains.size() ; i++)
 		{
@@ -1345,7 +1350,7 @@ public:
 	/*! \brief Start decomposition
 	 *
 	 */
-	void decompose()
+	void decompose(dec_options opt = dec_options::DEC_NONE)
 	{
 		reset();
 
@@ -1361,13 +1366,17 @@ public:
 		domain_nn_calculator_cart<dim>::reset();
 		domain_nn_calculator_cart<dim>::setParameters(proc_box);
 
-		domain_icell_calculator<dim,T,layout_base,Memory>
-		::CalculateInternalCells(v_cl,
+		if (opt != dec_options::DEC_SKIP_ICELL)
+		{
+
+			domain_icell_calculator<dim,T,layout_base,Memory>
+			::CalculateInternalCells(v_cl,
 								 ie_ghost<dim, T,Memory,layout_base>::private_get_vb_int_box(),
 								 sub_domains,
 								 this->getProcessorBounds(),
 								 this->getGhost().getRcut(),
 								 this->getGhost());
+		}
 	}
 
 	/*! \brief Refine the decomposition, available only for ParMetis distribution, for Metis it is a null call
@@ -1577,7 +1586,7 @@ public:
 		return domain;
 	}
 
-	const openfpm::vector<SpaceBox<dim, T>,Memory,typename layout_base<SpaceBox<dim, T>>::type,layout_base> &
+	const openfpm::vector<SpaceBox<dim, T>,Memory,layout_base> &
 	getSubDomains() const
 	{
 		return sub_domains;
@@ -1826,7 +1835,7 @@ public:
 	bool write(std::string output) const
 	{
 		//! subdomains_X.vtk domain for the local processor (X) as union of sub-domain
-		VTKWriter<openfpm::vector<SpaceBox<dim, T>,Memory,typename layout_base<SpaceBox<dim, T>>::type,layout_base>, VECTOR_BOX> vtk_box1;
+		VTKWriter<openfpm::vector<SpaceBox<dim, T>,Memory,layout_base>, VECTOR_BOX> vtk_box1;
 		vtk_box1.add(sub_domains);
 		vtk_box1.write(output + std::string("subdomains_") + std::to_string(v_cl.getProcessUnitID()) + std::string(".vtk"));
 
@@ -2175,7 +2184,7 @@ public:
 	 * \return sub_domains_global
 	 *
 	 */
-	openfpm::vector<Box_map<dim, T>,Memory,typename layout_base<Box_map<dim, T>>::type,layout_base> & private_get_sub_domains_global()
+	openfpm::vector<Box_map<dim, T>,Memory,layout_base> & private_get_sub_domains_global()
 	{
 		return sub_domains_global;
 	}
diff --git a/src/Decomposition/Distribution/BoxDistribution.hpp b/src/Decomposition/Distribution/BoxDistribution.hpp
index a1a2265270928caaf1a1bfcec404683bead1eec6..13535c5bade04a2c27d101847ed81ad5e79501b0 100644
--- a/src/Decomposition/Distribution/BoxDistribution.hpp
+++ b/src/Decomposition/Distribution/BoxDistribution.hpp
@@ -228,7 +228,8 @@ public:
                 getPrimeFactors(v_cl.size(),facts);
 
                 size_t div[dim];
-		size_t ln[dim];
+				size_t ln[dim];
+				double ln_d[dim];
 
                 for (int i = 0 ; i < dim ; i++)
                 {div[i] = 1;}
@@ -237,7 +238,10 @@ public:
                 {div[i % dim] *= facts.get(i);}
 
                 for (int i = 0 ; i < dim ; i++)
-                {ln[i] = gr.size(i) / div[i];}
+                {
+					ln[i] = gr.size(i) / div[i];
+					ln_d[i] = (double)gr.size(i) / div[i];
+				}
 
                 grid_sm<dim,void> gr_proc(div);
 
@@ -249,10 +253,10 @@ public:
 
                     for (int i = 0 ; i < dim ; i++)
                     {
-			key_prc.set_d(i,key.get(i)/ln[i]);
-			if (key_prc.get(i) >= div[i])
-			{key_prc.set_d(i,div[i]-1);}
-		    }
+						key_prc.set_d(i,key.get(i)/ln_d[i]);
+						if (key_prc.get(i) >= div[i])
+						{key_prc.set_d(i,div[i]-1);}
+		    		}
 
                     size_t i = gr.LinId(key);
 
diff --git a/src/Decomposition/Distribution/Distribution_unit_tests.hpp b/src/Decomposition/Distribution/Distribution_unit_tests.hpp
index ac8d35ea74113e2f656affd7b8e39020c99a48c3..4836918c9e2c5a6faed7ad14dfec9fb7ec9d6728 100644
--- a/src/Decomposition/Distribution/Distribution_unit_tests.hpp
+++ b/src/Decomposition/Distribution/Distribution_unit_tests.hpp
@@ -11,6 +11,7 @@
 #include "config.h"
 #include "SpaceDistribution.hpp"
 #include <unistd.h>
+#include "BoxDistribution.hpp"
 
 /*! \brief Set a sphere as high computation cost
  *
@@ -423,6 +424,53 @@ BOOST_AUTO_TEST_CASE( Space_distribution_test)
 }
 
 
+BOOST_AUTO_TEST_CASE( Box_distribution_test)
+{
+	Vcluster<> & v_cl = create_vcluster();
+
+	if (v_cl.size() > 16)
+	{return;}
+
+	//! [Initialize a ParMetis Cartesian graph and decompose]
+
+	BoxDistribution<3, float> box_dist(v_cl);
+
+	// Physical domain
+	Box<3, float> box( { 0.0, 0.0, 0.0 }, { 10.0, 10.0, 10.0 });
+
+	// Grid info
+	grid_sm<3, void> info( { GS_SIZE, GS_SIZE, GS_SIZE });
+
+	// Initialize Cart graph and decompose
+	box_dist.createCartGraph(info,box);
+
+	// First create the center of the weights distribution, check it is coherent to the size of the domain
+	Point<3, float> center( { 2.0, 2.0, 2.0 });
+
+	// first decomposition
+	box_dist.decompose();
+
+	BOOST_REQUIRE_EQUAL(box_dist.get_ndec(),0ul);
+
+	auto & graph = box_dist.getGraph();
+
+	for (int i = 0 ; i < graph.getNVertex() ; i++)
+	{
+		BOOST_REQUIRE(graph.vertex(i).template get<nm_v_proc_id>() < v_cl.size());
+	}
+
+	size_t n_sub = box_dist.getNOwnerSubSubDomains();
+
+	size_t n_sub_tot = info.size();
+	size_t n_sub_bal = n_sub_tot / v_cl.size();
+
+	BOOST_REQUIRE( (((int)n_sub_bal - 64) <= (long int)n_sub) && (n_sub_bal + 64 >= n_sub) );
+
+	//! [refine with parmetis the decomposition]
+
+//	BOOST_REQUIRE_EQUAL(sizeof(ParMetisDistribution<3,float>),872ul);
+}
+
 BOOST_AUTO_TEST_SUITE_END()
 
 #endif /* SRC_DECOMPOSITION_DISTRIBUTION_DISTRIBUTION_UNIT_TESTS_HPP_ */
diff --git a/src/Decomposition/Distribution/metis_util.hpp b/src/Decomposition/Distribution/metis_util.hpp
index 92ceb7a9218f4fece6775988679c40ac7576c097..fd7d8273757bad531c28ed58f06a8b6abc468187 100644
--- a/src/Decomposition/Distribution/metis_util.hpp
+++ b/src/Decomposition/Distribution/metis_util.hpp
@@ -133,6 +133,16 @@ class Metis
 	void constructAdjListWithWeights(Graph & g)
 	{
 		// create xadj, adjlist, vwgt, adjwgt and vsize
+		if (Mg.xadj != NULL)
+		{delete [] Mg.xadj;}
+		if (Mg.adjncy != NULL)
+		{delete [] Mg.adjncy;}
+		if (Mg.vwgt != NULL)
+		{delete [] Mg.vwgt;}
+		if (Mg.adjwgt != NULL)
+		{delete [] Mg.adjwgt;}
+		if (Mg.vsize != NULL)
+		{delete [] Mg.vsize;}
 		Mg.xadj = new idx_t[g.getNVertex() + 1];
 		Mg.adjncy = new idx_t[g.getNEdge()];
 		Mg.vwgt = new idx_t[g.getNVertex()];
@@ -387,6 +397,10 @@ public:
 		{
 			delete[] Mg.part;
 		}
+        if (Mg.vsize != NULL)
+        {
+            delete[] Mg.vsize;
+        }
 	}
 
 	/*! \brief Decompose the graph
diff --git a/src/Decomposition/Domain_icells_cart.hpp b/src/Decomposition/Domain_icells_cart.hpp
index acd9837e0392c07d0fe94a4f6e144978ab1e1a9d..e2aa69184795b0141a37b7484a64efe9143553af 100644
--- a/src/Decomposition/Domain_icells_cart.hpp
+++ b/src/Decomposition/Domain_icells_cart.hpp
@@ -26,16 +26,20 @@ __global__ void insert_icell(vector_sparse_type vs, CellDecomposer_type cld, gri
 
 	unsigned int b = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
 
+	bool out = false;
 	for (unsigned int i = 0 ; i < dim ; i++)
 	{
 		gk.set_d(i,gk.get(i) + start.get(i));
 		if (gk.get(i) > stop.get(i))
-		{return;}
+		{out = true;}
 	}
 
-	auto id = cld.LinId(gk);
+	if (out == false)
+	{
+		auto id = cld.LinId(gk);
 
-	vs.insert_b(id,b);
+		vs.insert_b(id,b);
+	}
 
 	vs.flush_block_insert(b, threadIdx.x == 0 & threadIdx.y == 0 & threadIdx.z == 0 );
 }
@@ -50,17 +54,21 @@ __global__ void insert_remove_icell(vector_sparse_type vs, vector_sparse_type vs
 
 	unsigned int b = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
 
+	bool out = false;
 	for (unsigned int i = 0 ; i < dim ; i++)
 	{
 		gk.set_d(i,gk.get(i) + start.get(i));
 		if (gk.get(i) > stop.get(i))
-		{return;}
+		{out = true;}
 	}
 
-	auto id = cld.LinId(gk);
+	if (out == false)
+	{
+		auto id = cld.LinId(gk);
 
-	vs.insert_b(id,b);
-	vsi.remove_b(id,b);
+		vs.insert_b(id,b);
+		vsi.remove_b(id,b);
+	}
 
 	vs.flush_block_insert(b, threadIdx.x == 0 & threadIdx.y == 0 & threadIdx.z == 0 );
 	vsi.flush_block_remove(b, threadIdx.x == 0 & threadIdx.y == 0 & threadIdx.z == 0);
@@ -71,14 +79,14 @@ struct CalculateInternalCells_impl
 {
 	template<typename VCluster_type>
 	static void CalculateInternalCells(VCluster_type & v_cl,
-			openfpm::vector<Box<dim,T>,Memory,typename layout_base<Box<dim,T>>::type,layout_base> & ig_box,
-			openfpm::vector<SpaceBox<dim,T>,Memory,typename layout_base<SpaceBox<dim,T>>::type,layout_base> & domain,
+			openfpm::vector<Box<dim,T>,Memory,layout_base> & ig_box,
+			openfpm::vector<SpaceBox<dim,T>,Memory,layout_base> & domain,
 			Box<dim,T> & pbox,
 			T r_cut,
 			const Ghost<dim,T> & enlarge,
 			CellDecomposer_sm<dim,T,shift<dim,T>> & cd,
-			openfpm::vector<aggregate<ids_type>,Memory,typename layout_base<aggregate<ids_type>>::type,layout_base> & icells,
-			openfpm::vector<aggregate<ids_type>,Memory,typename layout_base<aggregate<ids_type>>::type,layout_base> & dcells)
+			openfpm::vector<aggregate<ids_type>,Memory,layout_base> & icells,
+			openfpm::vector<aggregate<ids_type>,Memory,layout_base> & dcells)
 	{
 
 	}
@@ -89,16 +97,16 @@ struct CalculateInternalCells_impl<dim,T,layout_base,Memory,cnt_type,ids_type,tr
 {
 	template<typename VCluster_type>
 	static void CalculateInternalCells(VCluster_type & v_cl,
-			openfpm::vector<Box<dim,T>,Memory,typename layout_base<Box<dim,T>>::type,layout_base> & ig_box,
-			openfpm::vector<SpaceBox<dim,T>,Memory,typename layout_base<SpaceBox<dim,T>>::type,layout_base> & domain,
+			openfpm::vector<Box<dim,T>,Memory,layout_base> & ig_box,
+			openfpm::vector<SpaceBox<dim,T>,Memory,layout_base> & domain,
 			Box<dim,T> & pbox,
 			T r_cut,
 			const Ghost<dim,T> & enlarge,
 			CellDecomposer_sm<dim,T,shift<dim,T>> & cd,
-			openfpm::vector<aggregate<ids_type>,Memory,typename layout_base<aggregate<ids_type>>::type,layout_base> & icells,
-			openfpm::vector<aggregate<ids_type>,Memory,typename layout_base<aggregate<ids_type>>::type,layout_base> & dcells)
+			openfpm::vector<aggregate<ids_type>,Memory,layout_base> & icells,
+			openfpm::vector<aggregate<ids_type>,Memory,layout_base> & dcells)
 	{
-#ifdef __NVCC__
+#if 0
 
 		// Division array
 		size_t div[dim];
@@ -186,6 +194,7 @@ struct CalculateInternalCells_impl<dim,T,layout_base,Memory,cnt_type,ids_type,tr
 			vsi.flush_remove(v_cl.getmgpuContext(),flush_type::FLUSH_ON_DEVICE);
 		}
 
+
 		vs.swapIndexVector(icells);
 		vsi.swapIndexVector(dcells);
 
@@ -202,8 +211,8 @@ class domain_icell_calculator
 
 	typedef int ids_type;
 
-	openfpm::vector<aggregate<ids_type>,Memory,typename layout_base<aggregate<ids_type>>::type,layout_base> icells;
-	openfpm::vector<aggregate<ids_type>,Memory,typename layout_base<aggregate<ids_type>>::type,layout_base> dcells;
+	openfpm::vector<aggregate<ids_type>,Memory,layout_base> icells;
+	openfpm::vector<aggregate<ids_type>,Memory,layout_base> dcells;
 
 	CellDecomposer_sm<dim,T,shift<dim,T>> cd;
 
@@ -246,8 +255,8 @@ class domain_icell_calculator
 	 */
 	template<typename VCluster_type>
 	void CalculateInternalCells(VCluster_type & v_cl,
-								openfpm::vector<Box<dim,T>,Memory,typename layout_base<Box<dim,T>>::type,layout_base> & ig_box,
-								openfpm::vector<SpaceBox<dim,T>,Memory,typename layout_base<SpaceBox<dim,T>>::type,layout_base> & domain,
+								openfpm::vector<Box<dim,T>,Memory,layout_base> & ig_box,
+								openfpm::vector<SpaceBox<dim,T>,Memory,layout_base> & domain,
 								Box<dim,T> & pbox,
 								T r_cut,
 								const Ghost<dim,T> & enlarge)
@@ -262,7 +271,7 @@ class domain_icell_calculator
 	 * \return the list of the internal cells
 	 *
 	 */
-	openfpm::vector<aggregate<ids_type>,Memory,typename layout_base<aggregate<ids_type>>::type,layout_base> & getIcells()
+	openfpm::vector<aggregate<ids_type>,Memory,layout_base> & getIcells()
 	{
 		return icells;
 	}
@@ -272,7 +281,7 @@ class domain_icell_calculator
 	 * \return the list of the internal cells
 	 *
 	 */
-	openfpm::vector<aggregate<ids_type>,Memory,typename layout_base<aggregate<ids_type>>::type,layout_base> & getDcells()
+	openfpm::vector<aggregate<ids_type>,Memory,layout_base> & getDcells()
 	{
 		return dcells;
 	}
diff --git a/src/Decomposition/cuda/Domain_icells_cart_unit_test.cu b/src/Decomposition/cuda/Domain_icells_cart_unit_test.cu
index 5fc8f5db1acd5802bf84beb84b8e76db822c0010..83052d394b248309290cf3a1bd39a98fb4a6bdd4 100644
--- a/src/Decomposition/cuda/Domain_icells_cart_unit_test.cu
+++ b/src/Decomposition/cuda/Domain_icells_cart_unit_test.cu
@@ -11,6 +11,8 @@ BOOST_AUTO_TEST_SUITE( domain_icells_cart )
 
 BOOST_AUTO_TEST_CASE( domain_icells_use )
 {
+#if 0
+
 	domain_icell_calculator<3,float,memory_traits_inte,CudaMemory> dcc;
 
 	openfpm::vector_gpu<SpaceBox<3,float>> domain_proc;
@@ -133,6 +135,8 @@ BOOST_AUTO_TEST_CASE( domain_icells_use )
 
 	for (size_t i = 0 ; i < dcheck.size() ; i++)
 	{BOOST_REQUIRE_EQUAL(dcheck.template get<0>(i),dc.template get<0>(i));}
+
+	#endif
 }
 
 
diff --git a/src/Decomposition/cuda/decomposition_cuda_tests.cu b/src/Decomposition/cuda/decomposition_cuda_tests.cu
index e578fe633474de975b30b6c54963cabe3edb68c9..a35dc741d71bab4b61f82831c1f3d41ec9b9298c 100644
--- a/src/Decomposition/cuda/decomposition_cuda_tests.cu
+++ b/src/Decomposition/cuda/decomposition_cuda_tests.cu
@@ -76,7 +76,7 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
 			CudaMemory mem;
 			mem.allocate(2*sizeof(unsigned int));
 
-			test_proc_idbc<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem.getDevicePointer());
+			CUDA_LAUNCH_DIM3((test_proc_idbc<decltype(gpudec)>),1,1,p1,p2,gpudec,(unsigned int *)mem.getDevicePointer());
 
 			mem.deviceToHost();
 
@@ -85,7 +85,7 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
 
 			CudaMemory mem2;
 			mem2.allocate(2*sizeof(unsigned int));
-			test_ghost_n<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer());
+			CUDA_LAUNCH_DIM3((test_ghost_n<decltype(gpudec)>),1,1,p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer());
 
 			mem2.deviceToHost();
 
@@ -93,7 +93,7 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
 
 			openfpm::vector_gpu<aggregate<int,int>> vd;
 			vd.resize(tot);
-			test_ghost<decltype(gpudec),decltype(vd.toKernel())><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer(),vd.toKernel());
+			CUDA_LAUNCH_DIM3((test_ghost<decltype(gpudec),decltype(vd.toKernel())>),1,1,p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer(),vd.toKernel());
 
 			if (((unsigned int *)mem.getPointer())[0] != ((unsigned int *)mem.getPointer())[1])
 			{
@@ -119,7 +119,7 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
 
 			p2.get(j) = std::nextafter(SpaceBox<3,double>(dec.getSubDomains().get(i)).getHigh(j),1.0);
 
-			test_proc_idbc<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem.getDevicePointer());
+			CUDA_LAUNCH_DIM3((test_proc_idbc<decltype(gpudec)>),1,1,p1,p2,gpudec,(unsigned int *)mem.getDevicePointer());
 
 			mem.deviceToHost();
 
@@ -127,14 +127,14 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
 			BOOST_REQUIRE(((unsigned int *)mem.getPointer())[1] < vcl.size());
 
 			mem2.allocate(2*sizeof(unsigned int));
-			test_ghost_n<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer());
+			CUDA_LAUNCH_DIM3((test_ghost_n<decltype(gpudec)>),1,1,p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer());
 
 			mem2.deviceToHost();
 
 			tot = ((unsigned int *)mem2.getPointer())[0] + ((unsigned int *)mem2.getPointer())[1];
 
 			vd.resize(tot);
-			test_ghost<decltype(gpudec),decltype(vd.toKernel())><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer(),vd.toKernel());
+			CUDA_LAUNCH_DIM3((test_ghost<decltype(gpudec),decltype(vd.toKernel())>),1,1,p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer(),vd.toKernel());
 
 			if (((unsigned int *)mem.getPointer())[0] != ((unsigned int *)mem.getPointer())[1])
 			{
diff --git a/src/Decomposition/ie_ghost.hpp b/src/Decomposition/ie_ghost.hpp
index cca146970240d20b3733fa44ed218103587b1903..2eadea6bad6427ae702d8ffadb48510c5ccf9a6d 100755
--- a/src/Decomposition/ie_ghost.hpp
+++ b/src/Decomposition/ie_ghost.hpp
@@ -59,18 +59,18 @@ class ie_ghost
 	openfpm::vector<p_box<dim,T> > vb_ext;
 
 	//! Internal ghost boxes for this processor domain
-	openfpm::vector<aggregate<unsigned int,unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int,unsigned int>>::type,layout_base> vb_int;
+	openfpm::vector<aggregate<unsigned int,unsigned int,unsigned int>,Memory,layout_base> vb_int;
 
 	//! Internal ghost boxes for this processor domain
-	openfpm::vector<Box<dim,T>,Memory,typename layout_base<Box<dim,T>>::type,layout_base> vb_int_box;
+	openfpm::vector<Box<dim,T>,Memory,layout_base> vb_int_box;
 
 	//! Cell-list that store the geometrical information of the internal ghost boxes
 	CellList<dim,T,Mem_fast<Memory,int>,shift<dim,T>> geo_cell;
 
-	typedef openfpm::vector<Box<dim,T>,Memory,typename layout_base<Box<dim,T>>::type,layout_base> proc_boxes;
+	typedef openfpm::vector<Box<dim,T>,Memory,layout_base> proc_boxes;
 
 	//! shift vectors
-	openfpm::vector<Point<dim,T>,Memory,typename layout_base<Point<dim,T>>::type,layout_base> shifts;
+	openfpm::vector<Point<dim,T>,Memory,layout_base> shifts;
 
 	//! Temporal buffers to return temporal information for ghost_processorID
 	openfpm::vector<std::pair<size_t,size_t>> ids_p;
@@ -245,7 +245,7 @@ protected:
 	 */
 	void create_box_nn_processor_ext(Vcluster<> & v_cl,
 			                         Ghost<dim,T> & ghost,
-									 openfpm::vector<SpaceBox<dim,T>,Memory,typename layout_base<SpaceBox<dim, T>>::type,layout_base> & sub_domains,
+									 openfpm::vector<SpaceBox<dim,T>,Memory,layout_base> & sub_domains,
 									 const openfpm::vector<openfpm::vector<long unsigned int> > & box_nn_processor,
 									 const nn_prcs<dim,T,layout_base,Memory> & nn_p)
 	{
@@ -347,7 +347,7 @@ protected:
 	 */
 	void create_box_nn_processor_int(Vcluster<> & v_cl,
 			                         Ghost<dim,T> & ghost,
-									 openfpm::vector<SpaceBox<dim,T>,Memory,typename layout_base<SpaceBox<dim, T>>::type,layout_base> & sub_domains,
+									 openfpm::vector<SpaceBox<dim,T>,Memory,layout_base> & sub_domains,
 									 const openfpm::vector<openfpm::vector<long unsigned int> > & box_nn_processor,
 									 const nn_prcs<dim,T,layout_base,Memory> & nn_p)
 	{
@@ -659,7 +659,7 @@ public:
 	 * \return the shift vectors
 	 *
 	 */
-	const openfpm::vector<Point<dim,T>,Memory,typename layout_base<Point<dim,T>>::type,layout_base> & getShiftVectors()
+	const openfpm::vector<Point<dim,T>,Memory,layout_base> & getShiftVectors()
 	{
 		if (host_dev_transfer == false)
 		{
@@ -1302,7 +1302,7 @@ public:
 	 * \return vb_int
 	 *
 	 */
-	inline openfpm::vector<aggregate<unsigned int,unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int,unsigned int>>::type,layout_base> &
+	inline openfpm::vector<aggregate<unsigned int,unsigned int,unsigned int>,Memory,layout_base> &
 	private_get_vb_int()
 	{
 		return vb_int;
@@ -1313,7 +1313,7 @@ public:
 	 * \return vb_int_box
 	 *
 	 */
-	inline openfpm::vector<Box<dim,T>,Memory,typename layout_base<Box<dim,T>>::type,layout_base> &
+	inline openfpm::vector<Box<dim,T>,Memory,layout_base> &
 	private_get_vb_int_box()
 	{
 		return vb_int_box;
@@ -1335,7 +1335,7 @@ public:
 	 * \return shifts
 	 *
 	 */
-	inline openfpm::vector<Point<dim,T>,Memory,typename layout_base<Point<dim,T>>::type,layout_base> &
+	inline openfpm::vector<Point<dim,T>,Memory,layout_base> &
 	private_get_shifts()
 	{
 		return shifts;
diff --git a/src/Decomposition/ie_loc_ghost.hpp b/src/Decomposition/ie_loc_ghost.hpp
index 2e93329ceabc4c6d30e494df95affeb2a98fe2b9..231828e1e71503d525a371be2bb14760799f134e 100755
--- a/src/Decomposition/ie_loc_ghost.hpp
+++ b/src/Decomposition/ie_loc_ghost.hpp
@@ -40,7 +40,7 @@ class ie_loc_ghost
 	 *
 	 */
 	void create_loc_ghost_ebox(Ghost<dim,T> & ghost,
-			                   openfpm::vector<SpaceBox<dim,T>,Memory,typename layout_base<SpaceBox<dim, T>>::type,layout_base> & sub_domains,
+			                   openfpm::vector<SpaceBox<dim,T>,Memory,layout_base> & sub_domains,
 							   openfpm::vector<Box_loc_sub<dim,T>> & sub_domains_prc)
 	{
 		comb<dim> zero;
@@ -101,7 +101,7 @@ class ie_loc_ghost
 	 *
 	 */
 	void create_loc_ghost_ibox(Ghost<dim,T> & ghost,
-			                   openfpm::vector<SpaceBox<dim,T>,Memory,typename layout_base<SpaceBox<dim, T>>::type,layout_base> & sub_domains,
+			                   openfpm::vector<SpaceBox<dim,T>,Memory,layout_base> & sub_domains,
 							   openfpm::vector<Box_loc_sub<dim,T>> & sub_domains_prc)
 	{
 		comb<dim> zero;
@@ -249,7 +249,7 @@ public:
 	 * \param bc Boundary conditions
 	 *
 	 */
-	void create(openfpm::vector<SpaceBox<dim,T>,Memory,typename layout_base<SpaceBox<dim, T>>::type,layout_base> & sub_domains, Box<dim,T> & domain , Ghost<dim,T> & ghost , const size_t (&bc)[dim] )
+	void create(openfpm::vector<SpaceBox<dim,T>,Memory,layout_base> & sub_domains, Box<dim,T> & domain , Ghost<dim,T> & ghost , const size_t (&bc)[dim] )
 	{
 		// It will store local sub-domains + borders
 		openfpm::vector<Box_loc_sub<dim,T>> sub_domains_prc;
diff --git a/src/Decomposition/nn_processor.hpp b/src/Decomposition/nn_processor.hpp
index eaf799e6f75928094253e5dd35a0d4fcc6737c45..9cf4f4e46b98bf448f5faed8c593c73404f0caed 100755
--- a/src/Decomposition/nn_processor.hpp
+++ b/src/Decomposition/nn_processor.hpp
@@ -431,7 +431,7 @@ public:
 	 *
 	 */
 	void create(const openfpm::vector<openfpm::vector<long unsigned int> > & box_nn_processor,
-			    const openfpm::vector<SpaceBox<dim,T>,Memory,typename layout_base<SpaceBox<dim, T>>::type,layout_base> & sub_domains)
+			    const openfpm::vector<SpaceBox<dim,T>,Memory,layout_base> & sub_domains)
 	{
 		// produce the list of the adjacent processor (nn_processors) list
 		for (size_t i = 0 ;  i < box_nn_processor.size() ; i++)
diff --git a/src/Decomposition/shift_vect_converter.hpp b/src/Decomposition/shift_vect_converter.hpp
index f640bbd5fd38f12db84e9efca4a7f5dd0b2b9099..8957e2674812c446673c94b2252fb91775729cc7 100644
--- a/src/Decomposition/shift_vect_converter.hpp
+++ b/src/Decomposition/shift_vect_converter.hpp
@@ -34,7 +34,7 @@ class shift_vect_converter
 	 *
 	 */
 	void generateShiftVectors_ld(const Box<dim,T> & domain, size_t (& bc)[dim],
-			                     openfpm::vector<Point<dim,T>,Memory,typename layout_base<Point<dim,T>>::type,layout_base> & shifts)
+			                     openfpm::vector<Point<dim,T>,Memory,layout_base> & shifts)
 	{
 		shifts.resize(openfpm::math::pow(3,dim));
 
@@ -71,7 +71,7 @@ class shift_vect_converter
 	 *
 	 */
 	void generateShiftVectors_hd(const Box<dim,T> & domain, size_t (& bc)[dim],
-			                     openfpm::vector<Point<dim,T>,Memory,typename layout_base<Point<dim,T>>::type,layout_base> & shifts)
+			                     openfpm::vector<Point<dim,T>,Memory,layout_base> & shifts)
 	{
 		// get the indexes of the free degree of freedom
 		for (size_t i = 0 ; i < dim ; i++)
@@ -126,7 +126,7 @@ public:
 	 *
 	 */
 	void generateShiftVectors(const Box<dim,T> & domain, size_t (& bc)[dim],
-			                  openfpm::vector<Point<dim,T>,Memory,typename layout_base<Point<dim,T>>::type,layout_base> & shifts)
+			                  openfpm::vector<Point<dim,T>,Memory,layout_base> & shifts)
 	{
 		if (dim < 10)
 		{generateShiftVectors_ld(domain,bc,shifts);}
diff --git a/src/Graph/dist_map_graph.hpp b/src/Graph/dist_map_graph.hpp
index 202137c93438d8351fcc22f92143570277be7ba6..e040c8983fb9fda3c5e5cd746ea9c9b4cc04d637 100644
--- a/src/Graph/dist_map_graph.hpp
+++ b/src/Graph/dist_map_graph.hpp
@@ -221,25 +221,25 @@ class DistGraph_CSR
 	size_t v_slot;
 
 	//! Structure that store the vertex properties
-	openfpm::vector<V, Memory, layout_v,layout_v_base,grow_p, openfpm::vect_isel<V>::value> v;
+	openfpm::vector<V, Memory,layout_v_base,grow_p, openfpm::vect_isel<V>::value> v;
 
 	//! Structure that store the vertex id and global id
-	openfpm::vector<v_info, Memory, typename memory_traits_lin<v_info>::type, memory_traits_lin, grow_p, openfpm::vect_isel<v_info>::value> v_m;
+	openfpm::vector<v_info, Memory, memory_traits_lin, grow_p, openfpm::vect_isel<v_info>::value> v_m;
 
 	//! Structure that store the number of adjacent vertex in e_l for each vertex
-	openfpm::vector<size_t, Memory, typename layout_v_base<size_t>::type, layout_v_base, grow_p, openfpm::vect_isel<size_t>::value> v_l;
+	openfpm::vector<size_t, Memory, layout_v_base, grow_p, openfpm::vect_isel<size_t>::value> v_l;
 
 	//! Structure that store the edge properties
-	openfpm::vector<E, Memory, layout_e, layout_e_base, grow_p, openfpm::vect_isel<E>::value> e;
+	openfpm::vector<E, Memory, layout_e_base, grow_p, openfpm::vect_isel<E>::value> e;
 
 	//! Structure that store the edge properties
-	openfpm::vector<e_info, Memory, typename layout_e_base<e_info>::type, layout_e_base, grow_p, openfpm::vect_isel<e_info>::value> e_m;
+	openfpm::vector<e_info, Memory, layout_e_base, grow_p, openfpm::vect_isel<e_info>::value> e_m;
 
 	//! Structure that store for each vertex the adjacent the vertex id and edge id (for property into e)
-	openfpm::vector<e_map, Memory, typename memory_traits_lin<e_map>::type, layout_e_base, grow_p, openfpm::vect_isel<e_map>::value> e_l;
+	openfpm::vector<e_map, Memory, layout_e_base, grow_p, openfpm::vect_isel<e_map>::value> e_l;
 
 	//! invalid edge element, when a function try to create an in valid edge this object is returned
-	openfpm::vector<E, Memory, layout_e, layout_e_base, grow_p, openfpm::vect_isel<E>::value> e_invalid;
+	openfpm::vector<E, Memory, layout_e_base, grow_p, openfpm::vect_isel<E>::value> e_invalid;
 
 	//! Map to access to the global vertex id given the vertex id
 	std::unordered_map<size_t, size_t> id2glb;
@@ -251,7 +251,7 @@ class DistGraph_CSR
 	std::unordered_map<size_t, size_t> glb2loc;
 
 	//! Struct containing the (sub)graph to send
-	typedef struct
+	struct SendGraphPack
 	{
 		//! vertex send buffer
 		openfpm::vector<V> send_v;
@@ -267,7 +267,7 @@ class DistGraph_CSR
 		openfpm::vector<size_t> send_es;
 		//! Indicates if the pack is empty or not
 		bool isEmpty = true;
-	} SendGraphPack;
+	};
 
 	//! Pack storing that data to send to other processors
 	openfpm::vector<SendGraphPack> sgp;
@@ -1001,10 +1001,10 @@ public:
 	typedef E E_type;
 
 	//! Object container for the vertex, for example can be encap<...> (map_grid or openfpm::vector)
-	typedef typename openfpm::vector<V, Memory, layout_v, layout_v_base, grow_p, openfpm::vect_isel<V>::value>::container V_container;
+	typedef typename openfpm::vector<V, Memory, layout_v_base, grow_p, openfpm::vect_isel<V>::value>::container V_container;
 
 	//! Object container for the edge, for example can be encap<...> (map_grid or openfpm::vector)
-	typedef typename openfpm::vector<E, Memory, layout_e, layout_e_base, grow_p, openfpm::vect_isel<E>::value>::container E_container;
+	typedef typename openfpm::vector<E, Memory, layout_e_base, grow_p, openfpm::vect_isel<E>::value>::container E_container;
 
 	/*! \brief It duplicate the graph
 	 *
@@ -1577,7 +1577,7 @@ public:
 	 * \return the number of childs
 	 *
 	 */
-	inline size_t getNChilds(typename openfpm::vector<V, Memory, layout_v, layout_v_base, grow_p, openfpm::vect_isel<V>::value>::iterator_key & c)
+	inline size_t getNChilds(typename openfpm::vector<V, Memory, layout_v_base, grow_p, openfpm::vect_isel<V>::value>::iterator_key & c)
 	{
 		return v_l.template get<0>(c.get());
 	}
@@ -1695,7 +1695,7 @@ public:
 	 * \return the target i connected by an edge node, for the node v
 	 *
 	 */
-	inline size_t getChild(typename openfpm::vector<V, Memory, layout_v, layout_v_base, grow_p, openfpm::vect_isel<V>::value>::iterator_key & v, size_t i)
+	inline size_t getChild(typename openfpm::vector<V, Memory, layout_v_base, grow_p, openfpm::vect_isel<V>::value>::iterator_key & v, size_t i)
 	{
 #ifdef DEBUG
 		if (i >= v_l.template get<0>(v.get()))
diff --git a/src/Grid/Iterators/grid_dist_id_iterator.hpp b/src/Grid/Iterators/grid_dist_id_iterator.hpp
index df83f0add9cca4ee213fb560f02e743efa864770..308567b31fff370d39ac7627b29548b65b03c940 100644
--- a/src/Grid/Iterators/grid_dist_id_iterator.hpp
+++ b/src/Grid/Iterators/grid_dist_id_iterator.hpp
@@ -45,12 +45,14 @@ struct launch_insert_sparse_lambda_call<3>
 									   unsigned int blockId,
 									   itd_type itd,
 									   coord_type & key,
-									   coord_type & keyg,unsigned int offset, bool & is_block_empty)
+									   coord_type & keyg,unsigned int offset, bool & is_block_empty,
+									   bool is_in)
 	{
 #ifdef __NVCC__
 
-	    bool is_active = f1(keyg.get(0),keyg.get(1),keyg.get(2));
-	    is_active &= key.get(0) >= itd.start_base.get(0) && key.get(1) >= itd.start_base.get(1) && key.get(2) >= itd.start_base.get(2);
+	    bool is_active = false;
+		if (is_in == true)
+		{is_active = f1(keyg.get(0),keyg.get(1),keyg.get(2));}
 
 	    if (is_active == true)
 	    {is_block_empty = false;}
@@ -85,7 +87,8 @@ struct launch_insert_sparse_lambda_call<3>
 		keyg.set_d(1,key.get(1) + itg.origin.get(1));
 		keyg.set_d(2,key.get(2) + itg.origin.get(2));
 
-		if (key.get(0) > itg.stop.get(0) || key.get(1) > itg.stop.get(1) || key.get(2) > itg.stop.get(2))
+		if (key.get(0) > itg.stop.get(0)       || key.get(1) > itg.stop.get(1)       || key.get(2) > itg.stop.get(2) ||
+		    key.get(0) < itg.start_base.get(0) || key.get(1) < itg.start_base.get(1) || key.get(2) < itg.start_base.get(2))
 		{return true;}
 #endif
 		return false;
@@ -101,12 +104,14 @@ struct launch_insert_sparse_lambda_call<2>
 									   unsigned int blockId,
 									   itd_type itd,
 									   coord_type & key,
-									   coord_type & keyg,unsigned int offset, bool & is_block_empty)
+									   coord_type & keyg,unsigned int offset, bool & is_block_empty,
+									   bool is_in)
 	{
 #ifdef __NVCC__
 
-	    bool is_active = f1(keyg.get(0),keyg.get(1));
-	    is_active &= key.get(0) >= itd.start_base.get(0) && key.get(1) >= itd.start_base.get(1);
+	    bool is_active = false;
+		if (is_in == true)
+		{is_active = f1(keyg.get(0),keyg.get(1));}
 
 	    if (is_active == true)
 	    {is_block_empty = false;}
@@ -138,7 +143,8 @@ struct launch_insert_sparse_lambda_call<2>
 		keyg.set_d(0,key.get(0) + itg.origin.get(0));
 		keyg.set_d(1,key.get(1) + itg.origin.get(1));
 
-		if (key.get(0) > itg.stop.get(0) || key.get(1) > itg.stop.get(1))
+		if (key.get(0) > itg.stop.get(0)        || key.get(1) > itg.stop.get(1) ||
+		    key.get(0) < itg.start_base.get(0)  || key.get(1) < itg.start_base.get(1))
 		{return true;}
 #endif
 		return false;
@@ -155,20 +161,20 @@ struct launch_insert_sparse
 		grid_key_dx<grid_type::dims,int> key;
 		grid_key_dx<grid_type::dims,int> keyg;
 
-		if (launch_insert_sparse_lambda_call<grid_type::dims>::set_keys(key,keyg,itg) == true)	{return;}
+		bool not_active = launch_insert_sparse_lambda_call<grid_type::dims>::set_keys(key,keyg,itg);
 
-	    if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
-	    {is_block_empty = true;}
+	    	if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
+	    	{is_block_empty = true;}
 
 	    grid.init();
 
 	    int offset = 0;
-	    grid_key_dx<grid_type::dims,int> blk;
-	    bool out = grid.template getInsertBlockOffset<ite_type>(itg,key,blk,offset);
+	    	grid_key_dx<grid_type::dims,int> blk;
+	    	bool out = grid.template getInsertBlockOffset<ite_type>(itg,key,blk,offset);
 
-	    auto blockId = grid.getBlockLinId(blk);
+	    	auto blockId = grid.getBlockLinId(blk);
 
-	    launch_insert_sparse_lambda_call<grid_type::dims>::call(grid,f1,f2,blockId,itg,key,keyg,offset,is_block_empty);
+		launch_insert_sparse_lambda_call<grid_type::dims>::call(grid,f1,f2,blockId,itg,key,keyg,offset,is_block_empty,!not_active);
 
 	    __syncthreads();
 
diff --git a/src/Grid/cuda/grid_dist_id_iterator_gpu.cuh b/src/Grid/cuda/grid_dist_id_iterator_gpu.cuh
index 9fba5a678cca86127179d5ae8659b8dea8addf9d..66d1cbbdad2e316c22a311009a110cd122907de5 100644
--- a/src/Grid/cuda/grid_dist_id_iterator_gpu.cuh
+++ b/src/Grid/cuda/grid_dist_id_iterator_gpu.cuh
@@ -220,7 +220,9 @@ class grid_dist_id_iterator_gpu
 			Box<Decomposition::dims,int> range_box(start,stop);
 			Box<Decomposition::dims,int> kbox;
 			range_box -= gdb_ext.get(g_c).origin;
-			range_box.Intersect(gdb_ext.get(g_c).Dbox,kbox);
+			bool intersect = range_box.Intersect(gdb_ext.get(g_c).Dbox,kbox);
+
+			if (intersect == false)	{continue;}
 
 			auto & lg = loc_grids.get(g_c);
 
@@ -234,7 +236,7 @@ class grid_dist_id_iterator_gpu
 
 			for (int i = 0 ; i < Decomposition::dims ; i++)
 			{
-				itd.origin.set_d(i,gdb_ext.get(g_c).origin.get(i) + ite.start.get(i));
+				itd.origin.set_d(i,gdb_ext.get(g_c).origin.get(i));
 				itd.start_base.set_d(i,kbox.getKP1().get(i) % lg.getBlockEdgeSize() + ite.start.get(i));
 			}
 
diff --git a/src/Grid/cuda/grid_dist_id_kernels.cuh b/src/Grid/cuda/grid_dist_id_kernels.cuh
index 2af79459a3d6c82d6f6e8ab0315bc4e4f91eed53..e01fd5199584642bf9a5fa2d4c98a83647985efc 100644
--- a/src/Grid/cuda/grid_dist_id_kernels.cuh
+++ b/src/Grid/cuda/grid_dist_id_kernels.cuh
@@ -47,24 +47,28 @@ struct ite_gpu_dist
     						  key.set_d(1,threadIdx.y + blockIdx.y * blockDim.y + ite_gpu.start.get(1));\
 							  key.set_d(2,threadIdx.z + blockIdx.z * blockDim.z + ite_gpu.start.get(2));\
 							  \
+							  bool inactive = false;\
+							  \
 							  keyg.set_d(0,key.get(0) + ite_gpu.origin.get(0));\
     						  keyg.set_d(1,key.get(1) + ite_gpu.origin.get(1));\
 							  keyg.set_d(2,key.get(2) + ite_gpu.origin.get(2));\
 										 \
 										 if (key.get(0) > ite_gpu.stop.get(0) || key.get(1) > ite_gpu.stop.get(1) || key.get(2) > ite_gpu.stop.get(2))\
-    									 {return;}
+    									 {inactive = true;}
 
 
 #define GRID_ID_2_GLOBAL(ite_gpu) grid_key_dx<2,int> key;\
 								  grid_key_dx<2,int> keyg;\
 							  key.set_d(0,threadIdx.x + blockIdx.x * blockDim.x + ite_gpu.start.get(0));\
     						  key.set_d(1,threadIdx.y + blockIdx.y * blockDim.y + ite_gpu.start.get(1));\
+							  \
+							  bool inactive = false;\
 							  \
 							  keyg.set_d(0,key.get(0) + ite_gpu.origin.get(0));\
     						  keyg.set_d(1,key.get(1) + ite_gpu.origin.get(1));\
 										 \
 										 if (key.get(0) > ite_gpu.stop.get(0) || key.get(1) > ite_gpu.stop.get(1))\
-    									 {return;}
+    									 {inactive = true;}
 
 #endif
 
diff --git a/src/Grid/grid_dist_id.hpp b/src/Grid/grid_dist_id.hpp
index 78da5846259b97c6f98b9efffc85ab149d88aef3..efa4a81557ff2396372fef3fbe514c949c01610d 100644
--- a/src/Grid/grid_dist_id.hpp
+++ b/src/Grid/grid_dist_id.hpp
@@ -26,6 +26,7 @@
 #include "grid_dist_id_comm.hpp"
 #include "HDF5_wr/HDF5_wr.hpp"
 #include "SparseGrid/SparseGrid.hpp"
+#include "lib/pdata.hpp"
 #ifdef __NVCC__
 #include "cuda/grid_dist_id_kernels.cuh"
 #include "Grid/cuda/grid_dist_id_iterator_gpu.cuh"
@@ -863,7 +864,21 @@ class grid_dist_id : public grid_dist_id_comm<dim,St,T,Decomposition,Memory,devi
 		for (size_t i = 0 ; i < dim ; i++)
 		{
 			if (g_sz[i] < 2)
-				std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " distributed grids with size smaller than 2 are not supported\n";
+			{std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " distributed grids with size smaller than 2 are not supported\n";}
+		}
+	}
+
+	/*! \brief Check the domain is valid
+	 *
+	 * \param dom domain is valid
+	 *
+	 */
+	inline void check_domain(const Box<dim,St> & dom)
+	{
+		for (size_t i = 0 ; i < dim ; i++)
+		{
+			if (dom.getLow(i) >= dom.getHigh(i))
+			{std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " error the simulation domain is invalid\n";}
 		}
 	}
 
@@ -944,7 +959,7 @@ class grid_dist_id : public grid_dist_id_comm<dim,St,T,Decomposition,Memory,devi
 	 * \param bc boundary conditions
 	 *
 	 */
-	inline void InitializeDecomposition(const size_t (& g_sz)[dim], const size_t (& bc)[dim])
+	inline void InitializeDecomposition(const size_t (& g_sz)[dim], const size_t (& bc)[dim], const grid_sm<dim,void> & g_dist = grid_sm<dim,void>())
 	{
 		// fill the global size of the grid
 		for (size_t i = 0 ; i < dim ; i++)	{this->g_sz[i] = g_sz[i];}
@@ -960,9 +975,15 @@ class grid_dist_id : public grid_dist_id_comm<dim,St,T,Decomposition,Memory,devi
 		for (size_t i = 0 ; i < dim ; i++)
 		{div[i] = openfpm::math::round_big_2(pow(n_sub,1.0/dim));}
 
+		if (g_dist.size(0) != 0)
+		{
+			for (size_t i = 0 ; i < dim ; i++)
+			{div[i] = g_dist.size(i);}
+		}
+
 		// Create the sub-domains
 		dec.setParameters(div,domain,bc,ghost);
-		dec.decompose();
+		dec.decompose(dec_options::DEC_SKIP_ICELL);
 	}
 
 	/*! \brief Initialize the grid
@@ -1440,6 +1461,7 @@ public:
 		if (opt >> 32 != 0)
 		{this->setDecompositionGranularity(opt >> 32);}
 
+		check_domain(domain);
 		InitializeCellDecomposer(g_sz,p.bc);
 		InitializeDecomposition(g_sz, p.bc);
 		InitializeStructures(g_sz);
@@ -1456,7 +1478,7 @@ public:
      * \warning In very rare case the ghost part can be one point bigger than the one specified
      *
      */
-	grid_dist_id(const size_t (& g_sz)[dim],const Box<dim,St> & domain, const Ghost<dim,long int> & g, const periodicity<dim> & p, size_t opt = 0)
+	grid_dist_id(const size_t (& g_sz)[dim],const Box<dim,St> & domain, const Ghost<dim,long int> & g, const periodicity<dim> & p, size_t opt = 0, const grid_sm<dim,void> & g_dec = grid_sm<dim,void>())
 	:domain(domain),ghost_int(g),dec(create_vcluster()),v_cl(create_vcluster()),ginfo(g_sz),ginfo_v(g_sz)
 	{
 #ifdef SE_CLASS2
@@ -1466,11 +1488,12 @@ public:
 		if (opt >> 32 != 0)
 		{this->setDecompositionGranularity(opt >> 32);}
 
+		check_domain(domain);
 		InitializeCellDecomposer(g_sz,p.bc);
 
 		ghost = convert_ghost(g,cd_sm);
 
-		InitializeDecomposition(g_sz,p.bc);
+		InitializeDecomposition(g_sz,p.bc,g_dec);
 
 		// an empty
 		openfpm::vector<Box<dim,long int>> empty;
@@ -1479,6 +1502,7 @@ public:
 		InitializeStructures(g_sz,empty,g,false);
 	}
 
+
 	/*! \brief It construct a grid on the full domain restricted
 	 *         to the set of boxes specified
 	 *
@@ -1504,6 +1528,7 @@ public:
 		check_new(this,8,GRID_DIST_EVENT,4);
 #endif
 
+		check_domain(domain);
 		InitializeCellDecomposer(g_sz,p.bc);
 
 		ghost = convert_ghost(g,cd_sm);
@@ -1743,6 +1768,11 @@ public:
 
 #ifdef __NVCC__
 
+	/*! \brief Insert point in the grid
+    *
+	* \param f1 lambda function to insert point
+	* \param f2 lambda function to set points
+	*/
 	template<typename lambda_t1, typename lambda_t2>
 	void addPoints(lambda_t1 f1, lambda_t2 f2)
 	{
@@ -1752,6 +1782,13 @@ public:
 		it.template launch<1>(launch_insert_sparse(),f1,f2);
 	}
 
+	/*! \brief Insert point in the grid between start and stop
+	*
+	* \param start point
+	* \param stop point
+	* \param f1 lambda function to insert point
+	* \param f2 lambda function to set points
+	*/
 	template<typename lambda_t1, typename lambda_t2>
 	void addPoints(grid_key_dx<dim> k1, grid_key_dx<dim> k2, lambda_t1 f1, lambda_t2 f2)
 	{
@@ -1985,6 +2022,18 @@ public:
 		return v_cl;
 	}
 
+	/*! \brief Eliminate many internal temporary buffer you can use this between flushes if you get some out of memory
+	 *
+	 *
+	 */
+	void removeUnusedBuffers()
+	{
+		for (int i = 0 ; i < loc_grid.size() ; i++)
+		{
+			loc_grid.get(i).removeUnusedBuffers();
+		}
+	}
+
 	/*! \brief Indicate that this grid is not staggered
 	 *
 	 * \return false
@@ -2074,10 +2123,7 @@ public:
 	 *
 	 */
 	template <unsigned int p,typename bg_key>inline auto insert(const grid_dist_key_dx<dim,bg_key> & v1)
-	-> typename std::add_lvalue_reference
-	<
-		decltype(loc_grid.get(v1.getSub()).template insert<p>(v1.getKey()))
-	>::type
+	-> decltype(loc_grid.get(v1.getSub()).template insert<p>(v1.getKey()))
 	{
 #ifdef SE_CLASS2
 		check_valid(this,8);
@@ -2104,10 +2150,7 @@ public:
 	 *
 	 */
 	template <unsigned int p,typename bg_key>inline auto insertFlush(const grid_dist_key_dx<dim,bg_key> & v1)
-	-> typename std::add_lvalue_reference
-	<
-		decltype(loc_grid.get(v1.getSub()).template insertFlush<p>(v1.getKey()))
-	>::type
+	-> decltype(loc_grid.get(v1.getSub()).template insertFlush<p>(v1.getKey()))
 	{
 #ifdef SE_CLASS2
 		check_valid(this,8);
@@ -2145,7 +2188,7 @@ public:
 	 */
 	template <unsigned int p, typename bg_key>
 	inline auto get(const grid_dist_key_dx<dim,bg_key> & v1)
-	-> typename std::add_lvalue_reference<decltype(loc_grid.get(v1.getSub()).template get<p>(v1.getKey()))>::type
+	-> decltype(loc_grid.get(v1.getSub()).template get<p>(v1.getKey()))
 	{
 #ifdef SE_CLASS2
 		check_valid(this,8);
@@ -2162,7 +2205,8 @@ public:
 	 *
 	 */
 	template <unsigned int p = 0>
-	inline auto get(const grid_dist_g_dx<device_grid> & v1) const -> typename std::add_lvalue_reference<decltype(v1.getSub()->template get<p>(v1.getKey()))>::type
+	inline auto get(const grid_dist_g_dx<device_grid> & v1) const
+	-> decltype(v1.getSub()->template get<p>(v1.getKey()))
 	{
 #ifdef SE_CLASS2
 		check_valid(this,8);
@@ -2179,7 +2223,7 @@ public:
 	 *
 	 */
 	template <unsigned int p = 0>
-	inline auto get(const grid_dist_g_dx<device_grid> & v1) -> typename std::add_lvalue_reference<decltype(v1.getSub()->template get<p>(v1.getKey()))>::type
+	inline auto get(const grid_dist_g_dx<device_grid> & v1) -> decltype(v1.getSub()->template get<p>(v1.getKey()))
 	{
 #ifdef SE_CLASS2
 		check_valid(this,8);
@@ -2196,7 +2240,7 @@ public:
 	 *
 	 */
 	template <unsigned int p = 0>
-	inline auto get(const grid_dist_lin_dx & v1) const -> typename std::add_lvalue_reference<decltype(loc_grid.get(v1.getSub()).template get<p>(v1.getKey()))>::type
+	inline auto get(const grid_dist_lin_dx & v1) const -> decltype(loc_grid.get(v1.getSub()).template get<p>(v1.getKey()))
 	{
 #ifdef SE_CLASS2
 		check_valid(this,8);
@@ -2213,7 +2257,7 @@ public:
 	 *
 	 */
 	template <unsigned int p = 0>
-	inline auto get(const grid_dist_lin_dx & v1) -> typename std::add_lvalue_reference<decltype(loc_grid.get(v1.getSub()).template get<p>(v1.getKey()))>::type
+	inline auto get(const grid_dist_lin_dx & v1) -> decltype(loc_grid.get(v1.getSub()).template get<p>(v1.getKey()))
 	{
 #ifdef SE_CLASS2
 		check_valid(this,8);
@@ -2566,6 +2610,35 @@ public:
 		}
 	}
 
+	/*! \brief apply a convolution using the stencil N
+	 *
+	 *
+	 */
+	template<unsigned int stencil_size, typename v_type, typename lambda_f, typename ... ArgsT >
+	void conv_cross_ids(grid_key_dx<3> start, grid_key_dx<3> stop , lambda_f func, ArgsT ... args)
+	{
+		for (int i = 0 ; i < loc_grid.size() ; i++)
+		{
+			Box<dim,long int> inte;
+
+			Box<dim,long int> base;
+			for (int j = 0 ; j < dim ; j++)
+			{
+				base.setLow(j,(long int)start.get(j) - (long int)gdb_ext.get(i).origin.get(j));
+				base.setHigh(j,(long int)stop.get(j) - (long int)gdb_ext.get(i).origin.get(j));
+			}
+
+			Box<dim,long int> dom = gdb_ext.get(i).Dbox;
+
+			bool overlap = dom.Intersect(base,inte);
+
+			if (overlap == true)
+			{
+				loc_grid.get(i).template conv_cross_ids<stencil_size,v_type>(inte.getKP1(),inte.getKP2(),func,args...);
+			}
+		}
+	}
+
 	/*! \brief apply a convolution using the stencil N
 	 *
 	 *
@@ -2982,7 +3055,11 @@ public:
 	{
 		// Save the background values
 		T bv;
-		meta_copy<T>::meta_copy_(bv,loc_grid.get(0).getBackgroundValue());
+
+		copy_aggregate_dual<decltype(loc_grid.get(0).getBackgroundValue()),
+				            T> ca(loc_grid.get(0).getBackgroundValue(),bv);
+
+		boost::mpl::for_each_ref<boost::mpl::range_c<int,0,T::max_prop>>(ca);
 
 		if (!(opt & NO_GDB_EXT_SWITCH))
 		{
@@ -3037,8 +3114,32 @@ public:
 		}
 		else
 		{
-			loc_grid.swap(loc_grid_old);
-			gdb_ext_old.swap(gdb_ext);
+			for (int i = 0 ; i < gdb_ext_old.size() ; i++)
+			{
+					auto & lg = loc_grid_old.get(i);
+					auto it_src = lg.getIterator(gdb_ext_old.get(i).Dbox.getKP1(),gdb_ext_old.get(i).Dbox.getKP2());
+					auto & dg = loc_grid.get(0);
+					grid_key_dx<dim> kp1 = gdb_ext.get(0).Dbox.getKP1();
+
+					grid_key_dx<dim> orig;
+					for (int j = 0 ; j < dim ; j++)
+					{
+							orig.set_d(j,gdb_ext_old.get(i).origin.get(j));
+					}
+
+					while (it_src.isNext())
+					{
+							auto key = it_src.get();
+							grid_key_dx<dim> key_dst;
+
+							for (int j = 0 ; j < dim ; j++)
+							{key_dst.set_d(j,key.get(j) + orig.get(j) + kp1.get(j));}
+
+							dg.get_o(key_dst) = lg.get_o(key);
+
+							++it_src;
+					}
+			}
 		}
 	}
 
@@ -3073,6 +3174,34 @@ public:
 		return this->ig_box;
 	}
 
+	void print_stats()
+	{
+		std::cout << "-- REPORT --" << std::endl;
+#ifdef ENABLE_GRID_DIST_ID_PERF_STATS
+		std::cout << "Processor: " << v_cl.rank() << " Time spent in packing data: " << tot_pack << std::endl;
+		std::cout << "Processor: " << v_cl.rank() << " Time spent in sending and receving data: " << tot_sendrecv << std::endl;
+		std::cout << "Processor: " << v_cl.rank() << " Time spent in merging: " << tot_merge << std::endl;
+		std::cout << "Processor: " << v_cl.rank() << " Time spent in local merging: " << tot_loc_merge << std::endl;
+#else
+
+		std::cout << "Enable ENABLE_GRID_DIST_ID_PERF_STATS if you want to activate this feature" << std::endl;
+
+#endif
+	}
+
+	void clear_stats()
+	{
+#ifdef ENABLE_GRID_DIST_ID_PERF_STATS
+		tot_pack = 0;
+		tot_sendrecv = 0;
+		tot_merge = 0;
+#else
+
+		std::cout << "Enable ENABLE_GRID_DIST_ID_PERF_STATS if you want to activate this feature" << std::endl;
+
+#endif
+	}
+
 #ifdef __NVCC__
 
 	/*! \brief Set the number inserts each GPU thread do
@@ -3176,16 +3305,21 @@ public:
 };
 
 
-template<unsigned int dim, typename St, typename T>
-using sgrid_dist_id = grid_dist_id<dim,St,T,CartDecomposition<dim,St>,HeapMemory,sgrid_cpu<dim,T,HeapMemory>>;
+template<unsigned int dim, typename St, typename T, typename Memory = HeapMemory, typename Decomposition = CartDecomposition<dim,St> >
+using sgrid_dist_id = grid_dist_id<dim,St,T,Decomposition,Memory,sgrid_cpu<dim,T,Memory>>;
 
-template<unsigned int dim, typename St, typename T>
-using sgrid_dist_soa = grid_dist_id<dim,St,T,CartDecomposition<dim,St>,HeapMemory,sgrid_soa<dim,T,HeapMemory>>;
+template<unsigned int dim, typename St, typename T, typename Memory = HeapMemory, typename Decomposition = CartDecomposition<dim,St>>
+using sgrid_dist_soa = grid_dist_id<dim,St,T,Decomposition,Memory,sgrid_soa<dim,T,Memory>>;
 
+template<unsigned int dim, typename St, typename T, typename devg, typename Memory = HeapMemory, typename Decomposition = CartDecomposition<dim,St>>
+using grid_dist_id_devg = grid_dist_id<dim,St,T,Decomposition,Memory,devg>;
 
 #ifdef __NVCC__
-template<unsigned int dim, typename St, typename T>
-using sgrid_dist_id_gpu = grid_dist_id<dim,St,T,CartDecomposition<dim,St,CudaMemory,memory_traits_inte>,CudaMemory,SparseGridGpu<dim,T>>;
+template<unsigned int dim, typename St, typename T, typename Memory = CudaMemory, typename Decomposition = CartDecomposition<dim,St,CudaMemory,memory_traits_inte> >
+using sgrid_dist_id_gpu = grid_dist_id<dim,St,T,Decomposition,Memory,SparseGridGpu<dim,T>>;
+
+template<unsigned int dim, typename St, typename T, typename Memory = CudaMemory, typename Decomposition = CartDecomposition<dim,St,CudaMemory,memory_traits_inte> >
+using sgrid_dist_sid_gpu = grid_dist_id<dim,St,T,Decomposition,Memory,SparseGridGpu<dim,T,default_edge<dim>::type::value,default_edge<dim>::tb::value,int>>;
 #endif
 
 #endif
diff --git a/src/Grid/grid_dist_id_comm.hpp b/src/Grid/grid_dist_id_comm.hpp
index d9eefe479d0c1ca6dfb19c92547634cd37cbc3e2..5508aa8f1d5fc4ad356f6a01b989f4d72a824a61 100644
--- a/src/Grid/grid_dist_id_comm.hpp
+++ b/src/Grid/grid_dist_id_comm.hpp
@@ -12,6 +12,8 @@
 #include "Grid/copy_grid_fast.hpp"
 #include "grid_dist_util.hpp"
 #include "util/common_pdata.hpp"
+#include "lib/pdata.hpp"
+
 
 /*! \brief Unpack selector
  *
@@ -159,7 +161,7 @@ class grid_dist_id_comm
 	openfpm::vector<size_t> send_size;
 
 	//! receiving buffers in case of dynamic
-	openfpm::vector<BMemory<Memory>> recv_buffers;
+	openfpm::vector_fr<BMemory<Memory>> recv_buffers;
 
 	struct rp_id
 	{
@@ -191,6 +193,11 @@ class grid_dist_id_comm
 	openfpm::vector<void *> pointers;
 	openfpm::vector<void *> pointers2;
 
+	//! header unpacker info
+	openfpm::vector_gpu<aggregate<void *,void *,int>> pointers_h;
+	int n_headers_slot = 1;
+	openfpm::vector_gpu<aggregate<size_t,size_t,unsigned int>> headers;
+
 	//! Receiving option
 	size_t opt;
 
@@ -437,9 +444,9 @@ class grid_dist_id_comm
 
 			if (send_prc_queue.size() == 0)
 			{
-                                v_cl.sendrecvMultipleMessagesNBX(send_prc_queue.size(),NULL,
-                                                                                         NULL,NULL,
-                                                                                         receive_dynamic,this);
+                v_cl.sendrecvMultipleMessagesNBX(send_prc_queue.size(),NULL,
+                                                                        NULL,NULL,
+                                                                        receive_dynamic,this);
 			}
 			else
 			{
@@ -452,7 +459,7 @@ class grid_dist_id_comm
 
 			recv_proc.sort();
 
-			openfpm::vector<BMemory<Memory>> tmp;
+			openfpm::vector_fr<BMemory<Memory>> tmp;
 			tmp.resize(recv_proc.size());
 
 			for (int i = 0 ; i < recv_proc.size() ; i++)
@@ -577,7 +584,7 @@ class grid_dist_id_comm
 		size_t sub_id = eg_box.get(ei).bid.get(le_id).sub;
 
 		// sub-grid where to unpack
-		auto sub2 = loc_grid.get(sub_id).getIterator(box.getKP1(),box.getKP2());
+		auto sub2 = loc_grid.get(sub_id).getIterator(box.getKP1(),box.getKP2(),false);
 
 		rem_copy_opt opt_ = rem_copy_opt::NONE_OPT;
 		if (opt & SKIP_LABELLING)
@@ -605,7 +612,132 @@ class grid_dist_id_comm
 		}
 	}
 
+	template<typename mem, typename header_type,unsigned ... prp>
+	void unpack_data_to_ext_ghost_with_header(ExtPreAlloc<mem> & emem,
+									openfpm::vector<device_grid> & loc_grid,
+									header_type & headers,
+									size_t i,
+									const openfpm::vector<ep_box_grid<dim>> & eg_box,
+									const std::unordered_map<size_t,size_t> & g_id_to_external_ghost_box,
+									const openfpm::vector<e_box_multi<dim>> & eb_gid_list,
+									Unpack_stat & ps,
+									size_t opt)
+	{
+		// Unpack the ghost box global-id
+
+		size_t g_id;
+		// we move from device to host the gid
+		g_id = headers.template get<0>(i);
+		ps.addOffset(sizeof(size_t));
+
+		size_t l_id = 0;
+		// convert the global id into local id
+		auto key = g_id_to_external_ghost_box.find(g_id);
+
+		if (key != g_id_to_external_ghost_box.end()) // FOUND
+		{l_id = key->second;}
+		else
+		{
+			// NOT FOUND
+
+			// It must be always found, if not it mean that the processor has no-idea of
+			// what is stored and conseguently do not know how to unpack, print a critical error
+			// and return
+
+			std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " Critical, cannot unpack object, because received data cannot be interpreted\n";
+
+			return;
+		}
+
+
+		// we unpack into the last eb_gid_list that is always big enought to
+		// unpack the information
+
+		size_t le_id = eb_gid_list.get(l_id).full_match;
+		size_t ei =	eb_gid_list.get(l_id).e_id;
+
+		// Get the external ghost box associated with the packed information
+		Box<dim,long int> box = eg_box.get(ei).bid.get(le_id).l_e_box;
+		size_t sub_id = eg_box.get(ei).bid.get(le_id).sub;
+
+		// sub-grid where to unpack
+		auto sub2 = loc_grid.get(sub_id).getIterator(box.getKP1(),box.getKP2(),false);
+
+		rem_copy_opt opt_ = rem_copy_opt::NONE_OPT;
+		if (opt & SKIP_LABELLING)
+		{opt_ = rem_copy_opt::KEEP_GEOMETRY;}
 
+		// Unpack
+		loc_grid.get(sub_id).remove(box);
+		Unpacker<device_grid,mem>::template unpack_with_header<decltype(sub2),decltype(headers),decltype(v_cl.getmgpuContext()),prp...>
+																				(emem,
+																				sub2,
+																				loc_grid.get(sub_id),
+																				headers,
+																				i,
+																				ps,
+																				v_cl.getmgpuContext(),
+																				opt_);
+
+		// Copy the information on the other grid
+		for (long int j = 0 ; j < (long int)eb_gid_list.get(l_id).eb_list.size() ; j++)
+		{
+			size_t nle_id = eb_gid_list.get(l_id).eb_list.get(j);
+			if (nle_id != le_id)
+			{
+//				size_t nle_id = eb_gid_list.get(l_id).eb_list.get(j);
+				size_t n_sub_id = eg_box.get(ei).bid.get(nle_id).sub;
+
+				Box<dim,long int> box = eg_box.get(ei).bid.get(nle_id).l_e_box;
+				Box<dim,long int> rbox = eg_box.get(ei).bid.get(nle_id).lr_e_box;
+
+				loc_grid.get(n_sub_id).remove(box);
+				loc_grid.get(n_sub_id).copy_to(loc_grid.get(sub_id),rbox,box);
+			}
+		}
+	}
+
+	template<unsigned int ... prp>
+	void fill_headers(size_t opt)
+	{
+		if ((opt & KEEP_PROPERTIES) == 0 && device_grid::is_unpack_header_supported())
+		{
+			headers.resize(n_headers_slot * recv_buffers.size());
+
+			Memory result;
+			result.allocate(sizeof(int));
+
+			pointers_h.resize(recv_buffers.size());
+
+			for ( size_t i = 0 ; i < recv_buffers.size() ; i++ )
+			{
+				pointers_h.template get<0>(i) = recv_buffers.get(i).getDevicePointer();
+				pointers_h.template get<1>(i) = (unsigned char *)recv_buffers.get(i).getDevicePointer() + recv_buffers.get(i).size();
+			}
+
+			pointers_h.template hostToDevice<0,1>();
+
+			while(1)
+			{
+				for ( size_t i = 0 ; i < recv_buffers.size() ; i++ )
+				{pointers_h.template get<2>(i) = 0;}
+				pointers_h.template hostToDevice<2>();
+				*(int *)result.getPointer() = 0;
+				result.hostToDevice();
+
+				device_grid::template unpack_headers<decltype(pointers_h),decltype(headers),decltype(result),prp ...>(pointers_h,headers,result,n_headers_slot);
+				result.deviceToHost();
+
+				if (*(int *)result.getPointer() == 0) {break;}
+
+				n_headers_slot *= 2;
+				headers.resize(n_headers_slot * recv_buffers.size());
+
+			}
+
+			headers.template deviceToHost<0,1,2>();
+		}
+	}
 
 	template<unsigned ... prp>
 	void merge_received_data_get(openfpm::vector<device_grid> & loc_grid,
@@ -642,22 +774,52 @@ class grid_dist_id_comm
 		}
 		else
 		{
-			// Unpack the object
-			for ( size_t i = 0 ; i < recv_buffers.size() ; i++ )
+			fill_headers<prp ...>(opt);
+
+			if (headers.size() != 0)
 			{
-				Unpack_stat ps;
-				size_t mark_here = ps.getOffset();
+				// Unpack the object
+				for ( size_t i = 0 ; i < recv_buffers.size() ; i++ )
+				{
+					Unpack_stat ps;
+					size_t mark_here = ps.getOffset();
 
-				ExtPreAlloc<BMemory<Memory>> mem(recv_buffers.get(i).size(),recv_buffers.get(i));
+					ExtPreAlloc<BMemory<Memory>> mem(recv_buffers.get(i).size(),recv_buffers.get(i));
 
-				// for each external ghost box
-				while (ps.getOffset() - mark_here < recv_buffers.get(i).size())
+					int j = 0;
+
+					// for each external ghost box
+					while (ps.getOffset() - mark_here < recv_buffers.get(i).size())
+					{
+						// Unpack the ghost box global-id
+
+						unpack_data_to_ext_ghost_with_header<BMemory<Memory>,decltype(headers),prp ...>(mem,loc_grid,headers,i*n_headers_slot+j,
+																	eg_box,g_id_to_external_ghost_box,eb_gid_list,
+																	ps,opt);
+
+						j++;
+					}
+				}
+			}
+			else
+			{
+				// Unpack the object
+				for ( size_t i = 0 ; i < recv_buffers.size() ; i++ )
 				{
-					// Unpack the ghost box global-id
+					Unpack_stat ps;
+					size_t mark_here = ps.getOffset();
 
-					unpack_data_to_ext_ghost<BMemory<Memory>,prp ...>(mem,loc_grid,i,
-																eg_box,g_id_to_external_ghost_box,eb_gid_list,
-																ps,opt);
+					ExtPreAlloc<BMemory<Memory>> mem(recv_buffers.get(i).size(),recv_buffers.get(i));
+
+					// for each external ghost box
+					while (ps.getOffset() - mark_here < recv_buffers.get(i).size())
+					{
+						// Unpack the ghost box global-id
+
+						unpack_data_to_ext_ghost<BMemory<Memory>,prp ...>(mem,loc_grid,i,
+																	eg_box,g_id_to_external_ghost_box,eb_gid_list,
+																	ps,opt);
+					}
 				}
 			}
 		}
@@ -1060,6 +1222,11 @@ public:
 		for (int i = 0 ; i < loc_grid.size() ; i++)
 		{opt &= (loc_grid.get(i).isSkipLabellingPossible())?(int)-1:~SKIP_LABELLING;}
 
+		#ifdef ENABLE_GRID_DIST_ID_PERF_STATS
+		timer packing_time;
+		packing_time.start();
+		#endif
+
 		if (!(opt & SKIP_LABELLING))
 		{
 			// first we initialize the pack buffer on all internal grids
@@ -1086,7 +1253,7 @@ public:
 					// Pack a size_t for the internal ghost id
 					Packer<size_t,Memory>::packRequest(req);
 					// Create a sub grid iterator spanning the internal ghost layer
-					auto sub_it = loc_grid.get(sub_id).getIterator(g_ig_box.getKP1(),g_ig_box.getKP2());
+					auto sub_it = loc_grid.get(sub_id).getIterator(g_ig_box.getKP1(),g_ig_box.getKP2(),false);
 
 					// get the size to pack
 					Packer<device_grid,Memory>::template packRequest<decltype(sub_it),prp...>(loc_grid.get(sub_id),sub_it,req);
@@ -1142,7 +1309,7 @@ public:
 					Packer<size_t,Memory>::pack(prAlloc_prp,g_id,sts);
 					prAlloc_prp.hostToDevice(prAlloc_prp.getOffset(),prAlloc_prp.getOffsetEnd());
 					// Create a sub grid iterator spanning the internal ghost layer
-					auto sub_it = loc_grid.get(sub_id).getIterator(g_ig_box.getKP1(),g_ig_box.getKP2());
+					auto sub_it = loc_grid.get(sub_id).getIterator(g_ig_box.getKP1(),g_ig_box.getKP2(),false);
 					// and pack the internal ghost grid
 					Packer<device_grid,Memory>::template pack<decltype(sub_it),prp...>(prAlloc_prp,loc_grid.get(sub_id),sub_it,sts);
 				}
@@ -1192,6 +1359,13 @@ public:
 			delete &prAlloc_prp;
 		}
 
+		#ifdef ENABLE_GRID_DIST_ID_PERF_STATS
+		packing_time.stop();
+		tot_pack += packing_time.getwct();
+		timer sendrecv_time;
+		sendrecv_time.start();
+		#endif
+
 		for ( size_t i = 0 ; i < ig_box.size() ; i++ )
 		{
 			// This function send (or queue for sending) the information
@@ -1210,8 +1384,22 @@ public:
 
 		queue_recv_data_get<prp_object>(eg_box,prp_recv,prRecv_prp);
 
+		#ifdef ENABLE_GRID_DIST_ID_PERF_STATS
+		sendrecv_time.stop();
+		tot_sendrecv += sendrecv_time.getwct();
+		timer merge_loc_time;
+		merge_loc_time.start();
+		#endif
+
 		ghost_get_local<prp...>(loc_ig_box,loc_eg_box,gdb_ext,loc_grid,g_id_to_external_ghost_box,ginfo,use_bx_def,opt);
 
+		#ifdef ENABLE_GRID_DIST_ID_PERF_STATS
+		merge_loc_time.stop();
+		tot_loc_merge += merge_loc_time.getwct();
+		timer merge_time;
+		merge_time.start();
+		#endif
+
 		for (size_t i = 0 ; i < loc_grid.size() ; i++)
 		{loc_grid.get(i).removeAddUnpackReset();}
 
@@ -1224,6 +1412,11 @@ public:
 		for (size_t i = 0 ; i < loc_grid.size() ; i++)
 		{loc_grid.get(i).template removeAddUnpackFinalize<prp ...>(v_cl.getmgpuContext(),opt_);}
 
+		#ifdef ENABLE_GRID_DIST_ID_PERF_STATS
+		merge_time.stop();
+		tot_merge += merge_time.getwct();
+		#endif
+
 		prRecv_prp.decRef();
 		delete &prRecv_prp;
 	}
diff --git a/src/Grid/grid_dist_util.hpp b/src/Grid/grid_dist_util.hpp
index b254b6fd7c163bf5243a5eed0338d7c25cd630bd..508764d50e6557e2394f2295b8990d0c574c98f2 100644
--- a/src/Grid/grid_dist_util.hpp
+++ b/src/Grid/grid_dist_util.hpp
@@ -138,13 +138,13 @@ inline void create_gdb_ext(openfpm::vector<GBoxes<Decomposition::dims>> & gdb_ex
 		SpaceBox<Decomposition::dims,long int> sp_t = cd_sm.convertDomainSpaceIntoGridUnits(sp,dec.periodicity());
 		SpaceBox<Decomposition::dims,long int> sp_tg = cd_sm.convertDomainSpaceIntoGridUnits(sp_g,dec.periodicity());
 
-/*		for (size_t i = 0 ; i < Decomposition::dims ; i++)
+		for (size_t i = 0 ; i < Decomposition::dims ; i++)
 		{
 			if (sp_t.getLow(i) < sp_tg.getLow(i))
 			{sp_tg.setLow(i,sp_t.getLow(i));}
 			if (sp_t.getHigh(i) > sp_tg.getHigh(i))
 			{sp_tg.setHigh(i,sp_t.getHigh(i));}
-		}*/
+		}
 
 		if (use_bx_def == true)
 		{
diff --git a/src/Grid/tests/grid_dist_id_HDF5_chckpnt_restart_test.cpp b/src/Grid/tests/grid_dist_id_HDF5_chckpnt_restart_test.cpp
index 1174ad725b88e71adc2d2b7d11ba644c0af15a09..3c6fd85e8108d6ebb2a0ca0951396dab0afa3e79 100644
--- a/src/Grid/tests/grid_dist_id_HDF5_chckpnt_restart_test.cpp
+++ b/src/Grid/tests/grid_dist_id_HDF5_chckpnt_restart_test.cpp
@@ -145,6 +145,85 @@ BOOST_AUTO_TEST_CASE( grid_dist_id_hdf5_load_test )
 }
 
 
+BOOST_AUTO_TEST_CASE( grid_dist_id_hdf5_copy_test )
+{
+
+	// Input data
+	size_t k = 2400;
+
+	float ghost_part = 0.01;
+
+	// Domain
+	Box<2,float> domain({0.0,0.0},{1.0,1.0});
+
+	Vcluster<> & v_cl = create_vcluster();
+
+	// Skip this test on big scale
+	if (v_cl.getProcessingUnits() >= 32)
+		return;
+
+	// grid size
+	size_t sz[2];
+	sz[0] = k;
+	sz[1] = k;
+
+	// Ghost
+	Ghost<2,float> g(ghost_part);
+
+	// Distributed grid with id decomposition
+	grid_dist_id<2, float, aggregate<float>, CartDecomposition<2,float>> g_dist(sz,domain,g);
+	grid_dist_id<2, float, aggregate<float>, CartDecomposition<2,float>> g_dist_copy(g_dist.getDecomposition(),sz,g);
+
+	g_dist.load("test_data/test_data_three.h5");
+
+	// Copy
+
+	auto dom_sc = g_dist.getDomainIterator();
+    auto dom_ds = g_dist_copy.getDomainIterator();
+    while (dom_sc.isNext())
+    {
+        auto key_sc = dom_sc.get();
+        auto key_ds = dom_ds.get();
+        g_dist_copy.template get<0>(key_ds) = g_dist.template get<0>(key_sc);
+        ++dom_sc;
+        ++dom_ds;
+    }
+
+
+	auto it = g_dist_copy.getDomainIterator();
+
+	size_t count = 0;
+
+	bool match = true;
+	while (it.isNext())
+	{
+		//key
+		auto key = it.get();
+
+		//BOOST_CHECK_CLOSE(g_dist.template get<0>(key),1,0.0001);
+		//std::cout << "Element: " << g_dist.template get<0>(key) << std::endl;
+
+		auto keyg = g_dist_copy.getGKey(key);
+
+		match &= g_dist_copy.template get<0>(key) == keyg.get(0);
+
+		++it;
+		count++;
+	}
+
+	openfpm::vector<size_t> count_total;
+	v_cl.allGather(count,count_total);
+	v_cl.execute();
+
+	size_t sum = 0;
+
+	for (size_t i = 0; i < count_total.size(); i++)
+		sum += count_total.get(i);
+
+	BOOST_REQUIRE_EQUAL(sum, (size_t)k*k);
+	BOOST_REQUIRE_EQUAL(match,true);
+}
+
 BOOST_AUTO_TEST_CASE( grid_dist_id_hdf5_load_test_diff_proc )
 {
 
diff --git a/src/Grid/tests/grid_dist_id_unit_test.cpp b/src/Grid/tests/grid_dist_id_unit_test.cpp
index d31a763ae0fdd69a190b43234280d1e93dfbb049..a6d4061e0c2be72e30cb247cf2c761c23f508915 100644
--- a/src/Grid/tests/grid_dist_id_unit_test.cpp
+++ b/src/Grid/tests/grid_dist_id_unit_test.cpp
@@ -1313,7 +1313,7 @@ void Test3D_periodic_put(const Box<3,float> & domain, long int k)
 		periodicity<3> pr = {{PERIODIC,PERIODIC,PERIODIC}};
 
 		// Distributed grid with id decomposition
-		grid_dist_id<3, float, aggregate<long int>, CartDecomposition<3,float>> g_dist(sz,domain,g,pr);
+		grid_dist_id<3, float, aggregate<long int,double>, CartDecomposition<3,float>> g_dist(sz,domain,g,pr);
 
 		// check the consistency of the decomposition
 		bool val = g_dist.getDecomposition().check_consistency();
@@ -1332,6 +1332,7 @@ void Test3D_periodic_put(const Box<3,float> & domain, long int k)
 			auto key = dom.get();
 
 			g_dist.template get<0>(key) = -6.0;
+			g_dist.template get<1>(key) = -6.0;
 
 			// Count the points
 			count++;
@@ -1356,6 +1357,14 @@ void Test3D_periodic_put(const Box<3,float> & domain, long int k)
 			g_dist.template get<0>(key.move(2,1)) += 1.0;
 			g_dist.template get<0>(key.move(2,-1)) += 1.0;
 
+
+			g_dist.template get<1>(key.move(0,1)) += 1.0;
+			g_dist.template get<1>(key.move(0,-1)) += 1.0;
+			g_dist.template get<1>(key.move(1,1)) += 1.0;
+			g_dist.template get<1>(key.move(1,-1)) += 1.0;
+			g_dist.template get<1>(key.move(2,1)) += 1.0;
+			g_dist.template get<1>(key.move(2,-1)) += 1.0;
+
 			++dom;
 		}
 		}
@@ -1375,12 +1384,14 @@ void Test3D_periodic_put(const Box<3,float> & domain, long int k)
 		}
 
 		g_dist.ghost_put<add_,0>();
+		g_dist.ghost_put<add_,1>();
+
 
 		if (count != 0)
 			BOOST_REQUIRE_EQUAL(correct, false);
 
 		// sync the ghosts
-		g_dist.ghost_get<0>();
+		g_dist.ghost_get<0,1>();
 
 		correct = true;
 
@@ -1392,6 +1403,7 @@ void Test3D_periodic_put(const Box<3,float> & domain, long int k)
 			auto key = dom_gi2.get();
 
 			correct &= (g_dist.template get<0>(key) == 0);
+			correct &= (g_dist.template get<1>(key) == 0);
 
 			++dom_gi2;
 		}
@@ -2397,51 +2409,258 @@ BOOST_AUTO_TEST_CASE( grid_dist_domain_ghost_3D_put_create_check )
 	TestXD_ghost_put_create(sg_dist3,k);
 }
 
+
 BOOST_AUTO_TEST_CASE( grid_dist_ghost_zero_size )
+{
+        // Test grid periodic
+
+        Box<3,double> domain({0,0,0},{365.376,365.376,102});
+
+        Vcluster<> & v_cl = create_vcluster();
+
+        if ( v_cl.getProcessingUnits() > 32 )
+        {return;}
+
+        BOOST_TEST_CHECKPOINT( "Testing grid zero ghost");
+
+        // grid size
+        size_t sz[3];
+        sz[0] = 53;
+        sz[1] = 53;
+        sz[2] = 10;
+
+        // Ghost
+        Ghost<3,long int> g(0);
+
+        // periodicity
+        periodicity<3> pr = {{NON_PERIODIC,NON_PERIODIC,NON_PERIODIC}};
+
+        // Distributed grid with id decomposition
+        grid_dist_id<3, double, aggregate<long int, int>> g_dist(sz,domain,g,pr);
+
+        auto it = g_dist.getDomainIterator();
+
+        size_t count = 0;
+
+        while (it.isNext())
+        {
+                auto k = it.get();
+
+                ++count;
+
+                ++it;
+        }
+
+        v_cl.sum(count);
+        v_cl.execute();
+
+        BOOST_REQUIRE_EQUAL(count,53*53*10);
+}
+
+
+BOOST_AUTO_TEST_CASE(grid_dist_id_smb_write_out_1_proc)
 {
 	// Test grid periodic
+	{
+		Box<2,float> domain({-1.0,-1.0,-1.0},{1.0,1.0,1.0});
 
-	Box<3,float> domain({-1.0,-1.0,-1.0},{1.0,1.0,1.0});
+		Vcluster<> & v_cl = create_vcluster();
 
-	Vcluster<> & v_cl = create_vcluster();
+		if ( v_cl.getProcessingUnits() > 1 )
+		{return;}
 
-	if ( v_cl.getProcessingUnits() > 32 )
-	{return;}
+		// grid size
+		size_t sz[2];
+		sz[0] = 16;
+		sz[1] = 16;
 
-	BOOST_TEST_CHECKPOINT( "Testing grid zero ghost");
+		// Ghost
+		Ghost<2,long int> g(0);
 
-	// grid size
-	size_t sz[3];
-	sz[0] = 32;
-	sz[1] = 32;
-	sz[2] = 32;
+		// periodicity
+		periodicity<2> pr = {{NON_PERIODIC,NON_PERIODIC}};
 
-	// Ghost
-	Ghost<3,long int> g(0);
+		typedef grid_cpu<2, aggregate<int>, grid_smb<2,4> > devg; 
 
-	// periodicity
-	periodicity<3> pr = {{NON_PERIODIC,NON_PERIODIC,NON_PERIODIC}};
+		// Distributed grid with id decomposition
+		grid_dist_id_devg<2, float, aggregate<int>,devg> g_smb(sz,domain,g,pr);
 
-	// Distributed grid with id decomposition
-	grid_dist_id<3, float, aggregate<long int, int>> g_dist(sz,domain,g,pr);
+		auto it = g_smb.getDomainIterator();
 
-	auto it = g_dist.getDomainIterator();
+		size_t count = 0;
 
-	size_t count = 0;
+		unsigned char * base = (unsigned char *)g_smb.get_loc_grid(0).getPointer<0>();
 
-	while (it.isNext())
+		while (it.isNext())
+		{
+			auto k = it.get();
+
+			g_smb.template getProp<0>(k) = (unsigned char *)&g_smb.template getProp<0>(k) - base;
+
+			++count;
+
+			++it;
+		}
+
+		v_cl.sum(count);
+		v_cl.execute();
+
+		BOOST_REQUIRE_EQUAL(count,16*16);
+
+		g_smb.write("g_smb_out");
+	}
+}
+
+BOOST_AUTO_TEST_CASE(grid_dist_id_zmb_write_out_1_proc)
+{
 	{
-		auto k = it.get();
+		// Test grid periodic
 
-		++count;
+		Box<2,float> domain({-1.0,-1.0,-1.0},{1.0,1.0,1.0});
 
-		++it;
+		Vcluster<> & v_cl = create_vcluster();
+
+		if ( v_cl.getProcessingUnits() > 1 )
+		{return;}
+
+		// grid size
+		size_t sz[2];
+		sz[0] = 16;
+		sz[1] = 16;
+
+		// Ghost
+		Ghost<2,long int> g(0);
+
+		// periodicity
+		periodicity<2> pr = {{NON_PERIODIC,NON_PERIODIC}};
+
+		typedef grid_cpu<2, aggregate<int>, grid_zmb<2,4,long int> > devg; 
+
+		// Distributed grid with id decomposition
+		grid_dist_id_devg<2, float, aggregate<int>,devg> g_smb(sz,domain,g,pr);
+
+		auto it = g_smb.getDomainIterator();
+
+		size_t count = 0;
+
+		unsigned char * base = (unsigned char *)g_smb.get_loc_grid(0).getPointer<0>();
+
+		while (it.isNext())
+		{
+			auto k = it.get();
+
+			g_smb.template getProp<0>(k) = (unsigned char *)&g_smb.template getProp<0>(k) - base;
+
+			++count;
+
+			++it;
+		}
+
+		v_cl.sum(count);
+		v_cl.execute();
+
+		BOOST_REQUIRE_EQUAL(count,16*16);
+
+		g_smb.write("g_zmb_out");
 	}
 
-	v_cl.sum(count);
-	v_cl.execute();
+	{
+		Box<2,float> domain({-1.0,-1.0,-1.0},{1.0,1.0,1.0});
+
+		Vcluster<> & v_cl = create_vcluster();
+
+		if ( v_cl.getProcessingUnits() > 1 )
+		{return;}
+
+		// grid size
+		size_t sz[2];
+		sz[0] = 16;
+		sz[1] = 16;
+
+		// Ghost
+		Ghost<2,long int> g(0);
+
+		// periodicity
+		periodicity<2> pr = {{NON_PERIODIC,NON_PERIODIC}};
+
+		typedef grid_cpu<2, aggregate<int>, grid_zm<2,void> > devg; 
+
+		// Distributed grid with id decomposition
+		grid_dist_id_devg<2, float, aggregate<int>,devg> g_smb(sz,domain,g,pr);
+
+		auto it = g_smb.getDomainIterator();
+
+		size_t count = 0;
+
+		unsigned char * base = (unsigned char *)g_smb.get_loc_grid(0).getPointer<0>();
+
+		while (it.isNext())
+		{
+			auto k = it.get();
+
+			g_smb.template getProp<0>(k) = (unsigned char *)&g_smb.template getProp<0>(k) - base;
 
-	BOOST_REQUIRE_EQUAL(count,32*32*32);
+			++count;
+
+			++it;
+		}
+
+		v_cl.sum(count);
+		v_cl.execute();
+
+		BOOST_REQUIRE_EQUAL(count,16*16);
+
+		g_smb.write("g_zm_out");
+	}
+
+	{
+		Box<2,float> domain({-1.0,-1.0,-1.0},{1.0,1.0,1.0});
+
+		Vcluster<> & v_cl = create_vcluster();
+
+		if ( v_cl.getProcessingUnits() > 1 )
+		{return;}
+
+		// grid size
+		size_t sz[2];
+		sz[0] = 16;
+		sz[1] = 16;
+
+		// Ghost
+		Ghost<2,long int> g(0);
+
+		// periodicity
+		periodicity<2> pr = {{NON_PERIODIC,NON_PERIODIC}};
+
+		typedef grid_base<2, aggregate<int>> devg; 
+
+		// Distributed grid with id decomposition
+		grid_dist_id_devg<2, float, aggregate<int>,devg> g_smb(sz,domain,g,pr);
+
+		auto it = g_smb.getDomainIterator();
+
+		size_t count = 0;
+
+		unsigned char * base = (unsigned char *)g_smb.get_loc_grid(0).getPointer<0>();
+
+		while (it.isNext())
+		{
+			auto k = it.get();
+
+			g_smb.template getProp<0>(k) = (unsigned char *)&g_smb.template getProp<0>(k) - base;
+
+			++count;
+
+			++it;
+		}
+
+		v_cl.sum(count);
+		v_cl.execute();
+
+		BOOST_REQUIRE_EQUAL(count,16*16);
+
+		g_smb.write("g_sm_out");
+	}
 }
 
 BOOST_AUTO_TEST_CASE( grid_dist_copy_construct )
diff --git a/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu b/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
index 5a221b6b18574225952f5602b49d959e0fe36efb..e26743dfb8c54d8b0e91ae85fc3c4fe67d5fa636 100644
--- a/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
+++ b/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
@@ -1,5 +1,7 @@
+#include <ostream>
 #define BOOST_TEST_DYN_LINK
 
+#include "config.h"
 #include <boost/test/unit_test.hpp>
 #include "Grid/grid_dist_id.hpp"
 
@@ -15,7 +17,8 @@ struct insert_kernel2D
 
 	    sg.init();
 
-	    sg.template insert<p>(key) = c + keyg.get(0) + keyg.get(1);
+		if (inactive == false)
+	    {sg.template insert<p>(key) = c + keyg.get(0) + keyg.get(1);}
 
 	    __syncthreads();
 
@@ -33,7 +36,8 @@ struct insert_kernel3D
 
 	    sg.init();
 
-	    sg.template insert<p>(key) = c + keyg.get(0) + keyg.get(1) + keyg.get(2);
+		if (inactive == false)
+	    {sg.template insert<p>(key) = c + keyg.get(0) + keyg.get(1) + keyg.get(2);}
 
 	    __syncthreads();
 
@@ -41,15 +45,6 @@ struct insert_kernel3D
 	}
 };
 
-template<unsigned int p>
-struct stencil_kernel
-{
-	template<typename SparseGridGpu_type>
-	__device__ void operator()(SparseGridGpu_type & sg, ite_gpu<SparseGridGpu_type::d> & ite, float c)
-	{
-		// TODO
-	}
-};
 
 BOOST_AUTO_TEST_CASE( sgrid_gpu_test_base )
 {
@@ -212,12 +207,10 @@ void sgrid_ghost_get(size_t (& sz)[2],size_t (& sz2)[2])
 	gdist.template flush<smax_<0>>(flush_type::FLUSH_ON_DEVICE);
 
 	gdist.template deviceToHost<0>();
-	gdist.write_debug("before_ghost");
 
 	gdist.template ghost_get<0>(RUN_ON_DEVICE);
 
 	gdist.template deviceToHost<0>();
-	gdist.write_debug("after_ghost");
 
 	// Now we check that ghost is correct
 
@@ -297,6 +290,8 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_ghost_get )
 	size_t sz6[2] = {15,15};
 	sgrid_ghost_get(sz,sz6);
 
+	return;
+
 	size_t sz2[2] = {170,170};
 	size_t sz3[2] = {15,15};
 	sgrid_ghost_get(sz2,sz3);
@@ -306,6 +301,90 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_ghost_get )
 }
 
 
+BOOST_AUTO_TEST_CASE( sgrid_gpu_app_point_test )
+{
+	size_t sz[3] = {75,75,75};
+	periodicity<3> bc = {PERIODIC,PERIODIC,PERIODIC};
+
+	Ghost<3,long int> g(1);
+
+	Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
+
+	sgrid_dist_id_gpu<3,float,aggregate<float,float,float,float>> gdist(sz,domain,g,bc);
+
+	gdist.template setBackgroundValue<0>(666);
+	gdist.template setBackgroundValue<1>(666);
+	gdist.template setBackgroundValue<2>(666);
+	gdist.template setBackgroundValue<3>(666);
+
+	/////// GPU insert + flush
+
+	Box<3,size_t> box({1,1,1},{sz[0],sz[1],sz[2]});
+
+	/////// GPU Run kernel
+
+	float c = 5.0;
+
+	typedef typename GetAddBlockType<decltype(gdist)>::type InsertBlockT;
+
+	CudaMemory cmem;
+	cmem.allocate(sizeof(int));
+	CudaMemory cmem_out;
+	cmem_out.allocate(sizeof(int));
+
+	*(int *)cmem.getPointer() = 0.0;
+	*(int *)cmem_out.getPointer() = 0.0;
+
+	cmem.hostToDevice();
+	cmem_out.hostToDevice();
+
+	int * cnt = (int *)cmem.getDevicePointer();
+	int * cnt_out = (int *)cmem_out.getDevicePointer();
+
+	Box<3,size_t> bx({23,23,23},{70,70,70});
+
+	gdist.addPoints(bx.getKP1(),bx.getKP2(),
+			        [cnt,cnt_out,bx] __device__ (int i, int j, int k)
+			        {
+						Point<3,int> p({i,j,k});
+
+						if (bx.isInside(p))
+						{atomicAdd(cnt,1);}
+						else
+						{
+							printf("%d %d %d \n",i,j,k);
+							atomicAdd(cnt_out,1);
+						}
+
+						return true;
+			        },
+			        [c] __device__ (InsertBlockT & data, int i, int j, int k)
+			        {
+			        	data.template get<0>() = c + i + j;
+			        	data.template get<1>() = c + 1000 + i + j;
+			        }
+			        );
+
+	gdist.template flush<smax_<0>,smax_<1>>(flush_type::FLUSH_ON_DEVICE);
+	gdist.template ghost_get<0,1>(RUN_ON_DEVICE);
+
+	cmem.deviceToHost();
+	cmem_out.deviceToHost();
+
+	int cnt_host = *(int *)cmem.getPointer();
+	int cnt_host_out = *(int *)cmem_out.getPointer();
+
+	auto & v_cl = create_vcluster();
+
+	v_cl.sum(cnt_host_out);
+	v_cl.sum(cnt_host);
+	v_cl.execute();
+
+	BOOST_REQUIRE_EQUAL(cnt_host_out,0);
+	BOOST_REQUIRE_EQUAL(cnt_host,bx.getVolumeKey());
+}
+
+
 BOOST_AUTO_TEST_CASE( sgrid_gpu_test_conv2_test )
 {
 	size_t sz[2] = {164,164};
@@ -389,15 +468,17 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_conv2_test )
 		++it3;
 	}
 
-	gdist.write("SGRID");
-
 	BOOST_REQUIRE_EQUAL(match,true);
 }
 
 
 BOOST_AUTO_TEST_CASE( sgrid_gpu_test_conv2_test_3d )
 {
+	#ifdef CUDA_ON_CPU
+	size_t sz[3] = {20,20,20};
+	#else
 	size_t sz[3] = {60,60,60};
+	#endif
 	periodicity<3> bc = {PERIODIC,PERIODIC,PERIODIC};
 
 	Ghost<3,long int> g(1);
diff --git a/src/Grid/tests/sgrid_dist_id_unit_tests.cpp b/src/Grid/tests/sgrid_dist_id_unit_tests.cpp
index 4ebcdebabbb69a44c4a566ecb8ab78184a346176..1757dfa31ba7e6d3cb917d04cad2e9f95ad93c16 100644
--- a/src/Grid/tests/sgrid_dist_id_unit_tests.cpp
+++ b/src/Grid/tests/sgrid_dist_id_unit_tests.cpp
@@ -161,8 +161,6 @@ BOOST_AUTO_TEST_CASE( sgrid_dist_id_basic_test_2D)
 		++it;
 	}
 
-	sg.write("sg_test_write");
-
 	bool match = true;
 	auto it2 = sg.getGridIterator();
 
@@ -614,7 +612,7 @@ BOOST_AUTO_TEST_CASE( sparse_grid_fast_stencil_vectorized_simplified_conv2_cross
 
     auto func = [uFactor,vFactor,deltaT,F,K](Vc::double_v & u_out,Vc::double_v & v_out,
     															Vc::double_v & u,Vc::double_v & v,
-                                                                cross_stencil_v & us,cross_stencil_v & vs,
+                                                                cross_stencil_v<double> & us,cross_stencil_v<double> & vs,
                                                                 unsigned char * mask){
 
 																														 u_out = u + uFactor *(us.xm + us.xp +
@@ -691,4 +689,250 @@ BOOST_AUTO_TEST_CASE( sparse_grid_fast_stencil_vectorized_simplified_conv2_cross
     BOOST_REQUIRE_EQUAL(match,true);
 }
 
+BOOST_AUTO_TEST_CASE( sparse_grid_fast_stencil_vectorized_simplified_conv2_crossing_float)
+{
+	constexpr int U = 0;
+	constexpr int V = 1;
+
+	constexpr int U_next = 2;
+	constexpr int V_next = 3;
+
+	constexpr int x = 0;
+	constexpr int y = 1;
+	constexpr int z = 2;
+
+    Box<3,float> domain({0.0,0.0,0.0},{2.5,2.5,2.5});
+
+    // grid size
+    size_t sz[3] = {32,32,32};
+
+    // Define periodicity of the grid
+    periodicity<3> bc = {PERIODIC,PERIODIC,PERIODIC};
+
+    // Ghost in grid unit
+    Ghost<3,long int> g(1);
+
+    // deltaT
+    float deltaT = 1;
+
+    // Diffusion constant for specie U
+    float du = 2*1e-5;
+
+    // Diffusion constant for specie V
+    float dv = 1*1e-5;
+
+    // Number of timesteps
+    size_t timeSteps = 5000;
+
+    // K and F (Physical constant in the equation)
+    float K = 0.053;
+    float F = 0.014;
+
+    sgrid_dist_soa<3, float, aggregate<float,float,float,float>> grid(sz,domain,g,bc);
+
+    auto it = grid.getGridIterator();
+
+    while (it.isNext())
+    {
+            // Get the local grid key
+            auto key = it.get_dist();
+
+            // Old values U and V
+            grid.template insert<U>(key) = 1.0;
+            grid.template insert<V>(key) = 0.0;
+
+            // Old values U and V
+            grid.template insert<U_next>(key) = 0.0;
+            grid.template insert<V_next>(key) = 0.0;
+
+            ++it;
+    }
+
+    long int x_start = grid.size(0)*1.55f/domain.getHigh(0);
+    long int y_start = grid.size(1)*1.55f/domain.getHigh(1);
+    long int z_start = grid.size(1)*1.55f/domain.getHigh(2);
+
+    long int x_stop = grid.size(0)*1.85f/domain.getHigh(0);
+    long int y_stop = grid.size(1)*1.85f/domain.getHigh(1);
+    long int z_stop = grid.size(1)*1.85f/domain.getHigh(2);
+
+    grid_key_dx<3> start({x_start,y_start,z_start});
+    grid_key_dx<3> stop ({x_stop,y_stop,z_stop});
+    auto it_init = grid.getGridIterator(start,stop);
+
+    while (it_init.isNext())
+    {
+            auto key = it_init.get_dist();
+
+            grid.template insert<U>(key) = 0.5 + (((float)std::rand())/RAND_MAX -0.5)/10.0;
+            grid.template insert<V>(key) = 0.25 + (((float)std::rand())/RAND_MAX -0.5)/20.0;
+
+            ++it_init;
+    }
+
+    // spacing of the grid on x and y
+    float spacing[3] = {grid.spacing(0),grid.spacing(1),grid.spacing(2)};
+    // sync the ghost
+    size_t count = 0;
+    grid.template ghost_get<U,V>();
+
+    // because we assume that spacing[x] == spacing[y] we use formula 2
+    // and we calculate the prefactor of Eq 2
+    float uFactor = deltaT * du/(spacing[x]*spacing[x]);
+    float vFactor = deltaT * dv/(spacing[x]*spacing[x]);
+
+
+     //! \cond [stencil get and use] \endcond
+
+
+    auto func = [uFactor,vFactor,deltaT,F,K](Vc::float_v & u_out,Vc::float_v & v_out,
+    															Vc::float_v & u,Vc::float_v & v,
+                                                                cross_stencil_v<float> & us,cross_stencil_v<float> & vs,
+                                                                unsigned char * mask){
+
+																														 u_out = u + uFactor *(us.xm + us.xp +
+																																 	           us.ym + us.yp +
+																																			   us.zm + us.zp - 6.0f*u) - deltaT * u*v*v
+																																									- deltaT * F * (u - 1.0f);
+
+																														 v_out = v + vFactor *(vs.xm + vs.xp +
+																																	  	  	   vs.ym + vs.yp +
+																																			   vs.zm + vs.zp - 6.0f*v) + deltaT * u*v*v
+																																									- deltaT * (F+K) * v;
+                                                                                     };
+
+    grid.conv_cross2<U,V,U_next,V_next,1>({0,0,0},{(long int)sz[0]-1,(long int)sz[1]-1,(long int)sz[2]-1},func);
+    grid.conv_cross2<U,V,U_next,V_next,1>({0,0,0},{(long int)sz[0]-1,(long int)sz[1]-1,(long int)sz[2]-1},func);
+
+    bool match = true;
+
+    {
+		auto it = grid.getDomainIterator();
+
+		float max_U = 0.0;
+		float max_V = 0.0;
+		grid_dist_key_dx<3> k_max;
+		while (it.isNext())
+		{
+			// center point
+			auto Cp = it.get();
+
+			// plus,minus X,Y,Z
+			auto mx = Cp.move(0,-1);
+			auto px = Cp.move(0,+1);
+			auto my = Cp.move(1,-1);
+			auto py = Cp.move(1,1);
+			auto mz = Cp.move(2,-1);
+			auto pz = Cp.move(2,1);
+
+			// update based on Eq 2
+			if ( fabs(grid.get<U>(Cp) + uFactor * (
+																	grid.get<U>(mz) +
+																	grid.get<U>(pz) +
+																	grid.get<U>(my) +
+																	grid.get<U>(py) +
+																	grid.get<U>(mx) +
+																	grid.get<U>(px) -
+																	6.0*grid.get<U>(Cp)) +
+																	- deltaT * grid.get<U>(Cp) * grid.get<V>(Cp) * grid.get<V>(Cp) +
+																	- deltaT * F * (grid.get<U>(Cp) - 1.0) - grid.get<U_next>(Cp)) > 0.00001 )
+			{
+				match = false;
+				break;
+			}
+
+			// update based on Eq 2
+			if ( fabs(grid.get<V>(Cp) + vFactor * (
+																	grid.get<V>(mz) +
+																	grid.get<V>(pz) +
+																	grid.get<V>(my) +
+																	grid.get<V>(py) +
+																	grid.get<V>(mx) +
+																	grid.get<V>(px) -
+																	6*grid.get<V>(Cp)) +
+																	deltaT * grid.get<U>(Cp) * grid.get<V>(Cp) * grid.get<V>(Cp) +
+																	- deltaT * (F+K) * grid.get<V>(Cp) - grid.get<V_next>(Cp)) > 0.00001 )
+			{
+				match = false;
+				break;
+			}
+
+			++it;
+		}
+    }
+
+    BOOST_REQUIRE_EQUAL(match,true);
+}
+
+
+BOOST_AUTO_TEST_CASE (sgrid_dist_id_soa_write )
+{
+	periodicity<3> bc = {PERIODIC, PERIODIC, PERIODIC};
+
+	auto & v_cl = create_vcluster<>();
+
+	if (v_cl.size() > 16)
+	{return;}
+
+	// Domain
+	Box<3,double> domain({-0.3,-0.3,-0.3},{1.0,1.0,1.0});
+
+	// grid size
+	size_t sz[3];
+	sz[0] = 256;
+	sz[1] = 256;
+	sz[2] = 256;
+
+	// Ghost
+	Ghost<3,long int> g(1);
+
+	sgrid_dist_soa<3,double,aggregate<double,double[3]>> sg1(sz,domain,g,bc);
+	sgrid_dist_id<3,double,aggregate<double,double[3]>> sg2(sg1.getDecomposition(),sz,g);
+
+	// create a grid iterator over a bilion point
+
+	auto it = sg1.getGridIterator();
+
+	while(it.isNext())
+	{
+		auto gkey = it.get();
+		auto key = it.get_dist();
+
+		size_t sx = gkey.get(0) - 128;
+		size_t sy = gkey.get(1) - 128;
+		size_t sz = gkey.get(2) - 128;
+
+		if (sx*sx + sy*sy + sz*sz < 32*32)
+		{
+			sg1.template insert<0>(key) = 1.0;
+			sg1.template insert<1>(key)[0] = gkey.get(0);
+			sg1.template insert<1>(key)[1] = gkey.get(1);
+			sg1.template insert<1>(key)[2] = gkey.get(2);
+
+			sg2.template insert<0>(key) = 1.0;
+			sg2.template insert<1>(key)[0] = gkey.get(0);
+			sg2.template insert<1>(key)[1] = gkey.get(1);
+			sg2.template insert<1>(key)[2] = gkey.get(2);
+		}
+
+		++it;
+	}
+
+	sg1.write("sg1_test");
+	sg2.write("sg2_test");
+
+	bool test = compare("sg1_test_" + std::to_string(v_cl.rank()) + ".vtk","sg2_test_" + std::to_string(v_cl.rank()) + ".vtk");
+	BOOST_REQUIRE_EQUAL(true,test);
+
+	sg1.save("hdf5_w1_test");
+	sg2.save("hdf5_w2_test");
+
+	// To uncomment and check
+//	sgrid_dist_soa<3,double,aggregate<double,double[3]>> sg1_(sz,domain,g,bc);
+//	sgrid_dist_id<3,double,aggregate<double,double[3]>> sg2_(sg1.getDecomposition(),sz,g);
+
+//	sg1.load("hdf5_w1_test");
+//	sg2.load("hdf5_w2_test");
+}
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/SubdomainGraphNodes.hpp b/src/SubdomainGraphNodes.hpp
index ec60446f05bad61df3b6f403da70c3ffc40e45a5..140b4a0854ea2f71b0951d9a98093e9091053d4d 100755
--- a/src/SubdomainGraphNodes.hpp
+++ b/src/SubdomainGraphNodes.hpp
@@ -3,7 +3,7 @@
 
 #include <boost/fusion/container/vector.hpp>
 #include <boost/fusion/include/at_c.hpp>
-#include "Grid/Encap.hpp"
+#include "memory_ly/Encap.hpp"
 
 /* In a decomposition graph each node represent a sub-domain while an edge represent
  * an interaction between sub-domain (it mean that they have to communicate).
diff --git a/src/Vector/cuda/vector_dist_comm_util_funcs.cuh b/src/Vector/cuda/vector_dist_comm_util_funcs.cuh
index cc1175f8ca48a9e905eceb5fe30e8bdfabb985bf..b1322c099d0fddd6dad79cbdd2a6394877a47efb 100644
--- a/src/Vector/cuda/vector_dist_comm_util_funcs.cuh
+++ b/src/Vector/cuda/vector_dist_comm_util_funcs.cuh
@@ -28,22 +28,19 @@ struct labelParticlesGhost_impl
 					Decomposition & dec,
 					openfpm::vector<aggregate<unsigned int,unsigned long int>,
 							CudaMemory,
-							typename memory_traits_inte<aggregate<unsigned int,unsigned long int>>::type,
 							memory_traits_inte> & g_opart_device,
 				    openfpm::vector<aggregate<unsigned int>,
 				                            Memory,
-				                            typename layout_base<aggregate<unsigned int>>::type,
 				                            layout_base> & proc_id_out,
 				    openfpm::vector<aggregate<unsigned int>,
 				                             Memory,
-				                             typename layout_base<aggregate<unsigned int>>::type,
 				                             layout_base> & starts,
 		            Vcluster<Memory> & v_cl,
-					openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
-            		openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
+					openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
+            		openfpm::vector<prop,Memory,layout_base> & v_prp,
             		openfpm::vector<size_t> & prc,
             		openfpm::vector<size_t> & prc_sz,
-            		openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & prc_offset,
+            		openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_offset,
             		size_t & g_m,
             		size_t opt)
 	{
@@ -60,22 +57,19 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,tru
 					Decomposition & dec,
 					openfpm::vector<aggregate<unsigned int,unsigned long int>,
 							CudaMemory,
-							typename memory_traits_inte<aggregate<unsigned int,unsigned long int>>::type,
 							memory_traits_inte> & g_opart_device,
 				    openfpm::vector<aggregate<unsigned int>,
 				                            Memory,
-				                            typename layout_base<aggregate<unsigned int>>::type,
 				                            layout_base> & proc_id_out,
 				    openfpm::vector<aggregate<unsigned int>,
 				                             Memory,
-				                             typename layout_base<aggregate<unsigned int>>::type,
 				                             layout_base> & starts,
 					Vcluster<Memory> & v_cl,
-					openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
-            		openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
+					openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
+            		openfpm::vector<prop,Memory,layout_base> & v_prp,
             		openfpm::vector<size_t> & prc,
             		openfpm::vector<size_t> & prc_sz,
-            		openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & prc_offset,
+            		openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_offset,
             		size_t & g_m,
             		size_t opt)
 	{
@@ -118,7 +112,7 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,tru
 			dec.toKernel(),v_pos.toKernel(),starts.toKernel(),g_opart_device.toKernel());
 
 			// sort particles
-			mergesort((int *)g_opart_device.template getDeviceBuffer<0>(),(long unsigned int *)g_opart_device.template getDeviceBuffer<1>(), g_opart_device.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());
+			openfpm::sort((int *)g_opart_device.template getDeviceBuffer<0>(),(long unsigned int *)g_opart_device.template getDeviceBuffer<1>(), g_opart_device.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());
 
 			mem.allocate(sizeof(int));
 			mem.fill(0);
@@ -153,7 +147,7 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,tru
 			prc_offset.template hostToDevice<0,1>(prc_offset.size()-1,prc_offset.size()-1);
 
 			// Here we reorder the offsets in ascending order
-			mergesort((int *)prc_offset.template getDeviceBuffer<0>(),(int *)prc_offset.template getDeviceBuffer<1>(), prc_offset.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());
+			openfpm::sort((int *)prc_offset.template getDeviceBuffer<0>(),(int *)prc_offset.template getDeviceBuffer<1>(), prc_offset.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());
 
 			prc_offset.template deviceToHost<0,1>();
 
@@ -185,10 +179,10 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,tru
 template<bool with_pos,unsigned int dim, typename St,  typename prop, typename Memory, template <typename> class layout_base, bool is_ok_cuda>
 struct local_ghost_from_opart_impl
 {
-	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & o_part_loc,
-					const openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & shifts,
-					openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
-            		openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
+	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
+					const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts,
+					openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
+            		openfpm::vector<prop,Memory,layout_base> & v_prp,
             		size_t opt)
 	{
 		std::cout << __FILE__ << ":" << __LINE__ << " error, you are trying to use using Cuda functions for a non cuda enabled data-structures" << std::endl;
@@ -198,10 +192,10 @@ struct local_ghost_from_opart_impl
 template<bool with_pos, unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base>
 struct local_ghost_from_opart_impl<with_pos,dim,St,prop,Memory,layout_base,true>
 {
-	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & o_part_loc,
-					const openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & shifts,
-					openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
-            		openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
+	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
+					const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts,
+					openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
+            		openfpm::vector<prop,Memory,layout_base> & v_prp,
             		size_t opt)
 	{
 #if defined(CUDA_GPU) && defined(__NVCC__)
@@ -234,14 +228,14 @@ struct local_ghost_from_opart_impl<with_pos,dim,St,prop,Memory,layout_base,true>
 template<unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base, bool is_ok_cuda>
 struct local_ghost_from_dec_impl
 {
-	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & o_part_loc,
-					const openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & shifts,
-					openfpm::vector<Box<dim, St>,Memory,typename layout_base<Box<dim,St>>::type,layout_base> & box_f_dev,
-					openfpm::vector<aggregate<unsigned int>,Memory,typename layout_base<aggregate<unsigned int>>::type,layout_base> & box_f_sv,
+	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
+					const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts,
+					openfpm::vector<Box<dim, St>,Memory,layout_base> & box_f_dev,
+					openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & box_f_sv,
 					Vcluster<Memory> & v_cl,
-					openfpm::vector<aggregate<unsigned int>,Memory,typename layout_base<aggregate<unsigned int>>::type,layout_base> & starts,
-					openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
-            		openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
+					openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & starts,
+					openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
+            		openfpm::vector<prop,Memory,layout_base> & v_prp,
             		size_t & g_m,
             		size_t opt)
 	{
@@ -253,14 +247,14 @@ struct local_ghost_from_dec_impl
 template<unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base>
 struct local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,true>
 {
-	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & o_part_loc,
-					const openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & shifts,
-					openfpm::vector<Box<dim, St>,Memory,typename layout_base<Box<dim,St>>::type,layout_base> & box_f_dev,
-					openfpm::vector<aggregate<unsigned int>,Memory,typename layout_base<aggregate<unsigned int>>::type,layout_base> & box_f_sv,
+	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc,
+					const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts,
+					openfpm::vector<Box<dim, St>,Memory,layout_base> & box_f_dev,
+					openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & box_f_sv,
 					Vcluster<Memory> & v_cl,
-					openfpm::vector<aggregate<unsigned int>,Memory,typename layout_base<aggregate<unsigned int>>::type,layout_base> & starts,
-					openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
-            		openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
+					openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & starts,
+					openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
+            		openfpm::vector<prop,Memory,layout_base> & v_prp,
             		size_t & g_m,
             		size_t opt)
 	{
diff --git a/src/Vector/cuda/vector_dist_cuda_func_test.cu b/src/Vector/cuda/vector_dist_cuda_func_test.cu
index fc9c28eea927344c47d5c289823a429e10f159ab..b0f330f6aa6c4bde9204b051ee6984fa8a5017e7 100644
--- a/src/Vector/cuda/vector_dist_cuda_func_test.cu
+++ b/src/Vector/cuda/vector_dist_cuda_func_test.cu
@@ -8,8 +8,7 @@
 #include "Vector/cuda/vector_dist_cuda_funcs.cuh"
 #include "Vector/util/vector_dist_funcs.hpp"
 #include "Decomposition/CartDecomposition.hpp"
-#include "util/cuda/scan_cuda.cuh"
-#include "util/cuda/moderngpu/kernel_scan.hxx"
+//#include "util/cuda/scan_cuda.cuh"
 #include "Vector/vector_dist.hpp"
 #include "util/cuda/scan_ofp.cuh"
 
@@ -31,9 +30,9 @@ BOOST_AUTO_TEST_CASE( vector_ghost_process_local_particles )
 
 	for (size_t i = 0 ; i < v_prp.size() ; i++)
 	{
-		v_pos.template get<0>(i)[0] = (float)rand()/RAND_MAX;
-		v_pos.template get<0>(i)[1] = (float)rand()/RAND_MAX;
-		v_pos.template get<0>(i)[2] = (float)rand()/RAND_MAX;
+		v_pos.template get<0>(i)[0] = (float)rand()/(float)RAND_MAX;
+		v_pos.template get<0>(i)[1] = (float)rand()/(float)RAND_MAX;
+		v_pos.template get<0>(i)[2] = (float)rand()/(float)RAND_MAX;
 
 		v_prp.template get<0>(i) = i+12345;
 
@@ -104,9 +103,9 @@ BOOST_AUTO_TEST_CASE( vector_ghost_process_local_particles )
 	v_prp.hostToDevice<0,1,2>();
 
 	// label particle processor
-	num_shift_ghost_each_part<3,float,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())>
-	<<<ite.wthr,ite.thr>>>
-	(box_f_dev.toKernel(),box_f_sv.toKernel(),v_pos.toKernel(),o_part_loc.toKernel(),v_pos.size());
+	CUDA_LAUNCH_DIM3((num_shift_ghost_each_part<3,float,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())>),
+	ite.wthr,ite.thr,
+	box_f_dev.toKernel(),box_f_sv.toKernel(),v_pos.toKernel(),o_part_loc.toKernel(),v_pos.size());
 
 	o_part_loc.deviceToHost<0>();
 
@@ -137,7 +136,7 @@ BOOST_AUTO_TEST_CASE( vector_ghost_process_local_particles )
 	starts.deviceToHost<0>(starts.size()-1,starts.size()-1);
 	size_t tot = starts.template get<0>(o_part_loc.size()-1);
 
-	openfpm::vector<Point<3,float>,CudaMemory,typename memory_traits_inte<Point<3,float>>::type,memory_traits_inte> shifts;
+	openfpm::vector<Point<3,float>,CudaMemory,memory_traits_inte> shifts;
 
 	shifts.resize(4);
 
@@ -165,12 +164,12 @@ BOOST_AUTO_TEST_CASE( vector_ghost_process_local_particles )
 	openfpm::vector_gpu<aggregate<unsigned int,unsigned int>> o_part_loc2;
 	o_part_loc2.resize(tot);
 
-	shift_ghost_each_part<3,float,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),
+	CUDA_LAUNCH_DIM3((shift_ghost_each_part<3,float,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),
 			                     decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),
 			                     decltype(starts.toKernel()),decltype(shifts.toKernel()),
-			                     decltype(o_part_loc2.toKernel())>
-	<<<ite.wthr,ite.thr>>>
-	(box_f_dev.toKernel(),box_f_sv.toKernel(),
+			                     decltype(o_part_loc2.toKernel())>),
+	ite.wthr,ite.thr,
+	box_f_dev.toKernel(),box_f_sv.toKernel(),
 	 v_pos.toKernel(),v_prp.toKernel(),
 	 starts.toKernel(),shifts.toKernel(),o_part_loc2.toKernel(),old,old);
 
@@ -356,9 +355,9 @@ BOOST_AUTO_TEST_CASE( vector_ghost_process_local_particles )
 
 	ite = o_part_loc2.getGPUIterator();
 
-	process_ghost_particles_local<true,3,decltype(o_part_loc2.toKernel()),decltype(v_pos2.toKernel()),decltype(v_prp2.toKernel()),decltype(shifts.toKernel())>
-	<<<ite.wthr,ite.thr>>>
-	(o_part_loc2.toKernel(),v_pos2.toKernel(),v_prp2.toKernel(),shifts.toKernel(),old);
+	CUDA_LAUNCH_DIM3((process_ghost_particles_local<true,3,decltype(o_part_loc2.toKernel()),decltype(v_pos2.toKernel()),decltype(v_prp2.toKernel()),decltype(shifts.toKernel())>),
+	ite.wthr,ite.thr,
+	o_part_loc2.toKernel(),v_pos2.toKernel(),v_prp2.toKernel(),shifts.toKernel(),old);
 
 	v_pos2.template deviceToHost<0>();
 	v_prp2.template deviceToHost<0,1,2>();
@@ -397,7 +396,7 @@ BOOST_AUTO_TEST_CASE( vector_ghost_fill_send_buffer_test )
 	typedef object<typename object_creator<typename prop::type, 0,1,2>::type> prp_object;
 
 	// send vector for each processor
-	typedef openfpm::vector<prp_object,CudaMemory,typename memory_traits_inte<prp_object>::type,memory_traits_inte> send_vector;
+	typedef openfpm::vector<prp_object,CudaMemory,memory_traits_inte> send_vector;
 
 	openfpm::vector<send_vector> g_send_prp;
 
@@ -462,9 +461,9 @@ BOOST_AUTO_TEST_CASE( vector_ghost_fill_send_buffer_test )
 
 		auto ite = g_send_prp.get(i).getGPUIterator();
 
-		process_ghost_particles_prp<decltype(g_opart_device.toKernel()),decltype(g_send_prp.get(i).toKernel()),decltype(v_prp.toKernel()),0,1,2>
-		<<<ite.wthr,ite.thr>>>
-		(g_opart_device.toKernel(), g_send_prp.get(i).toKernel(),
+		CUDA_LAUNCH_DIM3((process_ghost_particles_prp<decltype(g_opart_device.toKernel()),decltype(g_send_prp.get(i).toKernel()),decltype(v_prp.toKernel()),0,1,2>),
+		ite.wthr,ite.thr,
+		g_opart_device.toKernel(), g_send_prp.get(i).toKernel(),
 		 v_prp.toKernel(),offset);
 
 		offset += g_send_prp.get(i).size();
@@ -561,9 +560,9 @@ BOOST_AUTO_TEST_CASE( decomposition_ie_ghost_gpu_test_use )
 
 		for (size_t j = 0 ; j < n_part ; j++)
 		{
-			vg.template get<0>(k*n_part+j)[0] = (sp.getHigh(0) - sp.getLow(0))*((float)rand()/RAND_MAX) + sp.getLow(0);
-			vg.template get<0>(k*n_part+j)[1] = (sp.getHigh(1) - sp.getLow(1))*((float)rand()/RAND_MAX) + sp.getLow(1);
-			vg.template get<0>(k*n_part+j)[2] = (sp.getHigh(2) - sp.getLow(2))*((float)rand()/RAND_MAX) + sp.getLow(2);
+			vg.template get<0>(k*n_part+j)[0] = (sp.getHigh(0) - sp.getLow(0))*((float)rand()/(float)RAND_MAX) + sp.getLow(0);
+			vg.template get<0>(k*n_part+j)[1] = (sp.getHigh(1) - sp.getLow(1))*((float)rand()/(float)RAND_MAX) + sp.getLow(1);
+			vg.template get<0>(k*n_part+j)[2] = (sp.getHigh(2) - sp.getLow(2))*((float)rand()/(float)RAND_MAX) + sp.getLow(2);
 		}
 	}
 
@@ -578,9 +577,9 @@ BOOST_AUTO_TEST_CASE( decomposition_ie_ghost_gpu_test_use )
 	proc_id_out.template get<0>(proc_id_out.size()-1) = 0;
 	proc_id_out.template hostToDevice(proc_id_out.size()-1,proc_id_out.size()-1);
 
-	num_proc_ghost_each_part<3,float,decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(proc_id_out.toKernel())>
-	<<<ite.wthr,ite.thr>>>
-	(dec.toKernel(),vg.toKernel(),proc_id_out.toKernel());
+	CUDA_LAUNCH_DIM3((num_proc_ghost_each_part<3,float,decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(proc_id_out.toKernel())>),
+	ite.wthr,ite.thr,
+	dec.toKernel(),vg.toKernel(),proc_id_out.toKernel());
 
 	proc_id_out.deviceToHost<0>();
 
@@ -603,7 +602,6 @@ BOOST_AUTO_TEST_CASE( decomposition_ie_ghost_gpu_test_use )
 
     openfpm::vector<aggregate<unsigned int>,
                     CudaMemory,
-                    typename memory_traits_inte<aggregate<unsigned int>>::type,
                     memory_traits_inte> starts;
 
     starts.resize(proc_id_out.size());
@@ -618,7 +616,6 @@ BOOST_AUTO_TEST_CASE( decomposition_ie_ghost_gpu_test_use )
 
     openfpm::vector<aggregate<unsigned int,long unsigned int>,
                     CudaMemory,
-                    typename memory_traits_inte<aggregate<unsigned int,long unsigned int>>::type,
                     memory_traits_inte> output;
 
     output.resize(sz);
@@ -626,9 +623,9 @@ BOOST_AUTO_TEST_CASE( decomposition_ie_ghost_gpu_test_use )
 	ite = vg.getGPUIterator();
 
 	// we compute processor id for each particle
-	proc_label_id_ghost<3,float,decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(starts.toKernel()),decltype(output.toKernel())>
-	<<<ite.wthr,ite.thr>>>
-	(dec.toKernel(),vg.toKernel(),starts.toKernel(),output.toKernel());
+	CUDA_LAUNCH_DIM3((proc_label_id_ghost<3,float,decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(starts.toKernel()),decltype(output.toKernel())>),
+	ite.wthr,ite.thr,
+	dec.toKernel(),vg.toKernel(),starts.toKernel(),output.toKernel());
 
 	output.template deviceToHost<0,1>();
 
@@ -724,9 +721,9 @@ BOOST_AUTO_TEST_CASE( decomposition_to_gpu_test_use )
 
 	for (size_t i = 0 ; i < 10000 ; i++)
 	{
-		vg.template get<0>(i)[0] = (float)rand()/RAND_MAX;
-		vg.template get<0>(i)[1] = (float)rand()/RAND_MAX;
-		vg.template get<0>(i)[2] = (float)rand()/RAND_MAX;
+		vg.template get<0>(i)[0] = (float)rand()/(float)RAND_MAX;
+		vg.template get<0>(i)[1] = (float)rand()/(float)RAND_MAX;
+		vg.template get<0>(i)[2] = (float)rand()/(float)RAND_MAX;
 	}
 
 	vg.hostToDevice<0>();
@@ -739,14 +736,14 @@ BOOST_AUTO_TEST_CASE( decomposition_to_gpu_test_use )
 	proc_id_out.resize(vg.size());
 
 	openfpm::vector_gpu<aggregate<int,int,int>> dev_counter;
-	dev_counter.resize(10);
+	dev_counter.resize(v_cl.size());
 	dev_counter.fill<0>(0);
 	dev_counter.fill<1>(0);
 	dev_counter.fill<2>(0);
 
-	process_id_proc_each_part<3,float,decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(proc_id_out.toKernel()),decltype(dev_counter.toKernel())>
-	<<<ite.wthr,ite.thr>>>
-	(dec.toKernel(),vg.toKernel(),proc_id_out.toKernel(),dev_counter.toKernel(),v_cl.rank());
+	CUDA_LAUNCH_DIM3((process_id_proc_each_part<3,float,decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(proc_id_out.toKernel()),decltype(dev_counter.toKernel())>),
+	ite.wthr,ite.thr,
+	dec.toKernel(),vg.toKernel(),proc_id_out.toKernel(),dev_counter.toKernel(),v_cl.rank());
 
 
 	proc_id_out.deviceToHost<0>();
@@ -834,7 +831,7 @@ BOOST_AUTO_TEST_CASE(vector_dist_reorder_lbl)
 
 	auto ite = lbl_p.getGPUIterator();
 
-	reorder_lbl<decltype(lbl_p.toKernel()),decltype(starts.toKernel())><<<ite.wthr,ite.thr>>>(lbl_p.toKernel(),starts.toKernel());
+	CUDA_LAUNCH_DIM3((reorder_lbl<decltype(lbl_p.toKernel()),decltype(starts.toKernel())>),ite.wthr,ite.thr,lbl_p.toKernel(),starts.toKernel());
 
 	starts.template deviceToHost<0>();
 	lbl_p.template deviceToHost<0,1,2>();
@@ -909,7 +906,7 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort)
 
 	auto ite = v_pos.getGPUIterator();
 
-	merge_sort_part<false,decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(ns_to_s.toKernel()),0><<<ite.wthr,ite.thr>>>(v_pos.toKernel(),v_prp.toKernel(),
+	CUDA_LAUNCH_DIM3((merge_sort_part<false,decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(ns_to_s.toKernel()),0>),ite.wthr,ite.thr,v_pos.toKernel(),v_prp.toKernel(),
 																								 v_pos_out.toKernel(),v_prp_out.toKernel(),
 																								 ns_to_s.toKernel());
 
@@ -933,7 +930,7 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort)
 
 	BOOST_REQUIRE_EQUAL(match,true);
 
-	merge_sort_part<false,decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(ns_to_s.toKernel()),1,2><<<ite.wthr,ite.thr>>>(v_pos.toKernel(),v_prp.toKernel(),
+	CUDA_LAUNCH_DIM3((merge_sort_part<false,decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(ns_to_s.toKernel()),1,2>),ite.wthr,ite.thr,v_pos.toKernel(),v_prp.toKernel(),
 																								 v_pos_out.toKernel(),v_prp_out.toKernel(),
 																								 ns_to_s.toKernel());
 
@@ -961,7 +958,7 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort)
 
 	BOOST_REQUIRE_EQUAL(match,true);
 
-	merge_sort_part<true,decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(ns_to_s.toKernel())><<<ite.wthr,ite.thr>>>(v_pos.toKernel(),v_prp.toKernel(),
+	CUDA_LAUNCH_DIM3((merge_sort_part<true,decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(ns_to_s.toKernel())>),ite.wthr,ite.thr,v_pos.toKernel(),v_prp.toKernel(),
 																								 v_pos_out.toKernel(),v_prp_out.toKernel(),
 																								 ns_to_s.toKernel());
 
@@ -997,8 +994,8 @@ BOOST_AUTO_TEST_CASE(vector_dist_gpu_map_fill_send_buffer_test)
 {
 	openfpm::vector_gpu<aggregate<int,int>> m_opart;
 
-    openfpm::vector<openfpm::vector<Point<3,float>,CudaMemory,typename memory_traits_inte<Point<3,float>>::type,memory_traits_inte,openfpm::grow_policy_identity>> m_pos;
-    openfpm::vector<openfpm::vector<aggregate<float,float[2],float[3][3]>,CudaMemory,typename memory_traits_inte<aggregate<float,float[2],float[3][3]>>::type,memory_traits_inte,openfpm::grow_policy_identity>> m_prp;
+    openfpm::vector<openfpm::vector<Point<3,float>,CudaMemory,memory_traits_inte,openfpm::grow_policy_identity>> m_pos;
+    openfpm::vector<openfpm::vector<aggregate<float,float[2],float[3][3]>,CudaMemory,memory_traits_inte,openfpm::grow_policy_identity>> m_prp;
 
     openfpm::vector_gpu<Point<3,float>> v_pos;
     openfpm::vector_gpu<aggregate<float,float[2],float[3][3]>> v_prp;
@@ -1011,22 +1008,22 @@ BOOST_AUTO_TEST_CASE(vector_dist_gpu_map_fill_send_buffer_test)
 
     for (size_t i = 0 ; i < v_pos.size() ; i++)
     {
-    	v_pos.template get<0>(i)[0] = (float)rand()/RAND_MAX;
-    	v_pos.template get<0>(i)[1] = (float)rand()/RAND_MAX;
-    	v_pos.template get<0>(i)[2] = (float)rand()/RAND_MAX;
-
-    	v_prp.template get<0>(i) = 5.0 + (float)rand()/RAND_MAX;
-    	v_prp.template get<1>(i)[0] = 10.0 + (float)rand()/RAND_MAX;
-    	v_prp.template get<1>(i)[1] = 11.0 + (float)rand()/RAND_MAX;
-    	v_prp.template get<2>(i)[0][0] = 40.0 + (float)rand()/RAND_MAX;
-    	v_prp.template get<2>(i)[0][1] = 50.0 + (float)rand()/RAND_MAX;
-    	v_prp.template get<2>(i)[0][2] = 60.0 + (float)rand()/RAND_MAX;
-    	v_prp.template get<2>(i)[1][0] = 70.0 + (float)rand()/RAND_MAX;
-    	v_prp.template get<2>(i)[1][1] = 80.0 + (float)rand()/RAND_MAX;
-    	v_prp.template get<2>(i)[1][2] = 150.0 + (float)rand()/RAND_MAX;
-    	v_prp.template get<2>(i)[2][0] = 160.0 + (float)rand()/RAND_MAX;
-    	v_prp.template get<2>(i)[2][1] = 170.0 + (float)rand()/RAND_MAX;
-    	v_prp.template get<2>(i)[2][2] = 340.0 + (float)rand()/RAND_MAX;
+    	v_pos.template get<0>(i)[0] = (float)rand()/(float)RAND_MAX;
+    	v_pos.template get<0>(i)[1] = (float)rand()/(float)RAND_MAX;
+    	v_pos.template get<0>(i)[2] = (float)rand()/(float)RAND_MAX;
+
+    	v_prp.template get<0>(i) = 5.0 + (float)rand()/(float)RAND_MAX;
+    	v_prp.template get<1>(i)[0] = 10.0 + (float)rand()/(float)RAND_MAX;
+    	v_prp.template get<1>(i)[1] = 11.0 + (float)rand()/(float)RAND_MAX;
+    	v_prp.template get<2>(i)[0][0] = 40.0 + (float)rand()/(float)RAND_MAX;
+    	v_prp.template get<2>(i)[0][1] = 50.0 + (float)rand()/(float)RAND_MAX;
+    	v_prp.template get<2>(i)[0][2] = 60.0 + (float)rand()/(float)RAND_MAX;
+    	v_prp.template get<2>(i)[1][0] = 70.0 + (float)rand()/(float)RAND_MAX;
+    	v_prp.template get<2>(i)[1][1] = 80.0 + (float)rand()/(float)RAND_MAX;
+    	v_prp.template get<2>(i)[1][2] = 150.0 + (float)rand()/(float)RAND_MAX;
+    	v_prp.template get<2>(i)[2][0] = 160.0 + (float)rand()/(float)RAND_MAX;
+    	v_prp.template get<2>(i)[2][1] = 170.0 + (float)rand()/(float)RAND_MAX;
+    	v_prp.template get<2>(i)[2][2] = 340.0 + (float)rand()/(float)RAND_MAX;
 
     	int seg = i / 10000;
     	m_opart.template get<1>(i) = seg;
@@ -1051,10 +1048,10 @@ BOOST_AUTO_TEST_CASE(vector_dist_gpu_map_fill_send_buffer_test)
     {
     	auto ite = m_pos.get(i).getGPUIterator();
 
-		process_map_particles<decltype(m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()),
-																		   decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>
-						<<<ite.wthr,ite.thr>>>
-						(m_opart.toKernel(),m_pos.get(i).toKernel(), m_prp.get(i).toKernel(),
+		CUDA_LAUNCH_DIM3((process_map_particles<decltype(m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()),
+																		   decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>),
+						ite.wthr,ite.thr,
+						m_opart.toKernel(),m_pos.get(i).toKernel(), m_prp.get(i).toKernel(),
 											v_pos.toKernel(),v_prp.toKernel(),offset);
 
 		m_pos.get(i).deviceToHost<0>();
@@ -1116,9 +1113,9 @@ void vector_dist_remove_marked_type()
 	{
 		auto p = it.get();
 
-		vd.getPos(p)[0] = (float)rand() / RAND_MAX;
-		vd.getPos(p)[1] = (float)rand() / RAND_MAX;
-		vd.getPos(p)[2] = (float)rand() / RAND_MAX;
+		vd.getPos(p)[0] = (float)rand() / (float)RAND_MAX;
+		vd.getPos(p)[1] = (float)rand() / (float)RAND_MAX;
+		vd.getPos(p)[2] = (float)rand() / (float)RAND_MAX;
 
 		++it;
 	}
@@ -1272,7 +1269,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_particle_NN_MP_iteration_gpu )
 	// Distributed vector
 	vector_dist_gpu<3,float,part_prop> vd(k,box,bc,ghost,BIND_DEC_TO_GHOST);
 
-	size_t start = vd.init_size_accum(k);
+/*	size_t start = vd.init_size_accum(k);
 
 	auto it = vd.getIterator();
 
@@ -1504,7 +1501,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_particle_NN_MP_iteration_gpu )
 		++p_it3;
 	}
 
-	BOOST_REQUIRE_EQUAL(ret,true);
+	BOOST_REQUIRE_EQUAL(ret,true);*/
 }
 
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/Vector/cuda/vector_dist_cuda_funcs.cuh b/src/Vector/cuda/vector_dist_cuda_funcs.cuh
index c5a19ccc3daa580a5d0093b30281d6acbde71ca1..78c4c9c5c72a508c9d2c6716643dc714c3e97e5a 100644
--- a/src/Vector/cuda/vector_dist_cuda_funcs.cuh
+++ b/src/Vector/cuda/vector_dist_cuda_funcs.cuh
@@ -9,12 +9,11 @@
 #define VECTOR_DIST_CUDA_FUNCS_CUH_
 
 #include "Vector/util/vector_dist_funcs.hpp"
-#include "util/cuda/moderngpu/kernel_reduce.hxx"
-#include "util/cuda/moderngpu/kernel_scan.hxx"
 #include "Decomposition/common.hpp"
 #include "lib/pdata.hpp"
 #include "util/cuda/kernels.cuh"
 #include "util/cuda/scan_ofp.cuh"
+#include "util/cuda/reduce_ofp.cuh"
 #include "memory/CudaMemory.cuh"
 
 template<unsigned int dim, typename St, typename decomposition_type, typename vector_type, typename start_type, typename output_type>
@@ -130,6 +129,7 @@ __global__  void process_ghost_particles_prp(vector_g_opart_type g_opart, vector
     process_ghost_device_particle_prp<vector_g_opart_type,vector_prp_type_out,vector_prp_type_in,prp...>(i,offset,g_opart,m_prp,v_prp);
 }
 
+
 template<typename vector_prp_type_out, typename vector_prp_type_in, unsigned int ... prp>
 __global__  void process_ghost_particles_prp_put(vector_prp_type_out m_prp,
 		     	 	 	 	 	 	   	     vector_prp_type_in  v_prp, unsigned int offset)
@@ -291,7 +291,7 @@ auto reduce_local(vector_type & vd) -> typename std::remove_reference<decltype(v
 	CudaMemory mem;
 	mem.allocate(sizeof(reduce_type));
 
-	mgpu::reduce((reduce_type *)vd.getPropVector(). template getDeviceBuffer<prp>(),
+	openfpm::reduce((reduce_type *)vd.getPropVector(). template getDeviceBuffer<prp>(),
 			            vd.size_local(), (reduce_type *)mem.getDevicePointer() ,
 			            op<reduce_type>(), vd.getVC().getmgpuContext());
 
diff --git a/src/Vector/cuda/vector_dist_gpu_MP_tests.cu b/src/Vector/cuda/vector_dist_gpu_MP_tests.cu
index 8d9f7d825cb7de6d47e25967dd6555995f0596dd..e3bcca4cbcddc3f77d2ffe87dc80be53bc9bd464 100644
--- a/src/Vector/cuda/vector_dist_gpu_MP_tests.cu
+++ b/src/Vector/cuda/vector_dist_gpu_MP_tests.cu
@@ -523,25 +523,25 @@ BOOST_AUTO_TEST_CASE( vector_dist_multiphase_kernel_test )
 		phases.template get<0>(2).add();
 		phases.template get<0>(3).add();
 
-		phases.template get<0>(0).getLastPosWrite()[0] = (float)rand() / RAND_MAX;
-		phases.template get<0>(0).getLastPosWrite()[1] = (float)rand() / RAND_MAX;
-		phases.template get<0>(0).getLastPosWrite()[2] = (float)rand() / RAND_MAX;
-		phases.template get<0>(0).getLastPropWrite<0>() = (float)rand() / RAND_MAX;
-
-		phases.template get<0>(1).getLastPosWrite()[0] = (float)rand() / RAND_MAX;
-		phases.template get<0>(1).getLastPosWrite()[1] = (float)rand() / RAND_MAX;
-		phases.template get<0>(1).getLastPosWrite()[2] = (float)rand() / RAND_MAX;
-		phases.template get<0>(1).getLastPropWrite<0>() = (float)rand() / RAND_MAX;
-
-		phases.template get<0>(2).getLastPosWrite()[0] = (float)rand() / RAND_MAX;
-		phases.template get<0>(2).getLastPosWrite()[1] = (float)rand() / RAND_MAX;
-		phases.template get<0>(2).getLastPosWrite()[2] = (float)rand() / RAND_MAX;
-		phases.template get<0>(2).getLastPropWrite<0>() = (float)rand() / RAND_MAX;
-
-		phases.template get<0>(3).getLastPosWrite()[0] = (float)rand() / RAND_MAX;
-		phases.template get<0>(3).getLastPosWrite()[1] = (float)rand() / RAND_MAX;
-		phases.template get<0>(3).getLastPosWrite()[2] = (float)rand() / RAND_MAX;
-		phases.template get<0>(3).getLastPropWrite<0>() = (float)rand() / RAND_MAX;
+		phases.template get<0>(0).getLastPosWrite()[0] = (float)rand() / (float)RAND_MAX;
+		phases.template get<0>(0).getLastPosWrite()[1] = (float)rand() / (float)RAND_MAX;
+		phases.template get<0>(0).getLastPosWrite()[2] = (float)rand() / (float)RAND_MAX;
+		phases.template get<0>(0).getLastPropWrite<0>() = (float)rand() / (float)RAND_MAX;
+
+		phases.template get<0>(1).getLastPosWrite()[0] = (float)rand() / (float)RAND_MAX;
+		phases.template get<0>(1).getLastPosWrite()[1] = (float)rand() / (float)RAND_MAX;
+		phases.template get<0>(1).getLastPosWrite()[2] = (float)rand() / (float)RAND_MAX;
+		phases.template get<0>(1).getLastPropWrite<0>() = (float)rand() / (float)RAND_MAX;
+
+		phases.template get<0>(2).getLastPosWrite()[0] = (float)rand() / (float)RAND_MAX;
+		phases.template get<0>(2).getLastPosWrite()[1] = (float)rand() / (float)RAND_MAX;
+		phases.template get<0>(2).getLastPosWrite()[2] = (float)rand() / (float)RAND_MAX;
+		phases.template get<0>(2).getLastPropWrite<0>() = (float)rand() / (float)RAND_MAX;
+
+		phases.template get<0>(3).getLastPosWrite()[0] = (float)rand() / (float)RAND_MAX;
+		phases.template get<0>(3).getLastPosWrite()[1] = (float)rand() / (float)RAND_MAX;
+		phases.template get<0>(3).getLastPosWrite()[2] = (float)rand() / (float)RAND_MAX;
+		phases.template get<0>(3).getLastPropWrite<0>() = (float)rand() / (float)RAND_MAX;
 	}
 
 	phases.template get<0>(0).hostToDevicePos();
@@ -559,7 +559,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_multiphase_kernel_test )
 	openfpm::vector_gpu<aggregate<float>> output;
 	output.resize(100 * phases.size());
 
-	vdmkt<<<1,1>>>(phases.toKernel(),output.toKernel());
+	CUDA_LAUNCH_DIM3(vdmkt,1,1,phases.toKernel(),output.toKernel());
 
 	output.template deviceToHost<0>();
 
@@ -615,25 +615,25 @@ BOOST_AUTO_TEST_CASE( vector_dist_multiphase_kernel_test_simplified )
 		phases.get(2).add();
 		phases.get(3).add();
 
-		phases.get(0).getLastPosWrite()[0] = (float)rand() / RAND_MAX;
-		phases.get(0).getLastPosWrite()[1] = (float)rand() / RAND_MAX;
-		phases.get(0).getLastPosWrite()[2] = (float)rand() / RAND_MAX;
-		phases.get(0).getLastPropWrite<0>() = (float)rand() / RAND_MAX;
-
-		phases.get(1).getLastPosWrite()[0] = (float)rand() / RAND_MAX;
-		phases.get(1).getLastPosWrite()[1] = (float)rand() / RAND_MAX;
-		phases.get(1).getLastPosWrite()[2] = (float)rand() / RAND_MAX;
-		phases.get(1).getLastPropWrite<0>() = (float)rand() / RAND_MAX;
-
-		phases.get(2).getLastPosWrite()[0] = (float)rand() / RAND_MAX;
-		phases.get(2).getLastPosWrite()[1] = (float)rand() / RAND_MAX;
-		phases.get(2).getLastPosWrite()[2] = (float)rand() / RAND_MAX;
-		phases.get(2).getLastPropWrite<0>() = (float)rand() / RAND_MAX;
-
-		phases.get(3).getLastPosWrite()[0] = (float)rand() / RAND_MAX;
-		phases.get(3).getLastPosWrite()[1] = (float)rand() / RAND_MAX;
-		phases.get(3).getLastPosWrite()[2] = (float)rand() / RAND_MAX;
-		phases.get(3).getLastPropWrite<0>() = (float)rand() / RAND_MAX;
+		phases.get(0).getLastPosWrite()[0] = (float)rand() / (float)RAND_MAX;
+		phases.get(0).getLastPosWrite()[1] = (float)rand() / (float)RAND_MAX;
+		phases.get(0).getLastPosWrite()[2] = (float)rand() / (float)RAND_MAX;
+		phases.get(0).getLastPropWrite<0>() = (float)rand() / (float)RAND_MAX;
+
+		phases.get(1).getLastPosWrite()[0] = (float)rand() / (float)RAND_MAX;
+		phases.get(1).getLastPosWrite()[1] = (float)rand() / (float)RAND_MAX;
+		phases.get(1).getLastPosWrite()[2] = (float)rand() / (float)RAND_MAX;
+		phases.get(1).getLastPropWrite<0>() = (float)rand() / (float)RAND_MAX;
+
+		phases.get(2).getLastPosWrite()[0] = (float)rand() / (float)RAND_MAX;
+		phases.get(2).getLastPosWrite()[1] = (float)rand() / (float)RAND_MAX;
+		phases.get(2).getLastPosWrite()[2] = (float)rand() / (float)RAND_MAX;
+		phases.get(2).getLastPropWrite<0>() = (float)rand() / (float)RAND_MAX;
+
+		phases.get(3).getLastPosWrite()[0] = (float)rand() / (float)RAND_MAX;
+		phases.get(3).getLastPosWrite()[1] = (float)rand() / (float)RAND_MAX;
+		phases.get(3).getLastPosWrite()[2] = (float)rand() / (float)RAND_MAX;
+		phases.get(3).getLastPropWrite<0>() = (float)rand() / (float)RAND_MAX;
 	}
 
 	phases.get(0).hostToDevicePos();
@@ -651,7 +651,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_multiphase_kernel_test_simplified )
 	openfpm::vector_gpu<aggregate<float>> output;
 	output.resize(100 * phases.size());
 
-	vdmkt_simple<<<1,1>>>(phases.toKernel(),output.toKernel());
+	CUDA_LAUNCH_DIM3(vdmkt_simple,1,1,phases.toKernel(),output.toKernel());
 
 	output.template deviceToHost<0>();
 
@@ -706,25 +706,25 @@ BOOST_AUTO_TEST_CASE( vector_dist_multiphase_kernel_cl_test )
 		phases.get(2).add();
 		phases.get(3).add();
 
-		phases.get(0).getLastPosWrite()[0] = (float)rand() / RAND_MAX;
-		phases.get(0).getLastPosWrite()[1] = (float)rand() / RAND_MAX;
-		phases.get(0).getLastPosWrite()[2] = (float)rand() / RAND_MAX;
-		phases.get(0).getLastPropWrite<0>() = (float)rand() / RAND_MAX;
-
-		phases.get(1).getLastPosWrite()[0] = (float)rand() / RAND_MAX;
-		phases.get(1).getLastPosWrite()[1] = (float)rand() / RAND_MAX;
-		phases.get(1).getLastPosWrite()[2] = (float)rand() / RAND_MAX;
-		phases.get(1).getLastPropWrite<0>() = (float)rand() / RAND_MAX;
-
-		phases.get(2).getLastPosWrite()[0] = (float)rand() / RAND_MAX;
-		phases.get(2).getLastPosWrite()[1] = (float)rand() / RAND_MAX;
-		phases.get(2).getLastPosWrite()[2] = (float)rand() / RAND_MAX;
-		phases.get(2).getLastPropWrite<0>() = (float)rand() / RAND_MAX;
-
-		phases.get(3).getLastPosWrite()[0] = (float)rand() / RAND_MAX;
-		phases.get(3).getLastPosWrite()[1] = (float)rand() / RAND_MAX;
-		phases.get(3).getLastPosWrite()[2] = (float)rand() / RAND_MAX;
-		phases.get(3).getLastPropWrite<0>() = (float)rand() / RAND_MAX;
+		phases.get(0).getLastPosWrite()[0] = (float)rand() / (float)RAND_MAX;
+		phases.get(0).getLastPosWrite()[1] = (float)rand() / (float)RAND_MAX;
+		phases.get(0).getLastPosWrite()[2] = (float)rand() / (float)RAND_MAX;
+		phases.get(0).getLastPropWrite<0>() = (float)rand() / (float)RAND_MAX;
+
+		phases.get(1).getLastPosWrite()[0] = (float)rand() / (float)RAND_MAX;
+		phases.get(1).getLastPosWrite()[1] = (float)rand() / (float)RAND_MAX;
+		phases.get(1).getLastPosWrite()[2] = (float)rand() / (float)RAND_MAX;
+		phases.get(1).getLastPropWrite<0>() = (float)rand() / (float)RAND_MAX;
+
+		phases.get(2).getLastPosWrite()[0] = (float)rand() / (float)RAND_MAX;
+		phases.get(2).getLastPosWrite()[1] = (float)rand() / (float)RAND_MAX;
+		phases.get(2).getLastPosWrite()[2] = (float)rand() / (float)RAND_MAX;
+		phases.get(2).getLastPropWrite<0>() = (float)rand() / (float)RAND_MAX;
+
+		phases.get(3).getLastPosWrite()[0] = (float)rand() / (float)RAND_MAX;
+		phases.get(3).getLastPosWrite()[1] = (float)rand() / (float)RAND_MAX;
+		phases.get(3).getLastPosWrite()[2] = (float)rand() / (float)RAND_MAX;
+		phases.get(3).getLastPropWrite<0>() = (float)rand() / (float)RAND_MAX;
 	}
 
 	// redistribute all
@@ -760,7 +760,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_multiphase_kernel_cl_test )
 	output.resize(tot);
 	output2.resize(tot_g);
 
-	vdmkt_simple_cl<<<1,1>>>(phases.toKernel(),output.toKernel(),cl_ph.toKernel(),output2.toKernel());
+	CUDA_LAUNCH_DIM3(vdmkt_simple_cl,1,1,phases.toKernel(),output.toKernel(),cl_ph.toKernel(),output2.toKernel());
 
 	output.template deviceToHost<0>();
 
diff --git a/src/Vector/cuda/vector_dist_gpu_unit_tests.cu b/src/Vector/cuda/vector_dist_gpu_unit_tests.cu
index b70bd00f9be64da890347bf1ecd91379bd1e134d..c8fcec4e295126ecc66e0f7c06a8fe537e68b3ca 100644
--- a/src/Vector/cuda/vector_dist_gpu_unit_tests.cu
+++ b/src/Vector/cuda/vector_dist_gpu_unit_tests.cu
@@ -224,9 +224,9 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_ghost_get )
 	{
 		auto p = it.get();
 
-		vd.getPos(p)[0] = (float)rand() / RAND_MAX;
-		vd.getPos(p)[1] = (float)rand() / RAND_MAX;
-		vd.getPos(p)[2] = (float)rand() / RAND_MAX;
+		vd.getPos(p)[0] = (float)rand() / (float)RAND_MAX;
+		vd.getPos(p)[1] = (float)rand() / (float)RAND_MAX;
+		vd.getPos(p)[2] = (float)rand() / (float)RAND_MAX;
 
 		vd.template getProp<0>(p) = vd.getPos(p)[0] + vd.getPos(p)[1] + vd.getPos(p)[2];
 
@@ -349,9 +349,9 @@ void vector_dist_gpu_test_impl()
 		int y = rand();
 		int z = rand();
 
-		vd.getPos(p)[0] = (float)x / RAND_MAX;
-		vd.getPos(p)[1] = (float)y / RAND_MAX;
-		vd.getPos(p)[2] = (float)z / RAND_MAX;
+		vd.getPos(p)[0] = (float)x / (float)RAND_MAX;
+		vd.getPos(p)[1] = (float)y / (float)RAND_MAX;
+		vd.getPos(p)[2] = (float)z / (float)RAND_MAX;
 
 		Point<3,float> xp = vd.getPos(p);
 
@@ -396,7 +396,7 @@ void vector_dist_gpu_test_impl()
 	// offload to device
 	vd.hostToDevicePos();
 
-	initialize_props<<<it3.wthr,it3.thr>>>(vd.toKernel());
+	CUDA_LAUNCH_DIM3(initialize_props,it3.wthr,it3.thr,vd.toKernel());
 
 	// now we check what we initialized
 
@@ -468,9 +468,9 @@ void vector_dist_gpu_make_sort_test_impl()
 		int y = rand();
 		int z = rand();
 
-		vd.getPos(p)[0] = (float)x / RAND_MAX;
-		vd.getPos(p)[1] = (float)y / RAND_MAX;
-		vd.getPos(p)[2] = (float)z / RAND_MAX;
+		vd.getPos(p)[0] = (float)x / (float)RAND_MAX;
+		vd.getPos(p)[1] = (float)y / (float)RAND_MAX;
+		vd.getPos(p)[2] = (float)z / (float)RAND_MAX;
 
 		++it;
 	}
@@ -482,7 +482,7 @@ void vector_dist_gpu_make_sort_test_impl()
 
 	auto it3 = vd.getDomainIteratorGPU();
 
-	initialize_props<<<it3.wthr,it3.thr>>>(vd.toKernel());
+	CUDA_LAUNCH_DIM3(initialize_props,it3.wthr,it3.thr,vd.toKernel());
 
 	// Here we check make sort does not mess-up particles we use a Cell-List to check that
 	// the two cell-list constructed are identical
@@ -603,9 +603,9 @@ void vdist_calc_gpu_test()
 	{
 		auto p = it.get();
 
-		vd.getPos(p)[0] = (St)rand() / RAND_MAX;
-		vd.getPos(p)[1] = (St)rand() / RAND_MAX;
-		vd.getPos(p)[2] = (St)rand() / RAND_MAX;
+		vd.getPos(p)[0] = (St)rand() / (float)RAND_MAX;
+		vd.getPos(p)[1] = (St)rand() / (float)RAND_MAX;
+		vd.getPos(p)[2] = (St)rand() / (float)RAND_MAX;
 
 		vd.template getProp<0>(p) = vd.getPos(p)[0] + vd.getPos(p)[1] + vd.getPos(p)[2];
 
@@ -708,8 +708,6 @@ void vdist_calc_gpu_test()
 	{
 		vd.map(RUN_ON_DEVICE);
 
-		CUDA_SAFE(cudaGetLastError());
-
 		vd.deviceToHostPos();
 		vd.template deviceToHostProp<0,1,2>();
 
@@ -854,7 +852,7 @@ void vdist_calc_gpu_test()
 		// move particles on gpu
 
 		auto ite = vd.getDomainIteratorGPU();
-		move_parts_gpu_test<3,decltype(vd.toKernel())><<<ite.wthr,ite.thr>>>(vd.toKernel());
+		CUDA_LAUNCH_DIM3((move_parts_gpu_test<3,decltype(vd.toKernel())>),ite.wthr,ite.thr,vd.toKernel());
 	}
 }
 
@@ -933,14 +931,14 @@ void vector_dist_dlb_on_cuda_impl(size_t k,double r_cut)
 {
 	std::random_device r;
 
-    std::seed_seq seed2{r() + create_vcluster().rank(),
-    					r() + create_vcluster().rank(),
-    					r() + create_vcluster().rank(),
-    					r() + create_vcluster().rank(),
-    					r() + create_vcluster().rank(),
-    					r() + create_vcluster().rank(),
-    					r() + create_vcluster().rank(),
-    					r() + create_vcluster().rank()};
+    std::seed_seq seed2{/*r() +*/ create_vcluster().rank(),
+    					/*r() +*/ create_vcluster().rank(),
+    					/*r() +*/ create_vcluster().rank(),
+    					/*r() +*/ create_vcluster().rank(),
+    					/*r() +*/ create_vcluster().rank(),
+    					/*r() +*/ create_vcluster().rank(),
+    					/*r() +*/ create_vcluster().rank(),
+    					/*r() +*/ create_vcluster().rank()};
     std::mt19937 e2(seed2);
 
 	typedef vector_dist_gpu<3,double,aggregate<double,double[3],double[3]>> vector_type;
@@ -1055,6 +1053,7 @@ void vector_dist_dlb_on_cuda_impl(size_t k,double r_cut)
 		vd.hostToDevicePos();
 		vd.map(RUN_ON_DEVICE);
 		vd.template ghost_get<0>(RUN_ON_DEVICE);
+
 		vd.deviceToHostPos();
 		vd.template deviceToHostProp<0,1,2>();
 
@@ -1102,7 +1101,11 @@ void vector_dist_dlb_on_cuda_impl(size_t k,double r_cut)
 			double load_f = load;
 			double load_fc = loads.get(i);
 
+#ifdef ENABLE_ASAN
+			BOOST_REQUIRE_CLOSE(load_f,load_fc,30.0);
+#else
 			BOOST_REQUIRE_CLOSE(load_f,load_fc,10.0);
+#endif
 		}
 	}
 }
@@ -1311,7 +1314,9 @@ BOOST_AUTO_TEST_CASE(vector_dist_dlb_on_cuda2)
 	if (create_vcluster().size() <= 3)
 	{return;};
 
+	#ifndef CUDA_ON_CPU
 	vector_dist_dlb_on_cuda_impl<CellList_gpu<3,double,CudaMemory,shift_only<3,double>,unsigned int,int,false>>(1000000,0.01);
+	#endif
 }
 
 BOOST_AUTO_TEST_CASE(vector_dist_dlb_on_cuda3)
@@ -1319,7 +1324,9 @@ BOOST_AUTO_TEST_CASE(vector_dist_dlb_on_cuda3)
 	if (create_vcluster().size() < 8)
 	{return;}
 
+	#ifndef CUDA_ON_CPU
 	vector_dist_dlb_on_cuda_impl<CellList_gpu<3,double,CudaMemory,shift_only<3,double>,unsigned int,int,false>>(15000000,0.005);
+	#endif
 }
 
 
@@ -1847,6 +1854,7 @@ BOOST_AUTO_TEST_CASE(vector_dist_overflow_se_class1)
 }
 
 
+
 BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu )
 {
 	Vcluster<> & v_cl = create_vcluster();
@@ -1878,7 +1886,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu )
 		// ghost
 		Ghost<3,float> ghost(r_g);
 
-		typedef  aggregate<float> part_prop;
+		typedef  aggregate<float,float,float> part_prop;
 
 		// Distributed vector
 		vector_dist_gpu<3,float, part_prop > vd(0,box,bc,ghost);
@@ -1899,16 +1907,18 @@ BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu )
 
 			vd.getLastPropWrite<0>() = 0.0;
 
+			vd.getLastPropWrite<2>() = 0.0;
+
 			++it;
 		}
 
 		vd.map();
 
 		vd.hostToDevicePos();
-		vd.template hostToDeviceProp<0>();
+		vd.template hostToDeviceProp<0,2>();
 		// sync the ghost
-		vd.ghost_get<0>(RUN_ON_DEVICE);
-		vd.template deviceToHostProp<0>();
+		vd.ghost_get<0,2>(RUN_ON_DEVICE);
+		vd.template deviceToHostProp<0,2>();
 		vd.deviceToHostPos();
 
 		{
@@ -1937,7 +1947,10 @@ BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu )
 					float dist = xp.distance(xq);
 
 					if (dist < r_cut)
+					{
 						vd.getPropWrite<0>(q) += a*(-dist*dist+r_cut*r_cut);
+						vd.getPropWrite<2>(q) += a*(-dist*dist+r_cut*r_cut) / 2;
+					}
 
 					++Np;
 				}
@@ -1946,25 +1959,27 @@ BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu )
 			}
 
 			vd.hostToDevicePos();
-			vd.template hostToDeviceProp<0>();
-			vd.template ghost_put<add_atomic_,0>(RUN_ON_DEVICE);
-			vd.template deviceToHostProp<0>();
+			vd.template hostToDeviceProp<0,2>();
+			vd.template ghost_put<add_atomic_,0,2>(RUN_ON_DEVICE);
+			vd.template deviceToHostProp<0,2>();
 			vd.deviceToHostPos();
 
 			bool ret = true;
 			auto it3 = vd.getDomainIterator();
 
 			float constant = vd.getProp<0>(it3.get());
+			float constanta = vd.getProp<2>(it3.get());
 			float eps = 0.001;
 
 			while (it3.isNext())
 			{
 				float constant2 = vd.getProp<0>(it3.get());
-				if (fabs(constant - constant2)/constant > eps)
+				float constant3 = vd.getProp<2>(it3.get());
+				if (fabs(constant - constant2)/constant > eps || fabs(constanta - constant3)/constanta > eps)
 				{
 					Point<3,float> p = vd.getPosRead(it3.get());
 
-					std::cout << p.toString() << "    " <<  constant2 << "/" << constant << "    " << v_cl.getProcessUnitID() << std::endl;
+					std::cout << p.toString() << "    " <<  constant2 << "/" << constant << "/" << constant3 << "    " << v_cl.getProcessUnitID() << std::endl;
 					ret = false;
 					break;
 				}
@@ -1980,6 +1995,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu )
 			auto key = itp.get();
 
 			vd.getPropWrite<0>(key) = 0.0;
+			vd.getPropWrite<2>(key) = 0.0;
 
 			++itp;
 		}
@@ -2010,7 +2026,10 @@ BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu )
 					float dist = xp.distance(xq);
 
 					if (dist < r_cut)
+					{
 						vd.getPropWrite<0>(q) += a*(-dist*dist+r_cut*r_cut);
+						vd.getPropWrite<2>(q) += a*(-dist*dist+r_cut*r_cut);
+					}
 
 					++Np;
 				}
@@ -2019,25 +2038,28 @@ BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu )
 			}
 
 			vd.hostToDevicePos();
-			vd.template hostToDeviceProp<0>();
+			vd.template hostToDeviceProp<0,2>();
 			vd.template ghost_put<add_atomic_,0>(RUN_ON_DEVICE);
-			vd.template deviceToHostProp<0>();
+			vd.template ghost_put<add_atomic_,2>(RUN_ON_DEVICE);
+			vd.template deviceToHostProp<0,2>();
 			vd.deviceToHostPos();
 
 			bool ret = true;
 			auto it3 = vd.getDomainIterator();
 
 			float constant = vd.getPropRead<0>(it3.get());
+			float constanta = vd.getPropRead<2>(it3.get());
 			float eps = 0.001;
 
 			while (it3.isNext())
 			{
 				float constant2 = vd.getPropRead<0>(it3.get());
-				if (fabs(constant - constant2)/constant > eps)
+				float constant3 = vd.getPropRead<0>(it3.get());
+				if (fabs(constant - constant2)/constant > eps || fabs(constanta - constant3)/constanta > eps)
 				{
 					Point<3,float> p = vd.getPosRead(it3.get());
 
-					std::cout << p.toString() << "    " <<  constant2 << "/" << constant << "    " << v_cl.getProcessUnitID() << std::endl;
+					std::cout << p.toString() << "    " <<  constant2 << "/" << constant << "/" << constant3 << "    " << v_cl.getProcessUnitID() << std::endl;
 					ret = false;
 					break;
 				}
diff --git a/src/Vector/tests/vector_dist_cell_list_tests.cpp b/src/Vector/tests/vector_dist_cell_list_tests.cpp
index 028f5beeb55973533a3b72d4d331abedeb1c1893..b6c2e7789fd55f0623e1c44139ebd505ee76845c 100644
--- a/src/Vector/tests/vector_dist_cell_list_tests.cpp
+++ b/src/Vector/tests/vector_dist_cell_list_tests.cpp
@@ -607,8 +607,6 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_cell_list )
 	vd.ghost_put<add_,1>();
 	vd.ghost_put<merge_,4>();
 
-	vd.write("DEBUG");
-
 	auto p_it3 = vd.getDomainIterator();
 
 	bool ret = true;
diff --git a/src/Vector/tests/vector_dist_dlb_test.hpp b/src/Vector/tests/vector_dist_dlb_test.hpp
index 97b4dcee0f2ff3a773d1d4b2bc114bdb3edcc0b7..df936a3d164744edb7645b46bd27b4cdb60b6b34 100644
--- a/src/Vector/tests/vector_dist_dlb_test.hpp
+++ b/src/Vector/tests/vector_dist_dlb_test.hpp
@@ -251,7 +251,6 @@ template<typename vector_type> void test_dlb_vector()
 	}
 
 	vd.map();
-
 	vd.template ghost_get<>();
 
 	// Get the neighborhood of each particles
@@ -411,7 +410,7 @@ template<typename vector_type> void test_dlb_multi_phase_v_vector()
 	mp_test_template(vd0,vd1,vd2,vd3);
 }
 
-BOOST_AUTO_TEST_CASE( vector_dist_dlb_test_part )
+BOOST_AUTO_TEST_CASE( vector_dist_dlb )
 {
 	test_dlb_vector<vector_dist<3,double,aggregate<double>>>();
 }
diff --git a/src/Vector/util/vector_dist_funcs.hpp b/src/Vector/util/vector_dist_funcs.hpp
index 35f001aeb675eb1a715f9bb39ba4501c96c0d496..6d86d44779dbd1c6d14900cd05f03d0f9f5da590 100644
--- a/src/Vector/util/vector_dist_funcs.hpp
+++ b/src/Vector/util/vector_dist_funcs.hpp
@@ -116,6 +116,7 @@ __device__ inline void process_map_device_particle(unsigned int i, unsigned int
 	proc_class::proc(i,id,v_prp,m_prp);
 }
 
+
 //! It process one particle
 template<typename Top, typename T2, typename T4, unsigned int ... prp>
 __device__ inline void process_ghost_device_particle_prp(unsigned int i, unsigned int offset, Top & g_opart, T2 & m_prp, T4 & v_prp)
diff --git a/src/Vector/vector_dist.hpp b/src/Vector/vector_dist.hpp
index df61e151a2a128d4e4d1b69629968b73dd859935..66ac010b09e1039abcd0c19974cb23f9c1c93a05 100644
--- a/src/Vector/vector_dist.hpp
+++ b/src/Vector/vector_dist.hpp
@@ -9,6 +9,7 @@
 #define VECTOR_HPP_
 
 #include "config.h"
+#include "util/cuda_launch.hpp"
 #include "HDF5_wr/HDF5_wr.hpp"
 #include "VCluster/VCluster.hpp"
 #include "Space/Shape/Point.hpp"
@@ -280,17 +281,17 @@ private:
 
 	//! Particle position vector, (It has 2 elements) the first has real particles assigned to a processor
 	//! the second element contain unassigned particles
-	openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> v_pos;
+	openfpm::vector<Point<dim, St>,Memory,layout_base> v_pos;
 
 	//! Particle properties vector, (It has 2 elements) the first has real particles assigned to a processor
 	//! the second element contain unassigned particles
-	openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> v_prp;
+	openfpm::vector<prop,Memory,layout_base> v_prp;
 
 	//! reordered v_pos buffer
-	openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> v_prp_out;
+	openfpm::vector<prop,Memory,layout_base> v_prp_out;
 
 	//! reordered v_prp buffer
-	openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> v_pos_out;
+	openfpm::vector<Point<dim, St>,Memory,layout_base> v_pos_out;
 
 	//! option used to create this vector
 	size_t opt = 0;
@@ -550,6 +551,35 @@ public:
 #endif
 	}
 
+	/*! \brief Constructor of a distributed vector
+	 *
+	 * \param np number of elements
+	 * \param box domain where the vector of elements live
+	 * \param bc boundary conditions
+	 * \param g Ghost margins
+	 * \param opt [Optional] additional options. BIND_DEC_TO_GHOST Bind the decomposition to be multiple of the
+	 *          ghost size. This is required if we want to use symmetric to eliminate
+	 *          ghost communications.
+	 * \param gdist [Optional] override the default distribution grid
+	 *
+	 */
+	vector_dist(size_t np, Box<dim, St> box, const size_t (&bc)[dim], const Ghost<dim, St> & g, const grid_sm<dim,void> & gdist)
+	:opt(0) SE_CLASS3_VDIST_CONSTRUCTOR
+	{
+		if (opt >> 32 != 0)
+		{this->setDecompositionGranularity(opt >> 32);}
+
+		check_parameters(box);
+
+		init_structures(np);
+
+		this->init_decomposition_gr_cell(box,bc,g,opt,gdist);
+
+
+#ifdef SE_CLASS3
+		se3.Initialize();
+#endif
+	}
 
 	/*! \brief Constructor of a distributed vector
 	 *
@@ -2074,7 +2104,7 @@ public:
 	 * \return an iterator
 	 *
 	 */
-	ite_gpu<1> getDomainIteratorGPU(size_t n_thr = 1024) const
+	ite_gpu<1> getDomainIteratorGPU(size_t n_thr = default_kernel_wg_threads_) const
 	{
 #ifdef SE_CLASS3
 		se3.getIterator();
@@ -2088,7 +2118,7 @@ public:
 	 * \return an iterator
 	 *
 	 */
-	ite_gpu<1> getDomainAndGhostIteratorGPU(size_t n_thr = 1024) const
+	ite_gpu<1> getDomainAndGhostIteratorGPU(size_t n_thr = default_kernel_wg_threads_) const
 	{
 #ifdef SE_CLASS3
 		se3.getIterator();
@@ -2103,7 +2133,7 @@ public:
 	 *
 	 */
 	template<unsigned int ... prp,typename id_1, typename id_2, bool is_sparse>
-	void merge_sort(CellList_gpu<dim,St,CudaMemory,shift_only<dim, St>,id_1,id_2,is_sparse> & cl, size_t n_thr = 1024)
+	void merge_sort(CellList_gpu<dim,St,CudaMemory,shift_only<dim, St>,id_1,id_2,is_sparse> & cl, size_t n_thr = default_kernel_wg_threads_)
 	{
 #if defined(__NVCC__)
 
@@ -2190,7 +2220,7 @@ public:
 	 * \parameter Cell-list from which has been constructed the sorted vector
 	 *
 	 */
-	template<unsigned int ... prp> void merge_sort_with_pos(CellList_gpu<dim,St,CudaMemory,shift_only<dim, St>> & cl, size_t n_thr = 1024)
+	template<unsigned int ... prp> void merge_sort_with_pos(CellList_gpu<dim,St,CudaMemory,shift_only<dim, St>> & cl, size_t n_thr = default_kernel_wg_threads_)
 	{
 #if defined(__NVCC__)
 
@@ -2212,7 +2242,7 @@ public:
          * \return an iterator
          *
          */
-        auto getDomainIteratorDevice(size_t n_thr = 1024) const -> decltype(this->getDomainIteratorGPU(n_thr))
+        auto getDomainIteratorDevice(size_t n_thr = default_kernel_wg_threads_) const -> decltype(this->getDomainIteratorGPU(n_thr))
         {
                 return this->getDomainIteratorGPU(n_thr);
         }
@@ -2225,7 +2255,7 @@ public:
          * \return an iterator
          *
          */
-        auto getDomainIteratorDevice(size_t n_thr = 1024) const -> decltype(this->getDomainIterator())
+        auto getDomainIteratorDevice(size_t n_thr = default_kernel_wg_threads_) const -> decltype(this->getDomainIterator())
         {
                 return this->getDomainIterator();
         }
@@ -2657,8 +2687,8 @@ public:
 		if ((opt & 0x0FFF0000) == CSV_WRITER)
 		{
 			// CSVWriter test
-			CSVWriter<openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base>,
-			          openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> > csv_writer;
+			CSVWriter<openfpm::vector<Point<dim, St>,Memory,layout_base>,
+			          openfpm::vector<prop,Memory,layout_base> > csv_writer;
 
 			std::string output = std::to_string(out + "_" + std::to_string(v_cl.getProcessUnitID()) + std::to_string(".csv"));
 
@@ -2673,8 +2703,8 @@ public:
 				ft = file_type::BINARY;
 
 			// VTKWriter for a set of points
-			VTKWriter<boost::mpl::pair<openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base>,
-									   openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base>>,
+			VTKWriter<boost::mpl::pair<openfpm::vector<Point<dim, St>,Memory,layout_base>,
+									   openfpm::vector<prop,Memory,layout_base>>,
 			                           VECTOR_POINTS> vtk_writer;
 			vtk_writer.add(v_pos,v_prp,g_m);
 
@@ -2753,8 +2783,8 @@ public:
 		if ((opt & 0x0FFF0000) == CSV_WRITER)
 		{
 			// CSVWriter test
-			CSVWriter<openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base>,
-					  openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> > csv_writer;
+			CSVWriter<openfpm::vector<Point<dim, St>,Memory,layout_base>,
+					  openfpm::vector<prop,Memory,layout_base> > csv_writer;
 
 			std::string output = std::to_string(out + "_" + std::to_string(v_cl.getProcessUnitID()) + "_" + std::to_string(iteration) + std::to_string(".csv"));
 
@@ -2769,8 +2799,8 @@ public:
 				ft = file_type::BINARY;
 
 			// VTKWriter for a set of points
-			VTKWriter<boost::mpl::pair<openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base>,
-									   openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base>>, VECTOR_POINTS> vtk_writer;
+			VTKWriter<boost::mpl::pair<openfpm::vector<Point<dim, St>,Memory,layout_base>,
+									   openfpm::vector<prop,Memory,layout_base>>, VECTOR_POINTS> vtk_writer;
 			vtk_writer.add(v_pos,v_prp,g_m);
 
 			std::string output = std::to_string(out + "_" + std::to_string(v_cl.getProcessUnitID()) + "_" + std::to_string(iteration) + std::to_string(".vtp"));
@@ -2842,7 +2872,7 @@ public:
 	 * \return the particle position vector
 	 *
 	 */
-	const openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & getPosVector() const
+	const openfpm::vector<Point<dim, St>,Memory,layout_base> & getPosVector() const
 	{
 		return v_pos;
 	}
@@ -2852,7 +2882,7 @@ public:
 	 * \return the particle position vector
 	 *
 	 */
-	openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & getPosVector()
+	openfpm::vector<Point<dim, St>,Memory,layout_base> & getPosVector()
 	{
 		return v_pos;
 	}
@@ -2862,7 +2892,7 @@ public:
 	 * \return the particle property vector
 	 *
 	 */
-	const openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & getPropVector() const
+	const openfpm::vector<prop,Memory,layout_base> & getPropVector() const
 	{
 		return v_prp;
 	}
@@ -2872,7 +2902,7 @@ public:
 	 * \return the particle property vector
 	 *
 	 */
-	openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & getPropVector()
+	openfpm::vector<prop,Memory,layout_base> & getPropVector()
 	{
 		return v_prp;
 	}
@@ -2882,7 +2912,7 @@ public:
 	 * \return the particle position vector
 	 *
 	 */
-	const openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & getPosVectorSort() const
+	const openfpm::vector<Point<dim, St>,Memory,layout_base> & getPosVectorSort() const
 	{
 		return v_pos_out;
 	}
@@ -2892,7 +2922,7 @@ public:
 	 * \return the particle position vector
 	 *
 	 */
-	openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & getPosVectorSort()
+	openfpm::vector<Point<dim, St>,Memory,layout_base> & getPosVectorSort()
 	{
 		return v_pos_out;
 	}
@@ -2902,7 +2932,7 @@ public:
 	 * \return the particle property vector
 	 *
 	 */
-	const openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & getPropVectorSort() const
+	const openfpm::vector<prop,Memory,layout_base> & getPropVectorSort() const
 	{
 		return v_prp_out;
 	}
@@ -2912,7 +2942,7 @@ public:
 	 * \return the particle property vector
 	 *
 	 */
-	openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & getPropVectorSort()
+	openfpm::vector<prop,Memory,layout_base> & getPropVectorSort()
 	{
 		return v_prp_out;
 	}
diff --git a/src/Vector/vector_dist_comm.hpp b/src/Vector/vector_dist_comm.hpp
index 6eeb9179f5f77b06f80dc7d97ec9dbc53bac2a2c..f31f4538a454f85851b1a2c221a033e8190b6dfe 100644
--- a/src/Vector/vector_dist_comm.hpp
+++ b/src/Vector/vector_dist_comm.hpp
@@ -11,9 +11,7 @@
 #define TEST1
 
 #if defined(CUDA_GPU) && defined(__NVCC__)
-#include "util/cuda/moderngpu/kernel_mergesort.hxx"
 #include "Vector/cuda/vector_dist_cuda_funcs.cuh"
-#include "util/cuda/moderngpu/kernel_scan.hxx"
 #include "util/cuda/kernels.cuh"
 #endif
 
@@ -351,7 +349,7 @@ class vector_dist_comm
 	size_t v_sub_unit_factor = 64;
 
 	//! definition of the send vector for position
-	typedef openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base,openfpm::grow_policy_identity> send_pos_vector;
+	typedef openfpm::vector<Point<dim, St>,Memory,layout_base,openfpm::grow_policy_identity> send_pos_vector;
 
 	//! VCluster
 	Vcluster<Memory> & v_cl;
@@ -369,7 +367,6 @@ class vector_dist_comm
 	//! third id is the processor id
 	openfpm::vector<aggregate<int,int,int>,
 					Memory,
-					typename layout_base<aggregate<int,int,int>>::type,
 					layout_base > m_opart;
 
 	//! Per processor ordered particles id for ghost_get (see prc_g_opart)
@@ -380,14 +377,13 @@ class vector_dist_comm
 	//! Same as g_opart but on device, the vector of vector is flatten into a single vector
     openfpm::vector<aggregate<unsigned int,unsigned long int>,
                     CudaMemory,
-                    typename memory_traits_inte<aggregate<unsigned int,unsigned long int>>::type,
                     memory_traits_inte> g_opart_device;
 
 	//! Helper buffer for computation (on GPU) of local particles (position)
-	openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> v_pos_tmp;
+	openfpm::vector<Point<dim, St>,Memory,layout_base> v_pos_tmp;
 
 	//! Helper buffer for computation (on GPU) of local particles (properties)
-	openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> v_prp_tmp;
+	openfpm::vector<prop,Memory,layout_base> v_prp_tmp;
 
 	//! Per processor number of particle g_opart_sz.get(i) = g_opart.get(i).size()
 	openfpm::vector<size_t> g_opart_sz;
@@ -426,17 +422,15 @@ class vector_dist_comm
 	//! temporary buffer to processors ids
     openfpm::vector<aggregate<unsigned int>,
                             Memory,
-                            typename layout_base<aggregate<unsigned int>>::type,
                             layout_base> proc_id_out;
 
     //! temporary buffer for the scan result
 	openfpm::vector<aggregate<unsigned int>,
                              Memory,
-                             typename layout_base<aggregate<unsigned int>>::type,
                              layout_base> starts;
 
 	//! Processor communication size
-	openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,typename layout_base<aggregate<unsigned int, unsigned int>>::type,layout_base> prc_offset;
+	openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,layout_base> prc_offset;
 
 
 	//! Temporary CudaMemory to do stuff
@@ -512,7 +506,7 @@ class vector_dist_comm
 	 * \param prc_r processor ids
 	 *
 	 */
-	inline void calc_send_buffers(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & prc_sz,
+	inline void calc_send_buffers(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_sz,
 								  openfpm::vector<size_t> & prc_sz_r,
 								  openfpm::vector<size_t> & prc_r,
 								  size_t opt)
@@ -575,17 +569,17 @@ class vector_dist_comm
 	openfpm::vector_std<openfpm::vector_std<Box<dim, St>>> box_f;
 
 	//! The boxes touching the border of the domain + shift vector linearized from where they come from
-	openfpm::vector<Box<dim, St>,Memory,typename layout_base<Box<dim,St>>::type,layout_base> box_f_dev;
-	openfpm::vector<aggregate<unsigned int>,Memory,typename layout_base<aggregate<unsigned int>>::type,layout_base> box_f_sv;
+	openfpm::vector<Box<dim, St>,Memory,layout_base> box_f_dev;
+	openfpm::vector<aggregate<unsigned int>,Memory,layout_base> box_f_sv;
 
 	//! Store the sector for each group (previous vector)
 	openfpm::vector_std<comb<dim>> box_cmb;
 
 	//! Id of the local particle to replicate for ghost_get
-	openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> o_part_loc;
+	openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> o_part_loc;
 
 	//! Processor communication size
-	openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,typename layout_base<aggregate<unsigned int, unsigned int>>::type,layout_base> prc_sz;
+	openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,layout_base> prc_sz;
 
 	/*! \brief For every internal ghost box we create a structure that order such internal local ghost box in
 	 *         shift vectors
@@ -680,12 +674,12 @@ class vector_dist_comm
 	 * \param opt options
 	 *
 	 */
-	void local_ghost_from_opart(openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
-			                    openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
+	void local_ghost_from_opart(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
+			                    openfpm::vector<prop,Memory,layout_base> & v_prp,
 			                    size_t opt)
 	{
 		// get the shift vectors
-		const openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & shifts = dec.getShiftVectors();
+		const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts = dec.getShiftVectors();
 
 		if (!(opt & NO_POSITION))
 		{
@@ -737,14 +731,14 @@ class vector_dist_comm
 	 * \param g_m ghost marker
 	 *
 	 */
-	void local_ghost_from_dec(openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
-			                  openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
+	void local_ghost_from_dec(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
+			                  openfpm::vector<prop,Memory,layout_base> & v_prp,
 			                  size_t g_m,size_t opt)
 	{
 		o_part_loc.clear();
 
 		// get the shift vectors
-		const openfpm::vector<Point<dim,St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & shifts = dec.getShiftVectors();
+		const openfpm::vector<Point<dim,St>,Memory,layout_base> & shifts = dec.getShiftVectors();
 
 		if (opt & RUN_ON_DEVICE)
 		{
@@ -848,8 +842,8 @@ class vector_dist_comm
 	 * \param opt options
 	 *
 	 */
-	void add_loc_particles_bc(openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
-			                  openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp ,
+	void add_loc_particles_bc(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
+			                  openfpm::vector<prop,Memory,layout_base> & v_prp ,
 			                  size_t & g_m,
 			                  size_t opt)
 	{
@@ -876,14 +870,14 @@ class vector_dist_comm
 	 * \param g_pos_send Send buffer to fill
 	 *
 	 */
-	void fill_send_ghost_pos_buf(openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
+	void fill_send_ghost_pos_buf(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
 								 openfpm::vector<size_t> & prc_sz,
 			                     openfpm::vector<send_pos_vector> & g_pos_send,
 			                     size_t opt,
 			                     bool async)
 	{
 		// get the shift vectors
-		const openfpm::vector<Point<dim,St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & shifts = dec.getShiftVectors();
+		const openfpm::vector<Point<dim,St>,Memory,layout_base> & shifts = dec.getShiftVectors();
 
 		// create a number of send buffers equal to the near processors
 		g_pos_send.resize(prc_sz.size());
@@ -965,7 +959,7 @@ class vector_dist_comm
 	 *
 	 */
 	template<typename send_vector, typename prp_object, int ... prp>
-	void fill_send_ghost_put_prp_buf(openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
+	void fill_send_ghost_put_prp_buf(openfpm::vector<prop,Memory,layout_base> & v_prp,
 									 openfpm::vector<send_vector> & g_send_prp,
 									 size_t & g_m,
 									 size_t opt)
@@ -1040,9 +1034,9 @@ class vector_dist_comm
 				for (size_t j = accum; j < accum + n_part_recv; j++)
 				{
 					// source object type
-					typedef encapc<1, prop, typename openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base>::layout_type> encap_src;
+					typedef encapc<1, prop, typename openfpm::vector<prop,Memory,layout_base>::layout_type> encap_src;
 					// destination object type
-					typedef encapc<1, prp_object, typename openfpm::vector<prp_object,Memory,typename layout_base<prp_object>::type,layout_base>::layout_type> encap_dst;
+					typedef encapc<1, prp_object, typename openfpm::vector<prp_object,Memory,layout_base>::layout_type> encap_dst;
 
 					// Copy only the selected properties
 					object_si_d<encap_src, encap_dst, OBJ_ENCAP, prp...>(v_prp.get(j), g_send_prp.get(i).get(j2));
@@ -1154,7 +1148,7 @@ class vector_dist_comm
 	 *
 	 */
 	template<typename send_vector, typename prp_object, int ... prp>
-	void fill_send_ghost_prp_buf(openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
+	void fill_send_ghost_prp_buf(openfpm::vector<prop,Memory,layout_base> & v_prp,
 								 openfpm::vector<size_t> & prc_sz,
 			                     openfpm::vector<send_vector> & g_send_prp,
 			                     size_t opt)
@@ -1245,13 +1239,13 @@ class vector_dist_comm
 	 *        This parameter is used only in case of RUN_ON_DEVICE option
 	 *
 	 */
-	void fill_send_map_buf(openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
-			               openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
+	void fill_send_map_buf(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
+			               openfpm::vector<prop,Memory,layout_base> & v_prp,
 			               openfpm::vector<size_t> & prc_sz_r,
 			               openfpm::vector<size_t> & prc_r,
-			               openfpm::vector<openfpm::vector<Point<dim,St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base,openfpm::grow_policy_identity>> & m_pos,
-			               openfpm::vector<openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base,openfpm::grow_policy_identity>> & m_prp,
-			               openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,typename layout_base<aggregate<unsigned int, unsigned int>>::type,layout_base> & prc_sz,
+			               openfpm::vector<openfpm::vector<Point<dim,St>,Memory,layout_base,openfpm::grow_policy_identity>> & m_pos,
+			               openfpm::vector<openfpm::vector<prop,Memory,layout_base,openfpm::grow_policy_identity>> & m_prp,
+			               openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,layout_base> & prc_sz,
 			               size_t opt)
 	{
 		m_prp.resize(prc_sz_r.size());
@@ -1397,7 +1391,7 @@ class vector_dist_comm
 	 */
 	template<typename prp_object,int ... prp>
 	void fill_send_map_buf_list(openfpm::vector<Point<dim, St>> & v_pos,
-			                    openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
+			                    openfpm::vector<prop,Memory,layout_base> & v_prp,
 								openfpm::vector<size_t> & prc_sz_r,
 								openfpm::vector<openfpm::vector<Point<dim,St>>> & m_pos,
 								openfpm::vector<openfpm::vector<prp_object>> & m_prp)
@@ -1438,12 +1432,11 @@ class vector_dist_comm
 	 * \param opt options
 	 *
 	 */
-	template<typename obp> void labelParticleProcessor(openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
+	template<typename obp> void labelParticleProcessor(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
 			                                           openfpm::vector<aggregate<int,int,int>,
 			                                                           Memory,
-			                                                           typename layout_base<aggregate<int,int,int>>::type,
 			                                                           layout_base> & lbl_p,
-			                                           openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & prc_sz,
+			                                           openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_sz,
 			                                           size_t opt)
 	{
 		if (opt == RUN_ON_DEVICE)
@@ -1601,11 +1594,11 @@ class vector_dist_comm
 	 * \param opt ghost_get options
 	 *
 	 */
-	void labelParticlesGhost(openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
-			                 openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
+	void labelParticlesGhost(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
+			                 openfpm::vector<prop,Memory,layout_base> & v_prp,
 			                 openfpm::vector<size_t> & prc,
 			                 openfpm::vector<size_t> & prc_sz,
-			                 openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & prc_offset,
+			                 openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_offset,
 			                 size_t & g_m,
 			                 size_t opt)
 	{
@@ -1814,6 +1807,31 @@ public:
 		dec.decompose();
 	}
 
+	/*! \brief Initialize the decomposition
+	 *
+	 * \param box domain
+	 * \param bc boundary conditions
+	 * \param g ghost extension
+	 * \param opt additional options
+	 *
+	 */
+	void init_decomposition_gr_cell(Box<dim,St> & box,
+							const size_t (& bc)[dim],
+							const Ghost<dim,St> & g,
+							size_t opt,
+							const grid_sm<dim,void> & gdist)
+	{
+		size_t div[dim];
+
+		for (size_t i = 0 ; i < dim ; i++)
+		{div[i] = gdist.size(i);}
+
+		// Create the sub-domains
+		dec.setParameters(div, box, bc, g);
+
+		dec.decompose();
+	}
+
 	/*! \brief It synchronize the properties and position of the ghost particles
 	 *
 	 * \tparam prp list of properties to get synchronize
@@ -1824,8 +1842,8 @@ public:
 	 * \param g_m marker between real and ghost particles
 	 *
 	 */
-	template<unsigned int impl, int ... prp> inline void ghost_get_(openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
-												 openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
+	template<unsigned int impl, int ... prp> inline void ghost_get_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
+												 openfpm::vector<prop,Memory,layout_base> & v_prp,
 												 size_t & g_m,
 												 size_t opt = WITH_POSITION)
 	{
@@ -1837,7 +1855,7 @@ public:
 		typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object;
 
 		// send vector for each processor
-		typedef openfpm::vector<prp_object,Memory,typename layout_base<prp_object>::type,layout_base,openfpm::grow_policy_identity> send_vector;
+		typedef openfpm::vector<prp_object,Memory,layout_base,openfpm::grow_policy_identity> send_vector;
 
 		if (!(opt & NO_POSITION))
 		{v_pos.resize(g_m);}
@@ -1911,8 +1929,8 @@ public:
 	 * \param g_m marker between real and ghost particles
 	 *
 	 */
-	template<int ... prp> inline void ghost_wait_(openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
-			 	 	 	 	 	 	 	 	 	 	 	 	 	 	 openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
+	template<int ... prp> inline void ghost_wait_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
+			 	 	 	 	 	 	 	 	 	 	 	 	 	 	 openfpm::vector<prop,Memory,layout_base> & v_prp,
 			 	 	 	 	 	 	 	 	 	 	 	 	 	 	 size_t & g_m,
 			 	 	 	 	 	 	 	 	 	 	 	 	 	 	 size_t opt = WITH_POSITION)
 	{
@@ -1920,7 +1938,7 @@ public:
 		typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object;
 
 		// send vector for each processor
-		typedef openfpm::vector<prp_object,Memory,typename layout_base<prp_object>::type,layout_base,openfpm::grow_policy_identity> send_vector;
+		typedef openfpm::vector<prp_object,Memory,layout_base,openfpm::grow_policy_identity> send_vector;
 
 		// Send and receive ghost particle information
 		openfpm::vector<send_vector> g_send_prp;
@@ -1962,7 +1980,7 @@ public:
 		typedef KillParticle obp;
 
 		// Processor communication size
-		openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> prc_sz(v_cl.getProcessingUnits());
+		openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> prc_sz(v_cl.getProcessingUnits());
 
 		// map completely reset the ghost part
 		v_pos.resize(g_m);
@@ -2028,8 +2046,8 @@ public:
 	 *
 	 */
 	template<typename obp = KillParticle>
-	void map_(openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
-			  openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp, size_t & g_m,
+	void map_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
+			  openfpm::vector<prop,Memory,layout_base> & v_prp, size_t & g_m,
 			  size_t opt)
 	{
 #ifdef PROFILE_SCOREP
@@ -2053,9 +2071,9 @@ public:
 		calc_send_buffers(prc_sz,prc_sz_r,prc_r,opt);
 
 		//! position vector
-		openfpm::vector<openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base,openfpm::grow_policy_identity>> m_pos;
+		openfpm::vector<openfpm::vector<Point<dim, St>,Memory,layout_base,openfpm::grow_policy_identity>> m_pos;
 		//! properties vector
-		openfpm::vector<openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base,openfpm::grow_policy_identity>> m_prp;
+		openfpm::vector<openfpm::vector<prop,Memory,layout_base,openfpm::grow_policy_identity>> m_prp;
 
 		fill_send_map_buf(v_pos,v_prp, prc_sz_r,prc_r, m_pos, m_prp,prc_sz,opt);
 
@@ -2071,13 +2089,13 @@ public:
 #endif
 		}
 
-		v_cl.template SSendRecv<openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base,openfpm::grow_policy_identity>,
-					   openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base>,
+		v_cl.template SSendRecv<openfpm::vector<Point<dim, St>,Memory,layout_base,openfpm::grow_policy_identity>,
+					   openfpm::vector<Point<dim, St>,Memory,layout_base>,
 					   layout_base>
 					   (m_pos,v_pos,prc_r,prc_recv_map,recv_sz_map,opt_);
 
-		v_cl.template SSendRecv<openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base,openfpm::grow_policy_identity>,
-					   openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base>,
+		v_cl.template SSendRecv<openfpm::vector<prop,Memory,layout_base,openfpm::grow_policy_identity>,
+					   openfpm::vector<prop,Memory,layout_base>,
 					   layout_base>
 					   (m_prp,v_prp,prc_r,prc_recv_map,recv_sz_map,opt_);
 
@@ -2146,8 +2164,8 @@ public:
 	 *
 	 */
 	template<template<typename,typename> class op, int ... prp>
-	void ghost_put_(openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim, St>>::type,layout_base> & v_pos,
-					openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
+	void ghost_put_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos,
+					openfpm::vector<prop,Memory,layout_base> & v_prp,
 					size_t & g_m,
 					size_t opt)
 	{
@@ -2155,7 +2173,7 @@ public:
 		typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object;
 
 		// send vector for each processor
-		typedef openfpm::vector<prp_object,Memory,typename layout_base<prp_object>::type,layout_base> send_vector;
+		typedef openfpm::vector<prp_object,Memory,layout_base> send_vector;
 
 		openfpm::vector<send_vector> g_send_prp;
 		fill_send_ghost_put_prp_buf<send_vector, prp_object, prp...>(v_prp,g_send_prp,g_m,opt);
diff --git a/src/Vector/vector_dist_kernel.hpp b/src/Vector/vector_dist_kernel.hpp
index 435515570da202555cb4e034bf2a1e2d16807b8e..9e49687b9a7ce2a5d426ca33de0bc5aabe61b244 100644
--- a/src/Vector/vector_dist_kernel.hpp
+++ b/src/Vector/vector_dist_kernel.hpp
@@ -257,7 +257,7 @@ public:
 	 * \return an iterator
 	 *
 	 */
-	__host__ ite_gpu<1> getDomainIteratorGPU(size_t n_thr = 1024) const
+	__host__ ite_gpu<1> getDomainIteratorGPU(size_t n_thr = default_kernel_wg_threads_) const
 	{
 		return v_pos.getGPUIteratorTo(g_m,n_thr);
 	}
diff --git a/src/cmake/openfpmConfig-configure.cmake b/src/cmake/openfpmConfig-configure.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..c7b0f8a11888857afc304257b4fa54da81c57c88
--- /dev/null
+++ b/src/cmake/openfpmConfig-configure.cmake
@@ -0,0 +1,4 @@
+get_property(OPENFPM_INCLUDES TARGET openfpm::binary_config PROPERTY INTERFACE_INCLUDE_DIRECTORIES)
+get_property(OPENFPM_DEFINITION TARGET openfpm::binary_config PROPERTY INTERFACE_COMPILE_DEFINITIONS)
+get_property(OPENFPM_LIBS TARGET openfpm::binary_config PROPERTY INTERFACE_LINK_LIBRARIES)
+get_property(OPENFPM_COMPILE_OPTIONS TARGET openfpm::binary_config PROPERTY INTERFACE_COMPILE_OPTIONS)
diff --git a/src/config/config_cmake.h.in b/src/config/config_cmake.h.in
index c38415e8e75b5cbc443ac8ed6eafc387f97559e2..ef3eb90ec3ed96273f9a864c41a8a404171b104e 100644
--- a/src/config/config_cmake.h.in
+++ b/src/config/config_cmake.h.in
@@ -4,6 +4,13 @@ ${DEFINE_COVERTY_SCAN}
 /* GPU support */
 ${DEFINE_CUDA_GPU}
 
+/* HIP GPU support */
+${DEFINE_HIP_GPU}
+
+/* HIP Cudify GPU support */
+${DEFINE_CUDIFY_USE_HIP}
+
+
 /* Debug */
 ${DEFINE_DEBUG} /**/
 
@@ -32,6 +39,12 @@ ${DEFINE_HAVE_BOOST_PROGRAM_OPTIONS} /**/
 /* define if the Boost::Unit_Test_Framework library is available */
 ${DEFINE_HAVE_BOOST_UNIT_TEST_FRAMEWORK} /**/
 
+/* define if the Boost::Context library is available */
+${DEFINE_HAVE_BOOST_CONTEXT} /**/
+
+/* define if the Boost::Fiber library is available */
+${DEFINE_HAVE_BOOST_FIBER} /**/
+
 /* Have clock time */
 ${DEFINE_HAVE_CLOCK_GETTIME} /**/
 
@@ -116,6 +129,19 @@ ${DEFINE_ACTION_ON_ERROR}
 /* NVCC compiling */
 ${DEFINE_NVCC} /**/
 
+/* Define if we have Alpaka */
+${DEFINE_HAVE_ALPAKA}
+
+/* Additional alpaka definitions  */
+${ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE_DEF}
+${ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE_DEF}
+${ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE_DEF}
+${ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE_DEF}
+${ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE_DEF}
+${ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE_DEF}
+${ALPAKA_ACC_CPU_BT_OMP4_ENABLE_DEF}
+
+
 /* Name of package */
 #define PACKAGE "openfpm_pdata"
 
@@ -155,6 +181,12 @@ ${DEFINE_STDC_HEADERS}
 /* If an error occur stop the program */
 ${DEFINE_STOP_ON_ERROR}
 
+/* Garbage injector*/
+${DEFINE_GARBAGE_INJECTOR}
+
+/* VCluster Garbage injector*/
+${DEFINE_VCLUSTER_GARBAGE_INJECTOR}
+
 /* Test coverage mode */
 ${DEFINE_TEST_COVERAGE_MODE}
 
diff --git a/src/isolation.cu b/src/isolation.cu
new file mode 100644
index 0000000000000000000000000000000000000000..29b113100fc359acacfe4832602f3ce7a597f1ca
--- /dev/null
+++ b/src/isolation.cu
@@ -0,0 +1,270 @@
+#include <iostream>
+#include <thread>
+
+size_t debug_tot_call = 0;
+
+#define PRINT_STACKTRACE
+#define CHECKFOR_POSNAN
+#define CHECKFOR_POSINF
+#define CHECKFOR_PROPNAN
+#define CHECKFOR_PROPINF
+
+#define NO_WARNING
+#include "Graph/CartesianGraphFactory.hpp"
+
+void timeout_cycle()
+{
+	// 6 seconds
+	std::this_thread::sleep_for (std::chrono::seconds(900));
+
+	std::cout << "Time Out" << std::endl;
+	std::exit(1);
+}
+
+
+#define BOOST_DISABLE_ASSERTS
+
+
+#include "config.h"
+#undef VERSION
+
+#define BOOST_TEST_DYN_LINK
+#include <boost/test/unit_test.hpp>
+#include "VCluster/VCluster.hpp"
+#include <Vector/vector_dist.hpp>
+#include "Vector/tests/vector_dist_util_unit_tests.hpp"
+
+// initialization function:
+bool init_unit_test()
+{
+//  std::thread to (timeout_cycle);
+//  to.detach();
+  return true;
+}
+
+// entry point
+int main(int argc, char* argv[])
+{
+	openfpm_init(&argc,&argv);
+
+  return boost::unit_test::unit_test_main( &init_unit_test, argc, argv );
+}
+
+
+
+BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu )
+{
+	
+
+	Vcluster<> & v_cl = create_vcluster();
+
+	long int k = 25*25*25*create_vcluster().getProcessingUnits();
+	k = std::pow(k, 1/3.);
+
+	if (v_cl.getProcessingUnits() > 48)
+		return;
+
+	BOOST_TEST_CHECKPOINT( "Testing 3D periodic ghost put k=" << k );
+
+	long int big_step = k / 30;
+	big_step = (big_step == 0)?1:big_step;
+	long int small_step = 21;
+
+	// 3D test
+	for ( ; k >= 2 ; k-= (k > 2*big_step)?big_step:small_step )
+	{
+		float r_cut = 1.3 / k;
+		float r_g = 1.5 / k;
+
+		Box<3,float> box({0.0,0.0,0.0},{1.0,1.0,1.0});
+
+		// Boundary conditions
+		size_t bc[3]={PERIODIC,PERIODIC,PERIODIC};
+
+		// ghost
+		Ghost<3,float> ghost(r_g);
+
+		typedef  aggregate<float,float,float> part_prop;
+
+		// Distributed vector
+		vector_dist_gpu<3,float, part_prop > vd(0,box,bc,ghost);
+
+		auto it = vd.getGridIterator({(size_t)k,(size_t)k,(size_t)k});
+
+		while (it.isNext())
+		{
+			auto key = it.get();
+
+			vd.add();
+
+			vd.getLastPosWrite()[0] = key.get(0)*it.getSpacing(0);
+			vd.getLastPosWrite()[1] = key.get(1)*it.getSpacing(1);
+			vd.getLastPosWrite()[2] = key.get(2)*it.getSpacing(2);
+
+			// Fill some properties randomly
+
+			vd.getLastPropWrite<0>() = 0.0;
+
+			vd.getLastPropWrite<2>() = 0.0;
+
+			++it;
+		}
+
+		vd.map();
+
+		vd.hostToDevicePos();
+		vd.template hostToDeviceProp<0,2>();
+		// sync the ghost
+		vd.ghost_get<0,2>(RUN_ON_DEVICE);
+		vd.template deviceToHostProp<0,2>();
+		vd.deviceToHostPos();
+
+		{
+			auto NN = vd.getCellList(r_cut);
+			float a = 1.0f*k*k;
+
+			// run trough all the particles + ghost
+
+			auto it2 = vd.getDomainIterator();
+
+			while (it2.isNext())
+			{
+				// particle p
+				auto p = it2.get();
+				Point<3,float> xp = vd.getPos(p);
+
+				// Get an iterator over the neighborhood particles of p
+				auto Np = NN.getNNIterator<NO_CHECK>(NN.getCell(xp));
+
+				// For each neighborhood particle ...
+				while (Np.isNext())
+				{
+					auto q = Np.get();
+					Point<3,float> xq = vd.getPosRead(q);
+
+					float dist = xp.distance(xq);
+
+					if (dist < r_cut)
+					{
+						vd.getPropWrite<0>(q) += a*(-dist*dist+r_cut*r_cut);
+						vd.getPropWrite<2>(q) += a*(-dist*dist+r_cut*r_cut);
+					}
+
+					++Np;
+				}
+
+				++it2;
+			}
+
+			vd.hostToDevicePos();
+			vd.template hostToDeviceProp<0,2>();
+			vd.template ghost_put<add_atomic_,0,2>(RUN_ON_DEVICE);
+			vd.template deviceToHostProp<0,2>();
+			vd.deviceToHostPos();
+
+			bool ret = true;
+			auto it3 = vd.getDomainIterator();
+
+			float constant = vd.getProp<0>(it3.get());
+			float eps = 0.001;
+
+			while (it3.isNext())
+			{
+				float constant2 = vd.getProp<0>(it3.get());
+				float constant3 = vd.getProp<2>(it3.get());
+				if (fabs(constant - constant2)/constant > eps || fabs(constant - constant3)/constant > eps)
+				{
+					Point<3,float> p = vd.getPosRead(it3.get());
+
+					std::cout << p.toString() << "    " <<  constant2 << "/" << constant << "/" << constant3 << "    " << v_cl.getProcessUnitID() << std::endl;
+					ret = false;
+					break;
+				}
+
+				++it3;
+			}
+			BOOST_REQUIRE_EQUAL(ret,true);
+		}
+
+		auto itp = vd.getDomainAndGhostIterator();
+		while (itp.isNext())
+		{
+			auto key = itp.get();
+
+			vd.getPropWrite<0>(key) = 0.0;
+			vd.getPropWrite<2>(key) = 0.0;
+
+			++itp;
+		}
+
+		{
+			auto NN = vd.getCellList(r_cut);
+			float a = 1.0f*k*k;
+
+			// run trough all the particles + ghost
+
+			auto it2 = vd.getDomainIterator();
+
+			while (it2.isNext())
+			{
+				// particle p
+				auto p = it2.get();
+				Point<3,float> xp = vd.getPosRead(p);
+
+				// Get an iterator over the neighborhood particles of p
+				auto Np = NN.getNNIterator<NO_CHECK>(NN.getCell(xp));
+
+				// For each neighborhood particle ...
+				while (Np.isNext())
+				{
+					auto q = Np.get();
+					Point<3,float> xq = vd.getPosRead(q);
+
+					float dist = xp.distance(xq);
+
+					if (dist < r_cut)
+					{
+						vd.getPropWrite<0>(q) += a*(-dist*dist+r_cut*r_cut);
+						vd.getPropWrite<2>(q) += a*(-dist*dist+r_cut*r_cut);
+					}
+
+					++Np;
+				}
+
+				++it2;
+			}
+
+			vd.hostToDevicePos();
+			vd.template hostToDeviceProp<0,2>();
+			vd.template ghost_put<add_atomic_,0>(RUN_ON_DEVICE);
+			vd.template ghost_put<add_atomic_,2>(RUN_ON_DEVICE);
+			vd.template deviceToHostProp<0,2>();
+			vd.deviceToHostPos();
+
+			bool ret = true;
+			auto it3 = vd.getDomainIterator();
+
+			float constant = vd.getPropRead<0>(it3.get());
+			float eps = 0.001;
+
+			while (it3.isNext())
+			{
+				float constant2 = vd.getPropRead<0>(it3.get());
+				float constant3 = vd.getPropRead<0>(it3.get());
+				if (fabs(constant - constant2)/constant > eps || fabs(constant - constant3)/constant > eps)
+				{
+					Point<3,float> p = vd.getPosRead(it3.get());
+
+					std::cout << p.toString() << "    " <<  constant2 << "/" << constant << "/" << constant3 << "  " << it3.get().getKey() << "    " << v_cl.getProcessUnitID() << std::endl;
+					ret = false;
+					break;
+				}
+
+				++it3;
+			}
+			BOOST_REQUIRE_EQUAL(ret,true);
+		}
+	}
+
+	openfpm_finalize();
+}
\ No newline at end of file
diff --git a/src/lib/pdata.cpp b/src/lib/pdata.cpp
index aa5ceb8b2d4af36b719d87b7c190ceae6afd43cd..731f562cd0f64d8fec424cfb7f2843c187ebe654 100644
--- a/src/lib/pdata.cpp
+++ b/src/lib/pdata.cpp
@@ -23,3 +23,7 @@ const std::string nm_e::attributes::name[] = {"communication","srcgid","dstgid"}
 const std::string nm_part_v::attributes::name[] = {"id","sub_id"};
 const std::string nm_part_e::attributes::name[] = {"id"};
 
+double tot_merge = 0.0;
+double tot_loc_merge = 0.0;
+double tot_sendrecv = 0.0;
+double tot_pack = 0.0;
diff --git a/src/lib/pdata.hpp b/src/lib/pdata.hpp
index 919f9d6454cbd2f1668ec01fe77f799cb7d963ef..5418375628606322744fb42d3f801475e2eaedf6 100644
--- a/src/lib/pdata.hpp
+++ b/src/lib/pdata.hpp
@@ -6,5 +6,9 @@
 constexpr int comp_host = 1;
 constexpr int comp_dev = 2;
 
+extern double tot_merge;
+extern double tot_loc_merge;
+extern double tot_sendrecv;
+extern double tot_pack;
 
 #endif
diff --git a/src/scripts/postflight b/src/scripts/postflight
new file mode 100644
index 0000000000000000000000000000000000000000..0a932ae788699b4fb700e0a6fb005b8608d3535c
--- /dev/null
+++ b/src/scripts/postflight
@@ -0,0 +1,13 @@
+#! /bin/bash
+
+sed -i '' -e 's/Users\/rundeck/usr\/local\/openfpm\/dependencies/g' /usr/local/openfpm/source/openfpm_vars
+sed -i '' -e 's/openfpm_pre/openfpm/g' /usr/local/openfpm/source/openfpm_vars
+sed -i '' -e 's/dependencies\/openfpm_dependencies/dependencies/g' /usr/local/openfpm/source/openfpm_vars
+sed -i '' -e 's/Users\/rundeck/usr\/local\/openfpm\/dependencies/g' /usr/local/openfpm/openfpm_pdata/include/example.mk
+sed -i '' -e 's/openfpm_pre/openfpm/g' /usr/local/openfpm/openfpm_pdata/include/example.mk
+sed -i '' -e 's/dependencies\/openfpm_dependencies/dependencies/g' /usr/local/openfpm/openfpm_pdata/include/example.mk
+
+chmod a+x /usr/local/openfpm/dependencies/MPI/bin/*
+
+echo "export OPAL_PREFIX=/usr/local/openfpm/dependencies/MPI" >> /usr/local/openfpm/source/openfpm_vars
+
diff --git a/src/scripts/postinst b/src/scripts/postinst
new file mode 100644
index 0000000000000000000000000000000000000000..763d3ed561a4529800f9222de37392223c86c1b1
--- /dev/null
+++ b/src/scripts/postinst
@@ -0,0 +1,9 @@
+#! /bin/bash
+
+/projects\/ppm\/rundeck\/openfpm_super_bundles\/$1\/openfpm_dep_$1
+
+sed -i -e 's/projects\/ppm\/rundeck\/openfpm_super_bundles\/$1\/openfpm_dep_$1/usr\/local\/openfpm\/dependencies/g' /usr/local/openfpm/source/openfpm_vars
+sed -i -e 's/projects\/ppm\/rundeck\/openfpm_super_bundles\/$1\/openfpm_dep_$1/usr\/local\/openfpm\/dependencies/g' /usr/local/openfpm/openfpm_pdata/include/example.mk
+echo "export OPAL_PREFIX=/usr/local/openfpm/dependencies/MPI" >> /usr/local/openfpm/source/openfpm_vars
+
+
diff --git a/test_data/sgrid_gpu_output_1_0.vtk b/test_data/sgrid_gpu_output_1_0.vtk
index 4435178f5d4ab01c3c81f53f813db628a5871d4c..72ec5aa822068e2094132628121214e86ebbd211 100644
Binary files a/test_data/sgrid_gpu_output_1_0.vtk and b/test_data/sgrid_gpu_output_1_0.vtk differ
diff --git a/test_data/sgrid_gpu_output_2_0.vtk b/test_data/sgrid_gpu_output_2_0.vtk
index 4a2881cb496b475a232872cbff048fbcf40b56ab..5eec6e0e85213e0d8931fbf7b0b1fe6bf2b2e75d 100644
Binary files a/test_data/sgrid_gpu_output_2_0.vtk and b/test_data/sgrid_gpu_output_2_0.vtk differ
diff --git a/test_data/sgrid_gpu_output_2_1.vtk b/test_data/sgrid_gpu_output_2_1.vtk
index e9d5c3f52ebe3c43ab7ff2ba66048c893ad4145a..aa60b77a4dff9c1b28009fd600edb99fe7fd24e4 100644
Binary files a/test_data/sgrid_gpu_output_2_1.vtk and b/test_data/sgrid_gpu_output_2_1.vtk differ
diff --git a/test_data/sgrid_gpu_output_3_0.vtk b/test_data/sgrid_gpu_output_3_0.vtk
index 4a5b5abc09ecf5e98c38429a10c315ae6e69548b..6c3cd35349fa41292454b1fcac5b152b0642192b 100644
Binary files a/test_data/sgrid_gpu_output_3_0.vtk and b/test_data/sgrid_gpu_output_3_0.vtk differ
diff --git a/test_data/sgrid_gpu_output_3_1.vtk b/test_data/sgrid_gpu_output_3_1.vtk
index a56ffc0cf3af612d82623b60f907b98df7419474..1225f50b8007222b6cce5a4b01faf3b639341f01 100644
Binary files a/test_data/sgrid_gpu_output_3_1.vtk and b/test_data/sgrid_gpu_output_3_1.vtk differ
diff --git a/test_data/sgrid_gpu_output_3_2.vtk b/test_data/sgrid_gpu_output_3_2.vtk
index bcd4d9291097691367c5b6251c3b4ebb7e478ab8..3d0eebb7e615f16e7862a62df2d60df61879672d 100644
Binary files a/test_data/sgrid_gpu_output_3_2.vtk and b/test_data/sgrid_gpu_output_3_2.vtk differ
diff --git a/test_data/test_data_three.h5 b/test_data/test_data_three.h5
new file mode 100644
index 0000000000000000000000000000000000000000..95c9f16dc62de0e39d558c96005849c52d9cee88
Binary files /dev/null and b/test_data/test_data_three.h5 differ