From 1a1686d9414d652c95222d0de0822f19af90da67 Mon Sep 17 00:00:00 2001
From: Pietro Incardona <incardon@mpi-cbg.de>
Date: Fri, 24 Aug 2018 03:14:31 +0200
Subject: [PATCH] New MPI + MAP on device finally working

---
 configure.ac                                  |   1 +
 images/CartDecomposition_gen_vtk.cpp          |   2 +-
 images/Makefile.am                            |   8 +-
 images/vector.cpp                             |   4 +-
 install                                       |   2 +-
 openfpm_data                                  |   2 +-
 openfpm_devices                               |   2 +-
 openfpm_vcluster                              |   2 +-
 script/detect_gcc                             |  28 +++--
 script/install_MPI.sh                         |   3 +-
 script/remove_old                             |   2 +-
 src/DLB/DLB.hpp                               |   4 +-
 src/Decomposition/CartDecomposition.hpp       |  33 +++++-
 src/Decomposition/CartDecomposition_ext.hpp   |   2 +-
 .../Distribution/DistParMetisDistribution.hpp |   4 +-
 .../Distribution/Distribution_unit_tests.hpp  |   8 +-
 .../Distribution/MetisDistribution.hpp        |   4 +-
 .../Distribution/ParMetisDistribution.hpp     |   4 +-
 .../Distribution/SpaceDistribution.hpp        |   4 +-
 .../Distribution/metis_util_unit_test.hpp     |   2 +-
 .../Distribution/parmetis_dist_util.hpp       |   4 +-
 .../Distribution/parmetis_util.hpp            |   4 +-
 src/Decomposition/ORB.hpp                     |   2 +-
 .../cuda/CartDecomposition_gpu.cuh            |   3 +
 .../cuda/decomposition_cuda_tests.cu          |   2 +-
 src/Decomposition/dec_optimizer_unit_test.hpp |   2 +-
 src/Decomposition/ie_ghost.hpp                |   6 +-
 src/Decomposition/nn_processor_unit_test.hpp  |   6 +-
 .../tests/CartDecomposition_unit_test.cpp     |  12 +--
 src/Graph/DistGraphFactory.hpp                |   4 +-
 src/Graph/dist_map_graph.hpp                  |   4 +-
 src/Graph/dist_map_graph_unit_test.hpp        |   8 +-
 .../grid_dist_id_iterators_unit_tests.hpp     |  12 +--
 src/Grid/grid_dist_id.hpp                     |   4 +-
 src/Grid/grid_dist_id_comm.hpp                |   2 +-
 ...grid_dist_id_HDF5_chckpnt_restart_test.cpp |   4 +-
 src/Grid/tests/grid_dist_id_unit_test.cpp     |  34 +++---
 .../tests/grid_dist_id_unit_test_ext_dom.hpp  |   2 +-
 .../grid_dist_id_unit_test_unb_ghost.hpp      |   6 +-
 src/Makefile.am                               |  13 ++-
 src/Vector/cuda/vector_dist_cuda_func_test.cu |   7 +-
 src/Vector/cuda/vector_dist_cuda_funcs.cuh    |   4 +-
 .../{ => cuda}/vector_dist_gpu_unit_tests.cu  |  86 ++++++++++++++-
 .../vector_dist_performance_common.hpp        |   4 +-
 .../vector_dist_HDF5_chckpnt_restart_test.cpp |   2 +-
 src/Vector/tests/vector_dist_NN_tests.cpp     |   8 +-
 .../tests/vector_dist_cell_list_tests.cpp     |  26 ++---
 .../vector_dist_complex_prp_unit_test.cpp     |   2 +-
 src/Vector/tests/vector_dist_unit_test.cpp    |  34 +++---
 .../tests/vector_dist_util_unit_tests.hpp     |   6 +-
 src/Vector/vector_dist.hpp                    |  12 +--
 src/Vector/vector_dist_comm.hpp               | 101 +++++++++++++-----
 src/Vector/vector_dist_dlb_test.hpp           |   8 +-
 src/initialize/initialize_wrapper.hpp         |  20 ++++
 src/initialize/initialize_wrapper_cpu.cpp     |  13 +++
 src/initialize/initialize_wrapper_cuda.cu     |  12 +++
 src/unit_test_init_cleanup.hpp                |   6 +-
 57 files changed, 410 insertions(+), 196 deletions(-)
 rename src/Vector/{ => cuda}/vector_dist_gpu_unit_tests.cu (82%)
 create mode 100644 src/initialize/initialize_wrapper.hpp
 create mode 100644 src/initialize/initialize_wrapper_cpu.cpp
 create mode 100644 src/initialize/initialize_wrapper_cuda.cu

diff --git a/configure.ac b/configure.ac
index fe3d677d..694eb372 100644
--- a/configure.ac
+++ b/configure.ac
@@ -412,6 +412,7 @@ if test x"$NVCC_EXIST" = x"yes"; then
   fi
 else
   gpu_support=no
+  INITIALIZATION_SOURCE="initialize_openfpm.cpp"
 fi
 
 if test x$gpu_support = x"no"; then
diff --git a/images/CartDecomposition_gen_vtk.cpp b/images/CartDecomposition_gen_vtk.cpp
index 6e16b942..aa52e9d3 100644
--- a/images/CartDecomposition_gen_vtk.cpp
+++ b/images/CartDecomposition_gen_vtk.cpp
@@ -13,7 +13,7 @@ int main(int argc, char ** argv)
 	openfpm_init(&argc,&argv);
 
 	// Vcluster
-	Vcluster & vcl = create_vcluster();
+	Vcluster<> & vcl = create_vcluster();
 
 	//! [Create CartDecomposition vtk gen]
 	CartDecomposition<2,float> dec(vcl);
diff --git a/images/Makefile.am b/images/Makefile.am
index 5ec901fa..d25a0bcf 100644
--- a/images/Makefile.am
+++ b/images/Makefile.am
@@ -1,22 +1,22 @@
 LINKLIBS = $(HDF5_LDFLAGS)  $(HDF5_LIBS) $(OPENMP_LDFLAGS) $(LIBHILBERT_LIB) $(PETSC_LIB) $(SUITESPARSE_LIBS) $(LAPACK_LIBS) $(BLAS_LIBS)  $(METIS_LIB) $(PARMETIS_LIB)  $(PTHREAD_LIBS) $(OPT_LIBS) $(BOOST_LDFLAGS) $(BOOST_IOSTREAMS_LIB) $(CUDA_LIBS)
 
 noinst_PROGRAMS = cart_dec metis_dec dom_box vector_dist
-cart_dec_SOURCES = CartDecomposition_gen_vtk.cpp ../src/lib/pdata.cpp ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/Memleak_check.cpp
+cart_dec_SOURCES = CartDecomposition_gen_vtk.cpp ../src/lib/pdata.cpp ../openfpm_devices/src/memory/CudaMemory.cu ../openfpm_devices/src/memory/HeapMemory.cpp  ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/Memleak_check.cpp
 cart_dec_CXXFLAGS = -Wno-unknown-pragmas $(OPENMP_CFLAGS) $(AM_CXXFLAGS) $(PETSC_INCLUDE) $(METIS_INCLUDE) $(PARMETIS_INCLUDE) $(CUDA_CFLAGS) $(INCLUDES_PATH) $(BOOST_CPPFLAGS) -I../src -Wno-unused-function -Wno-unused-local-typedefs
 cart_dec_CFLAGS = $(OPENMP_CFLAGS) $(CUDA_CFLAGS)
 cart_dec_LDADD = $(LINKLIBS) -lparmetis -lmetis
 
-metis_dec_SOURCES = Metis_gen_vtk.cpp ../src/lib/pdata.cpp ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/Memleak_check.cpp
+metis_dec_SOURCES = Metis_gen_vtk.cpp ../src/lib/pdata.cpp ../openfpm_devices/src/memory/CudaMemory.cu  ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/Memleak_check.cpp
 metis_dec_CXXFLAGS = -Wno-unknown-pragmas $(OPENMP_CFLAGS) $(AM_CXXFLAGS) $(PETSC_INCLUDE) $(METIS_INCLUDE) $(CUDA_CFLAGS) $(INCLUDES_PATH) $(BOOST_CPPFLAGS) -I../src -Wno-unused-function -Wno-unused-local-typedefs
 metis_dec_CFLAGS = $(OPENMP_CFLAGS) $(CUDA_CFLAGS)
 metis_dec_LDADD = $(LINKLIBS) -lmetis
 
-dom_box_SOURCES = domain_gen_vtk.cpp ../src/lib/pdata.cpp ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/Memleak_check.cpp
+dom_box_SOURCES = domain_gen_vtk.cpp ../src/lib/pdata.cpp ../openfpm_devices/src/memory/CudaMemory.cu  ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/Memleak_check.cpp
 dom_box_CXXFLAGS = -Wno-unknown-pragmas $(OPENMP_CFLAGS) $(AM_CXXFLAGS) $(PETSC_INCLUDE) $(METIS_INCLUDE) $(CUDA_CFLAGS) $(INCLUDES_PATH) $(BOOST_CPPFLAGS) -I../src -Wno-unused-function -Wno-unused-local-typedefs
 dom_box_CFLAGS = $(OPENMP_CFLAGS) $(CUDA_CFLAGS)
 dom_box_LDADD = $(LINKLIBS)
 
-vector_dist_SOURCES = vector.cpp ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_devices/src/Memleak_check.cpp
+vector_dist_SOURCES = vector.cpp ../openfpm_devices/src/memory/CudaMemory.cu  ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_devices/src/Memleak_check.cpp
 vector_dist_CXXFLAGS = -Wno-unknown-pragmas $(OPENMP_CFLAGS) $(AM_CXXFLAGS) $(LIBHILBERT_INCLUDE) $(PETSC_INCLUDE) $(PARMETIS_INCLUDE) $(METIS_INCLUDE) $(CUDA_CFLAGS) $(INCLUDES_PATH) $(HDF5_CPPFLAGS) $(BOOST_CPPFLAGS) -I../src -Wno-unused-function -Wno-unused-local-typedefs
 vector_dist_CFLAGS = $(OPENMP_CFLAGS) $(CUDA_CFLAGS)
 vector_dist_LDADD = $(LINKLIBS) -lparmetis -lmetis
diff --git a/images/vector.cpp b/images/vector.cpp
index 9414d2e6..52b7d900 100644
--- a/images/vector.cpp
+++ b/images/vector.cpp
@@ -50,9 +50,7 @@ int main(int argc, char* argv[])
 	// randomly in the domain, we create a Box that define our domain, boundary conditions, and ghost
 	//
 	openfpm_init(&argc,&argv);
-	Vcluster & v_cl = create_vcluster();
-	
-	typedef Point<2,float> s;
+	Vcluster<> & v_cl = create_vcluster();
 
 	// set the seed
 	// create the random generator engine
diff --git a/install b/install
index 49063cb9..f5ae54e5 100755
--- a/install
+++ b/install
@@ -176,7 +176,6 @@ if [ $? -ne 0 ]; then
   exit 1
 fi
 
-
 ## check for options
 
 echo -e "\033[1;34;5mDEPENCENCIES INSTALLATION DIR      \033[0m"
@@ -317,6 +316,7 @@ HDF5_System_prv=1
 LIBHILBERT_installed=0
 conf_err=1
 
+
 if [ $install_req -eq 0 ]; then
     ./configure $options $configure_options "$configure_blas_option"
 else
diff --git a/openfpm_data b/openfpm_data
index 66fdd0bd..0261ed4b 160000
--- a/openfpm_data
+++ b/openfpm_data
@@ -1 +1 @@
-Subproject commit 66fdd0bdd308bd78efe3c14a7d2bf9290e35f550
+Subproject commit 0261ed4b804309864d899bedec46a67e5e6cfe95
diff --git a/openfpm_devices b/openfpm_devices
index c751d466..e5d5d31a 160000
--- a/openfpm_devices
+++ b/openfpm_devices
@@ -1 +1 @@
-Subproject commit c751d466195fd0271acf1933a6b0e9d2233d3529
+Subproject commit e5d5d31a0af09312118f72c0818a824443ba80fb
diff --git a/openfpm_vcluster b/openfpm_vcluster
index ec95a342..11a0f824 160000
--- a/openfpm_vcluster
+++ b/openfpm_vcluster
@@ -1 +1 @@
-Subproject commit ec95a342bc6fc783099f3cb044ff4524348bfedf
+Subproject commit 11a0f824c111e686ad2388cec485ffcaafab5560
diff --git a/script/detect_gcc b/script/detect_gcc
index 0905b02b..46c89cba 100755
--- a/script/detect_gcc
+++ b/script/detect_gcc
@@ -4,20 +4,6 @@ function haveProg() {
     [ -x "$(command -v $1)" ]
 }
 
-function gpp_clang()
-{
-    if [ x"$possible_solutions_command" == x"g++" ]; then
-        CXX=g++
-        CC=gcc
-        F77=gfortran
-        FC=gfortran
-    else
-        CXX=clang++
-        CC=clang
-        F77=gfortran
-        FC=gfortran
-    fi
-}
 
 function detect_compiler()
 {
@@ -25,6 +11,7 @@ function detect_compiler()
     icpc_found=0
     dgc_ret=0
 
+
      if [ x"$CXX" != x"" -o x"$CC" != x"" -o x"$F77" != x"" -o x"$FC" != x"" ]; then
 
         if [ x"$CXX" == x"" ]; then
@@ -65,6 +52,7 @@ function detect_compiler()
         return
     fi
 
+
     # First we try to understand if g++ command line is linked to clang
     if haveProg g++; then
 
@@ -111,6 +99,7 @@ function detect_compiler()
         fi
       fi
 
+
       g++ --version | grep "g++" > /dev/null 2>&1
       if [ $? == 0 -a $gpp_found -ne 1 ]; then
         dgc_major=$(g++ --version | grep g++ | sed 's/.*\([0-9][0-9]*\)\.\([0-9][0-9]*\)\.\([0-9][0-9]*\).*/\1/g')
@@ -147,6 +136,7 @@ function detect_compiler()
         fi
       fi
 
+
       g++ --version | grep "clang" > /dev/null 2>&1
       if [ $? == 0 -a $gpp_found -ne 1 ]; then
         dgc_major=$(g++ --version | grep g++ | sed 's/.*\([0-9][0-9]*\)\.\([0-9][0-9]*\)\.\([0-9][0-9]*\).*/\1/g')
@@ -224,11 +214,17 @@ function detect_compiler()
           FC=ifort
           dgc_compiler=icpc
         else
-          gpp_clang
+          CXX=g++
+          CC=gcc
+          F77=gfortran
+          FC=gfortran
         fi
         dgc_ret=1
     elif [ $gpp_found -eq 1 ]; then
-        gpp_clang
+        CXX=g++
+        CC=gcc
+        F77=gfortran
+        FC=gfortran
     elif [ $icpc_found -eq 1 ]; then
           CXX=icpc
           CC=icc
diff --git a/script/install_MPI.sh b/script/install_MPI.sh
index aeaa2060..583a881d 100755
--- a/script/install_MPI.sh
+++ b/script/install_MPI.sh
@@ -25,7 +25,8 @@ cd openmpi-3.1.1
 #
 #
 
-./configure --with-cuda --prefix=$1/MPI --enable-mpi-fortran=yes CC=$3 CXX=$4 F77=$4 FC=$5
+
+./configure --with-cuda --prefix=$1/MPI --enable-mpi-fortran=yes CC=$3 CXX=$4 F77=$5 FC=$5
 make -j $2
 make install
 
diff --git a/script/remove_old b/script/remove_old
index d69d1ab7..459682d9 100755
--- a/script/remove_old
+++ b/script/remove_old
@@ -191,7 +191,7 @@ function remove_old()
         version=$(cat $1/MPI/version)
         if [ x"$version" != x"3"  ]; then
             echo -e "\033[1;34;5m  -------------------------------------------------------------------------------------- \033[0m"
-            echo -e "\033[1;34;5m  MPI has been updated to version 2.1.1, the component will be updated automatically      \033[0m"
+            echo -e "\033[1;34;5m  MPI has been updated to version 3.1.1, the component will be updated automatically      \033[0m"
             echo -e "\033[1;34;5m  -------------------------------------------------------------------------------------- \033[0m"
             sleep 5
             rm -rf $1/MPI/include
diff --git a/src/DLB/DLB.hpp b/src/DLB/DLB.hpp
index 81a788ad..fa04bc53 100644
--- a/src/DLB/DLB.hpp
+++ b/src/DLB/DLB.hpp
@@ -69,7 +69,7 @@ public:
 private:
 
 	//! Runtime virtual cluster machine
-	Vcluster & v_cl;
+	Vcluster<> & v_cl;
 
 	//! Structure that will contain all the timings
 	Times timeInfo;
@@ -165,7 +165,7 @@ public:
 	 *
 	 * \param v_cl virtual cluster object
 	 */
-	DLB(Vcluster & v_cl) :
+	DLB(Vcluster<> & v_cl) :
 			v_cl(v_cl)
 	{
 	}
diff --git a/src/Decomposition/CartDecomposition.hpp b/src/Decomposition/CartDecomposition.hpp
index 1c212cbb..c651d93a 100755
--- a/src/Decomposition/CartDecomposition.hpp
+++ b/src/Decomposition/CartDecomposition.hpp
@@ -197,7 +197,7 @@ protected:
 	size_t magn[dim];
 
 	//! Runtime virtual cluster machine
-	Vcluster & v_cl;
+	Vcluster<> & v_cl;
 
 	//! Create distribution
 	Distribution dist;
@@ -353,7 +353,7 @@ public:
 	 * \param opt option (one option is to construct)
 	 *
 	 */
-	void createSubdomains(Vcluster & v_cl, const size_t (& bc)[dim], size_t opt = 0)
+	void createSubdomains(Vcluster<> & v_cl, const size_t (& bc)[dim], size_t opt = 0)
 	{
 		int p_id = v_cl.getProcessUnitID();
 
@@ -708,7 +708,7 @@ public:
 	 * \param v_cl Virtual cluster, used internally to handle or pipeline communication
 	 *
 	 */
-	CartDecomposition(Vcluster & v_cl)
+	CartDecomposition(Vcluster<> & v_cl)
 	:nn_prcs<dim, T>(v_cl), v_cl(v_cl), dist(v_cl),ref_cnt(0)
 	{
 		// Reset the box to zero
@@ -1622,6 +1622,31 @@ public:
 		return processorID<Mem>(pt) == v_cl.getProcessUnitID();
 	}
 
+	/*! \brief Check if the particle is local considering boundary conditions
+	 *
+	 * \warning if the particle is outside the domain along a non-periodic
+	 *          direction the result is unreliable
+	 *
+	 * Coordinates along PERIODIC directions are adjusted with periodic_l using the domain bounds
+	 * \param p object position
+	 * \param bc boundary conditions, one entry per dimension
+	 *
+	 * \return true if processorID of the adjusted position matches this process unit id
+	 *
+	 */
+	bool isLocalBC(const Point<dim,T> & p, const size_t (& bc)[dim]) const
+	{
+		Point<dim,T> pt = p;
+
+		for (size_t i = 0 ; i < dim ; i++)
+		{
+			if (bc[i] == PERIODIC)
+				pt.get(i) = openfpm::math::periodic_l(p[i],domain.getHigh(i),domain.getLow(i));
+		}
+
+		return processorID(pt) == v_cl.getProcessUnitID();
+	}
+
 	/*! \brief Get the domain Cells
 	 *
 	 * It return all the cells-id that are inside the processor-domain
@@ -1789,7 +1814,7 @@ public:
 	 * \return the Virtual cluster machine
 	 *
 	 */
-	Vcluster & getVC() const
+	Vcluster<> & getVC() const
 	{
 #ifdef SE_CLASS2
 		check_valid(this,8);
diff --git a/src/Decomposition/CartDecomposition_ext.hpp b/src/Decomposition/CartDecomposition_ext.hpp
index ce5a8156..23c26f7e 100644
--- a/src/Decomposition/CartDecomposition_ext.hpp
+++ b/src/Decomposition/CartDecomposition_ext.hpp
@@ -167,7 +167,7 @@ public:
 	 * \param v_cl VCluster
 	 *
 	 */
-	CartDecomposition_ext(Vcluster & v_cl)
+	CartDecomposition_ext(Vcluster<> & v_cl)
 	:CartDecomposition<dim,T,Memory,layout_base,Distribution>(v_cl)
 	{
 	}
diff --git a/src/Decomposition/Distribution/DistParMetisDistribution.hpp b/src/Decomposition/Distribution/DistParMetisDistribution.hpp
index 528e163a..1f403bfa 100644
--- a/src/Decomposition/Distribution/DistParMetisDistribution.hpp
+++ b/src/Decomposition/Distribution/DistParMetisDistribution.hpp
@@ -17,7 +17,7 @@ template<unsigned int dim, typename T>
 class DistParMetisDistribution
 {
 	//! Vcluster
-	Vcluster & v_cl;
+	Vcluster<> & v_cl;
 
 	//! Structure that store the cartesian grid information
 	grid_sm<dim, void> gr;
@@ -73,7 +73,7 @@ public:
 	 *
 	 * @param v_cl Vcluster to use as communication object in this class
 	 */
-	DistParMetisDistribution(Vcluster & v_cl) :
+	DistParMetisDistribution(Vcluster<> & v_cl) :
 			v_cl(v_cl), parmetis_graph(v_cl, v_cl.getProcessingUnits()), vtxdist(v_cl.getProcessingUnits() + 1), partitions(v_cl.getProcessingUnits()), v_per_proc(v_cl.getProcessingUnits())
 
 	{
diff --git a/src/Decomposition/Distribution/Distribution_unit_tests.hpp b/src/Decomposition/Distribution/Distribution_unit_tests.hpp
index 1c527d5b..ac8d35ea 100644
--- a/src/Decomposition/Distribution/Distribution_unit_tests.hpp
+++ b/src/Decomposition/Distribution/Distribution_unit_tests.hpp
@@ -59,7 +59,7 @@ BOOST_AUTO_TEST_SUITE (Distribution_test)
 
 BOOST_AUTO_TEST_CASE( Metis_distribution_test)
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() != 3)
 	return;
@@ -180,7 +180,7 @@ BOOST_AUTO_TEST_CASE( Metis_distribution_test)
 
 BOOST_AUTO_TEST_CASE( Parmetis_distribution_test)
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() != 3)
 	return;
@@ -286,7 +286,7 @@ BOOST_AUTO_TEST_CASE( Parmetis_distribution_test)
 
 BOOST_AUTO_TEST_CASE( DistParmetis_distribution_test)
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() != 3)
 		return;
@@ -387,7 +387,7 @@ BOOST_AUTO_TEST_CASE( DistParmetis_distribution_test)
 
 BOOST_AUTO_TEST_CASE( Space_distribution_test)
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() != 3)
 		return;
diff --git a/src/Decomposition/Distribution/MetisDistribution.hpp b/src/Decomposition/Distribution/MetisDistribution.hpp
index 2b27a9e3..caa5efd0 100644
--- a/src/Decomposition/Distribution/MetisDistribution.hpp
+++ b/src/Decomposition/Distribution/MetisDistribution.hpp
@@ -30,7 +30,7 @@ template<unsigned int dim, typename T>
 class MetisDistribution
 {
 	//! Vcluster
-	Vcluster & v_cl;
+	Vcluster<> & v_cl;
 
 	//! Structure that store the cartesian grid information
 	grid_sm<dim, void> gr;
@@ -111,7 +111,7 @@ public:
 	 * \param v_cl vcluster
 	 *
 	 */
-	MetisDistribution(Vcluster & v_cl)
+	MetisDistribution(Vcluster<> & v_cl)
 	:v_cl(v_cl),metis_graph(gp)
 	{
 #ifdef SE_CLASS2
diff --git a/src/Decomposition/Distribution/ParMetisDistribution.hpp b/src/Decomposition/Distribution/ParMetisDistribution.hpp
index 98412620..3718ce0c 100644
--- a/src/Decomposition/Distribution/ParMetisDistribution.hpp
+++ b/src/Decomposition/Distribution/ParMetisDistribution.hpp
@@ -39,7 +39,7 @@ class ParMetisDistribution
 	bool is_distributed = false;
 
 	//! Vcluster
-	Vcluster & v_cl;
+	Vcluster<> & v_cl;
 
 	//! Structure that store the cartesian grid information
 	grid_sm<dim, void> gr;
@@ -283,7 +283,7 @@ public:
 	 *
 	 * \param v_cl Vcluster to use as communication object in this class
 	 */
-	ParMetisDistribution(Vcluster & v_cl)
+	ParMetisDistribution(Vcluster<> & v_cl)
 	:is_distributed(false),v_cl(v_cl), parmetis_graph(v_cl, v_cl.getProcessingUnits()), vtxdist(v_cl.getProcessingUnits() + 1), partitions(v_cl.getProcessingUnits()), v_per_proc(v_cl.getProcessingUnits())
 	{
 	}
diff --git a/src/Decomposition/Distribution/SpaceDistribution.hpp b/src/Decomposition/Distribution/SpaceDistribution.hpp
index 26343e14..714a67b0 100644
--- a/src/Decomposition/Distribution/SpaceDistribution.hpp
+++ b/src/Decomposition/Distribution/SpaceDistribution.hpp
@@ -24,7 +24,7 @@ template<unsigned int dim, typename T>
 class SpaceDistribution
 {
 	//! Vcluster
-	Vcluster & v_cl;
+	Vcluster<> & v_cl;
 
 	//! Structure that store the cartesian grid information
 	grid_sm<dim, void> gr;
@@ -42,7 +42,7 @@ public:
 	 *
 	 * \param v_cl Vcluster to use as communication object in this class
 	 */
-	SpaceDistribution(Vcluster & v_cl)
+	SpaceDistribution(Vcluster<> & v_cl)
 	:v_cl(v_cl)
 	{
 	}
diff --git a/src/Decomposition/Distribution/metis_util_unit_test.hpp b/src/Decomposition/Distribution/metis_util_unit_test.hpp
index 3dd1dc69..0b2f4d1c 100644
--- a/src/Decomposition/Distribution/metis_util_unit_test.hpp
+++ b/src/Decomposition/Distribution/metis_util_unit_test.hpp
@@ -25,7 +25,7 @@ BOOST_AUTO_TEST_SUITE( Metis_test )
 
 BOOST_AUTO_TEST_CASE( Metis_test_use)
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() != 3)
 		return;
diff --git a/src/Decomposition/Distribution/parmetis_dist_util.hpp b/src/Decomposition/Distribution/parmetis_dist_util.hpp
index d9e482b4..1952ee5d 100755
--- a/src/Decomposition/Distribution/parmetis_dist_util.hpp
+++ b/src/Decomposition/Distribution/parmetis_dist_util.hpp
@@ -103,7 +103,7 @@ class DistParmetis
 	MPI_Comm comm = (MPI_Comm)NULL;
 
 	//! VCluster
-	Vcluster & v_cl;
+	Vcluster<> & v_cl;
 
 	//! Process rank information
 	int p_id = 0;
@@ -178,7 +178,7 @@ public:
 	 * \param nc number of partitions
 	 *
 	 */
-	DistParmetis(Vcluster & v_cl, size_t nc) :
+	DistParmetis(Vcluster<> & v_cl, size_t nc) :
 			v_cl(v_cl), nc(nc)
 	{
 		// TODO Move into VCluster
diff --git a/src/Decomposition/Distribution/parmetis_util.hpp b/src/Decomposition/Distribution/parmetis_util.hpp
index 68fbf891..abe33bc1 100755
--- a/src/Decomposition/Distribution/parmetis_util.hpp
+++ b/src/Decomposition/Distribution/parmetis_util.hpp
@@ -108,7 +108,7 @@ class Parmetis
 	MPI_Comm comm = (MPI_Comm)NULL;
 
 	//! VCluster
-	Vcluster & v_cl;
+	Vcluster<> & v_cl;
 
 	//! Process rank information
 	int p_id = 0;
@@ -210,7 +210,7 @@ public:
 	 * \param nc number of partitions
 	 *
 	 */
-	Parmetis(Vcluster & v_cl, size_t nc)
+	Parmetis(Vcluster<> & v_cl, size_t nc)
 	:v_cl(v_cl), nc(nc),n_dec(0)
 	{
 #ifdef SE_CLASS1
diff --git a/src/Decomposition/ORB.hpp b/src/Decomposition/ORB.hpp
index e9422298..cf9582d5 100755
--- a/src/Decomposition/ORB.hpp
+++ b/src/Decomposition/ORB.hpp
@@ -98,7 +98,7 @@ template<unsigned int dim, typename T, typename loc_wg=openfpm::vector<float>, t
 class ORB
 {
 	// Virtual cluster
-	Vcluster & v_cl;
+	Vcluster<> & v_cl;
 
 	// particle coordinate accumulator
 	openfpm::vector<T> cm;
diff --git a/src/Decomposition/cuda/CartDecomposition_gpu.cuh b/src/Decomposition/cuda/CartDecomposition_gpu.cuh
index ed9e5a52..a6592116 100644
--- a/src/Decomposition/cuda/CartDecomposition_gpu.cuh
+++ b/src/Decomposition/cuda/CartDecomposition_gpu.cuh
@@ -20,6 +20,7 @@ __global__ void process_id_proc_each_part(cartdec_gpu cdg, particles_type parts,
 	Point<3,float> xp = parts.template get<0>(p);
 
 	int pr = cdg.processorIDBC(xp);
+
 	output.template get<1>(p) = (pr == rank)?-1:pr;
 	output.template get<0>(p) = p;
 }
@@ -55,6 +56,8 @@ __device__ __host__ inline int processorID_impl(T2 & p, fine_s_type & fine_s, vs
 
 #endif
 
+
+
 	return sub_domains_global.template get<1>(e);
 }
 
diff --git a/src/Decomposition/cuda/decomposition_cuda_tests.cu b/src/Decomposition/cuda/decomposition_cuda_tests.cu
index acc21795..b8322092 100644
--- a/src/Decomposition/cuda/decomposition_cuda_tests.cu
+++ b/src/Decomposition/cuda/decomposition_cuda_tests.cu
@@ -14,7 +14,7 @@ BOOST_AUTO_TEST_CASE( decomposition_to_gpu_test_use )
 	auto & v_cl = create_vcluster();
 
 	// Vcluster
-	Vcluster & vcl = create_vcluster();
+	Vcluster<> & vcl = create_vcluster();
 
 	CartDecomposition<3, float, CudaMemory, memory_traits_inte> dec(vcl);
 
diff --git a/src/Decomposition/dec_optimizer_unit_test.hpp b/src/Decomposition/dec_optimizer_unit_test.hpp
index eeaf821c..c23b33c6 100644
--- a/src/Decomposition/dec_optimizer_unit_test.hpp
+++ b/src/Decomposition/dec_optimizer_unit_test.hpp
@@ -135,7 +135,7 @@ BOOST_AUTO_TEST_CASE( dec_optimizer_test_use_p)
 BOOST_AUTO_TEST_CASE( dec_optimizer_disconnected_subdomains_np)
 {
 	// Vcluster
-	Vcluster & vcl = create_vcluster();
+	Vcluster<> & vcl = create_vcluster();
 
 	// Test for only 3 processors
 	if (vcl.getProcessingUnits() != 3)
diff --git a/src/Decomposition/ie_ghost.hpp b/src/Decomposition/ie_ghost.hpp
index 853dd580..ed5a3341 100755
--- a/src/Decomposition/ie_ghost.hpp
+++ b/src/Decomposition/ie_ghost.hpp
@@ -140,7 +140,7 @@ class ie_ghost
 	* \note To an explanation about the sectors see getShiftVectors
 	*
 	*/
-	inline size_t ebx_ibx_form(size_t k, size_t b, size_t p_id, const comb<dim> & c ,size_t N_b, Vcluster & v_cl, const bool ei)
+	inline size_t ebx_ibx_form(size_t k, size_t b, size_t p_id, const comb<dim> & c ,size_t N_b, Vcluster<> & v_cl, const bool ei)
 	{
 		comb<dim> cext = c;
 
@@ -192,7 +192,7 @@ protected:
 	 * \see calculateGhostBoxes
 	 *
 	 */
-	void create_box_nn_processor_ext(Vcluster & v_cl,
+	void create_box_nn_processor_ext(Vcluster<> & v_cl,
 			                         Ghost<dim,T> & ghost,
 									 openfpm::vector<SpaceBox<dim,T>> & sub_domains,
 									 const openfpm::vector<openfpm::vector<long unsigned int> > & box_nn_processor,
@@ -293,7 +293,7 @@ protected:
 	 * \see calculateGhostBoxes
 	 *
 	 */
-	void create_box_nn_processor_int(Vcluster & v_cl,
+	void create_box_nn_processor_int(Vcluster<> & v_cl,
 			                         Ghost<dim,T> & ghost,
 									 openfpm::vector<SpaceBox<dim,T>> & sub_domains,
 									 const openfpm::vector<openfpm::vector<long unsigned int> > & box_nn_processor,
diff --git a/src/Decomposition/nn_processor_unit_test.hpp b/src/Decomposition/nn_processor_unit_test.hpp
index 6f20ccb9..e7e6290f 100644
--- a/src/Decomposition/nn_processor_unit_test.hpp
+++ b/src/Decomposition/nn_processor_unit_test.hpp
@@ -12,7 +12,7 @@
 
 void create_decomposition2x2(openfpm::vector<openfpm::vector<long unsigned int>> & box_nn_processor, openfpm::vector<SpaceBox<2,float>> & sub_domains)
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	box_nn_processor.add();
 
@@ -54,7 +54,7 @@ BOOST_AUTO_TEST_SUITE( nn_processor_test )
 
 BOOST_AUTO_TEST_CASE( nn_processor_np_test)
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	/*!
 	 *
@@ -202,7 +202,7 @@ BOOST_AUTO_TEST_CASE( nn_processor_np_test)
 BOOST_AUTO_TEST_CASE( nn_processor_box_periodic_test)
 {
 	// Vcluster
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	/*!
 	 *
diff --git a/src/Decomposition/tests/CartDecomposition_unit_test.cpp b/src/Decomposition/tests/CartDecomposition_unit_test.cpp
index 0b03f249..cd53d612 100755
--- a/src/Decomposition/tests/CartDecomposition_unit_test.cpp
+++ b/src/Decomposition/tests/CartDecomposition_unit_test.cpp
@@ -55,7 +55,7 @@ void setComputationCosts3D(CartDecomposition<3, float> &dec, size_t n_v, Point<3
 BOOST_AUTO_TEST_CASE( CartDecomposition_non_periodic_test)
 {
 	// Vcluster
-	Vcluster & vcl = create_vcluster();
+	Vcluster<> & vcl = create_vcluster();
 
 	CartDecomposition<3, float> dec(vcl);
 
@@ -144,7 +144,7 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_non_periodic_test)
 BOOST_AUTO_TEST_CASE( CartDecomposition_periodic_test)
 {
 	// Vcluster
-	Vcluster & vcl = create_vcluster();
+	Vcluster<> & vcl = create_vcluster();
 
 	//! [Create CartDecomposition]
 	CartDecomposition<3, float> dec(vcl);
@@ -239,7 +239,7 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_periodic_test)
 BOOST_AUTO_TEST_CASE( CartDecomposition_ext_non_periodic_test)
 {
 	// Vcluster
-	Vcluster & vcl = create_vcluster();
+	Vcluster<> & vcl = create_vcluster();
 
 	CartDecomposition<3,float> dec(vcl);
 
@@ -344,7 +344,7 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_ext_non_periodic_test)
 BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idbc_and_ghost )
 {
 	// Vcluster
-	Vcluster & vcl = create_vcluster();
+	Vcluster<> & vcl = create_vcluster();
 
 	if (vcl.size() != 3)
 	{return;}
@@ -392,7 +392,7 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
 BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idbc_and_ghost2 )
 {
 	// Vcluster
-	Vcluster & vcl = create_vcluster();
+	Vcluster<> & vcl = create_vcluster();
 
 	CartDecomposition<3, double> dec(vcl);
 
@@ -488,7 +488,7 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
 BOOST_AUTO_TEST_CASE( CartDecomposition_non_periodic_test_dist_grid)
 {
 	// Vcluster
-	Vcluster & vcl = create_vcluster();
+	Vcluster<> & vcl = create_vcluster();
 
 	CartDecomposition<3, float> dec(vcl);
 
diff --git a/src/Graph/DistGraphFactory.hpp b/src/Graph/DistGraphFactory.hpp
index f724dac4..cdd8c508 100755
--- a/src/Graph/DistGraphFactory.hpp
+++ b/src/Graph/DistGraphFactory.hpp
@@ -240,7 +240,7 @@ public:
 	//! Construct Cartesian graph
 	static Graph construct(const size_t (&sz)[dim], Box<dim, T> dom)
 	{
-		Vcluster &v_cl = create_vcluster();
+		Vcluster<> &v_cl = create_vcluster();
 
 		// Calculate the size of the hyper-cubes on each dimension
 		T szd[dim];
@@ -394,7 +394,7 @@ public:
 	//! Construct Cartesian graph
 	static Graph construct(const size_t (&sz)[dim], Box<dim, T> dom)
 	{
-		Vcluster &v_cl = create_vcluster();
+		Vcluster<> &v_cl = create_vcluster();
 
 		// Calculate the size of the hyper-cubes on each dimension
 
diff --git a/src/Graph/dist_map_graph.hpp b/src/Graph/dist_map_graph.hpp
index f400aed3..d103b4f7 100644
--- a/src/Graph/dist_map_graph.hpp
+++ b/src/Graph/dist_map_graph.hpp
@@ -209,7 +209,7 @@ template<typename V, typename E = no_edge,
 class DistGraph_CSR
 {
 	//! Vcluster communication object
-	Vcluster & vcl;
+	Vcluster<> & vcl;
 
 	//! Distribution vector
 	openfpm::vector<idx_t> vtxdist;
@@ -1147,7 +1147,7 @@ public:
 	 * \param gg distributed graph to copy
 	 *
 	 */
-	DistGraph_CSR(Vcluster & vcl, DistGraph_CSR<V, E, Memory> && g) :
+	DistGraph_CSR(Vcluster<> & vcl, DistGraph_CSR<V, E, Memory> && g) :
 			vcl(vcl)
 	{
 		swap(g);
diff --git a/src/Graph/dist_map_graph_unit_test.hpp b/src/Graph/dist_map_graph_unit_test.hpp
index f761d7a1..3ad6bc7f 100644
--- a/src/Graph/dist_map_graph_unit_test.hpp
+++ b/src/Graph/dist_map_graph_unit_test.hpp
@@ -127,7 +127,7 @@ BOOST_AUTO_TEST_CASE( dist_map_graph_use)
 {
 
 	//! Vcluster
-	Vcluster & vcl = create_vcluster();
+	Vcluster<> & vcl = create_vcluster();
 
 	if(vcl.getProcessingUnits() != 4)
 		return;
@@ -241,7 +241,7 @@ BOOST_AUTO_TEST_CASE( dist_map_graph_use)
 BOOST_AUTO_TEST_CASE( dist_map_graph_use_redistribution)
 {
 	//! Vcluster
-	Vcluster & vcl = create_vcluster();
+	Vcluster<> & vcl = create_vcluster();
 
 	if(vcl.getProcessingUnits() != 4)
 		return;
@@ -324,7 +324,7 @@ BOOST_AUTO_TEST_CASE( dist_map_graph_use_redistribution)
 BOOST_AUTO_TEST_CASE( dist_map_graph_use_free_add)
 {
 	// Vcluster
-	Vcluster & vcl = create_vcluster();
+	Vcluster<> & vcl = create_vcluster();
 
 	if(vcl.getProcessingUnits() != 4)
 		return;
@@ -480,7 +480,7 @@ BOOST_AUTO_TEST_CASE( dist_map_graph_use_free_add)
 BOOST_AUTO_TEST_CASE( dist_map_graph_use_multi_free_add)
 {
 	// Vcluster
-	Vcluster & vcl = create_vcluster();
+	Vcluster<> & vcl = create_vcluster();
 
 	if(vcl.getProcessingUnits() != 4)
 		return;
diff --git a/src/Grid/Iterators/grid_dist_id_iterators_unit_tests.hpp b/src/Grid/Iterators/grid_dist_id_iterators_unit_tests.hpp
index 0c62f40e..41ff9899 100644
--- a/src/Grid/Iterators/grid_dist_id_iterators_unit_tests.hpp
+++ b/src/Grid/Iterators/grid_dist_id_iterators_unit_tests.hpp
@@ -93,7 +93,7 @@ void Test2D_sub(const Box<2,float> & domain, long int k)
 		}
 
 		// Get the virtual cluster machine
-		Vcluster & vcl = g_dist.getVC();
+		Vcluster<> & vcl = g_dist.getVC();
 
 		// reduce
 		vcl.sum(count);
@@ -149,7 +149,7 @@ void Test3D_decit(const Box<3,float> & domain, long int k)
 {
 	size_t k_bck = k;
 	{
-		Vcluster & v_cl = create_vcluster();
+		Vcluster<> & v_cl = create_vcluster();
 
 		if ( v_cl.getProcessingUnits() > 32 )
 			return;
@@ -216,7 +216,7 @@ void Test3D_decit(const Box<3,float> & domain, long int k)
 	k = k_bck;
 
 	{
-		Vcluster & v_cl = create_vcluster();
+		Vcluster<> & v_cl = create_vcluster();
 
 		if ( v_cl.getProcessingUnits() > 32 )
 			return;
@@ -292,7 +292,7 @@ void Test3D_stencil(const Box<3,float> & domain, long int k)
 											 {1,0,0}};
 
 	{
-		Vcluster & v_cl = create_vcluster();
+		Vcluster<> & v_cl = create_vcluster();
 
 		if ( v_cl.getProcessingUnits() > 32 )
 			return;
@@ -393,7 +393,7 @@ void Test3D_fast_vect(const Box<3,float> & domain, long int k)
 											 {1,0,0}};
 
 	{
-		Vcluster & v_cl = create_vcluster();
+		Vcluster<> & v_cl = create_vcluster();
 
 		if ( v_cl.getProcessingUnits() > 32 )
 			return;
@@ -478,7 +478,7 @@ void Test3D_fast_vect(const Box<3,float> & domain, long int k)
 void Test3D_decskinit(const Box<3,float> & domain, long int k)
 {
 	{
-		Vcluster & v_cl = create_vcluster();
+		Vcluster<> & v_cl = create_vcluster();
 
 		if ( v_cl.getProcessingUnits() > 32 )
 			return;
diff --git a/src/Grid/grid_dist_id.hpp b/src/Grid/grid_dist_id.hpp
index fab6c7e6..7aaacaed 100644
--- a/src/Grid/grid_dist_id.hpp
+++ b/src/Grid/grid_dist_id.hpp
@@ -102,7 +102,7 @@ class grid_dist_id : public grid_dist_id_comm<dim,St,T,Decomposition,Memory,devi
 	CellDecomposer_sm<dim,St,shift<dim,St>> cd_sm;
 
 	//! Communicator class
-	Vcluster & v_cl;
+	Vcluster<> & v_cl;
 
 	//! properties names
 	openfpm::vector<std::string> prp_names;
@@ -1268,7 +1268,7 @@ public:
 	 * \return the Virtual cluster machine
 	 *
 	 */
-	Vcluster & getVC()
+	Vcluster<> & getVC()
 	{
 #ifdef SE_CLASS2
 		check_valid(this,8);
diff --git a/src/Grid/grid_dist_id_comm.hpp b/src/Grid/grid_dist_id_comm.hpp
index 5533a3ca..dae22b2f 100644
--- a/src/Grid/grid_dist_id_comm.hpp
+++ b/src/Grid/grid_dist_id_comm.hpp
@@ -177,7 +177,7 @@ template<unsigned int dim, typename St, typename T, typename Decomposition = Car
 class grid_dist_id_comm
 {
 	//! VCluster
-	Vcluster & v_cl;
+	Vcluster<> & v_cl;
 
 	//! Maps the processor id with the communication request into map procedure
 	openfpm::vector<size_t> p_map_req;
diff --git a/src/Grid/tests/grid_dist_id_HDF5_chckpnt_restart_test.cpp b/src/Grid/tests/grid_dist_id_HDF5_chckpnt_restart_test.cpp
index 929b2b7f..6275c92f 100644
--- a/src/Grid/tests/grid_dist_id_HDF5_chckpnt_restart_test.cpp
+++ b/src/Grid/tests/grid_dist_id_HDF5_chckpnt_restart_test.cpp
@@ -23,7 +23,7 @@ BOOST_AUTO_TEST_CASE( grid_dist_id_hdf5_save_test )
 	// Domain
 	Box<2,float> domain({0.0,0.0},{1.0,1.0});
 
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	// Skip this test on big scale
 	if (v_cl.getProcessingUnits() >= 32)
@@ -91,7 +91,7 @@ BOOST_AUTO_TEST_CASE( grid_dist_id_hdf5_load_test )
 	// Domain
 	Box<2,float> domain({0.0,0.0},{1.0,1.0});
 
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	// Skip this test on big scale
 	if (v_cl.getProcessingUnits() >= 32)
diff --git a/src/Grid/tests/grid_dist_id_unit_test.cpp b/src/Grid/tests/grid_dist_id_unit_test.cpp
index 50a34576..6c077f7b 100644
--- a/src/Grid/tests/grid_dist_id_unit_test.cpp
+++ b/src/Grid/tests/grid_dist_id_unit_test.cpp
@@ -20,7 +20,7 @@ BOOST_AUTO_TEST_CASE( grid_dist_id_domain_grid_unit_converter3D_test)
 	// Domain
 	Box<3,float> domain({-0.3,-0.3,-0.3},{1.0,1.0,1.0});
 
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	// Skip this test on big scale
 	if (v_cl.getProcessingUnits() >= 32)
@@ -103,7 +103,7 @@ BOOST_AUTO_TEST_CASE( grid_dist_id_domain_grid_unit_converter_test)
 	// Domain
 	Box<2,float> domain({0.0,0.0},{1.0,1.0});
 
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	// Skip this test on big scale
 	if (v_cl.getProcessingUnits() >= 32)
@@ -212,7 +212,7 @@ void Test2D(const Box<2,float> & domain, long int k)
 		//! [Create and access a distributed grid]
 
 		// Get the virtual cluster machine
-		Vcluster & vcl = g_dist.getVC();
+		Vcluster<> & vcl = g_dist.getVC();
 
 		// reduce
 		vcl.sum(count);
@@ -275,7 +275,7 @@ void Test2D(const Box<2,float> & domain, long int k)
 
 void Test1D(const Box<1,float> & domain, long int k)
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 	long int big_step = k / 30;
 	big_step = (big_step == 0)?1:big_step;
 	long int small_step = 21;
@@ -332,7 +332,7 @@ void Test1D(const Box<1,float> & domain, long int k)
 		//! [Create and access a distributed grid]
 
 		// Get the virtual cluster machine
-		Vcluster & vcl = g_dist.getVC();
+		Vcluster<> & vcl = g_dist.getVC();
 
 		// reduce
 		vcl.sum(count);
@@ -452,7 +452,7 @@ void Test3D_sub(const Box<3,float> & domain, long int k)
 		}
 
 		// Get the virtual cluster machine
-		Vcluster & vcl = g_dist.getVC();
+		Vcluster<> & vcl = g_dist.getVC();
 
 		// reduce
 		vcl.sum(count);
@@ -557,7 +557,7 @@ void Test3D(const Box<3,float> & domain, long int k)
 		}
 
 		// Get the virtual cluster machine
-		Vcluster & vcl = g_dist.getVC();
+		Vcluster<> & vcl = g_dist.getVC();
 
 		// reduce
 		vcl.sum(count);
@@ -845,7 +845,7 @@ void Test2D_complex(const Box<2,float> & domain, long int k)
 		//! [Create and access a distributed grid complex]
 
 		// Get the virtual cluster machine
-		Vcluster & vcl = g_dist.getVC();
+		Vcluster<> & vcl = g_dist.getVC();
 
 		// reduce
 		vcl.sum(count);
@@ -1007,7 +1007,7 @@ void Test3D_complex(const Box<3,float> & domain, long int k)
 		}
 
 		// Get the virtual cluster machine
-		Vcluster & vcl = g_dist.getVC();
+		Vcluster<> & vcl = g_dist.getVC();
 
 		// reduce
 		vcl.sum(count);
@@ -1103,7 +1103,7 @@ void Test3D_dup(const Box<3,float> & domain, long int k)
 	long int small_step = 21;
 	long int k_old = k;
 
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if ( v_cl.getProcessingUnits() > 32 )
 		return;
@@ -1202,7 +1202,7 @@ void Test3D_dup(const Box<3,float> & domain, long int k)
 
 void Test3D_periodic(const Box<3,float> & domain, long int k)
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if ( v_cl.getProcessingUnits() > 32 )
 		return;
@@ -1274,7 +1274,7 @@ void Test3D_periodic(const Box<3,float> & domain, long int k)
 		}
 
 		// Get the virtual cluster machine
-		Vcluster & vcl = g_dist.getVC();
+		Vcluster<> & vcl = g_dist.getVC();
 
 		// reduce
 		vcl.sum(count);
@@ -1359,7 +1359,7 @@ void Test3D_periodic(const Box<3,float> & domain, long int k)
 
 void Test3D_periodic_put(const Box<3,float> & domain, long int k)
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if ( v_cl.getProcessingUnits() > 32 )
 		return;
@@ -1479,7 +1479,7 @@ void Test_grid_copy(const Box<3,float> & domain, long int k)
 {
 	typedef Point_test<float> p;
 
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if ( v_cl.getProcessingUnits() > 32 )
 		return;
@@ -1588,7 +1588,7 @@ void Test_grid_copy(const Box<3,float> & domain, long int k)
 void Test_ghost_correction(Box<3,double> & domain, long int k, long int g_)
 {
 	size_t sz[3] = {(size_t)k,(size_t)k,(size_t)k};
-	periodicity<3> bc = {PERIODIC,PERIODIC,PERIODIC};
+	periodicity<3> bc = {{PERIODIC,PERIODIC,PERIODIC}};
 
 	Ghost<3,long int> g(g_);
 
@@ -1872,7 +1872,7 @@ BOOST_AUTO_TEST_CASE ( grid_basic_functions )
 	{return;}
 
 	size_t sz[2] = {(size_t)8,(size_t)8};
-	periodicity<2> bc = {PERIODIC,PERIODIC};
+	periodicity<2> bc = {{PERIODIC,PERIODIC}};
 
 	Ghost<2,long int> g(1);
 	Box<2,double> domain({-1.0,-1.0},{1.0,1.0});
@@ -1893,7 +1893,7 @@ BOOST_AUTO_TEST_CASE ( grid_overflow_round_off_error )
 
     size_t sz[2] = {numGridPoint,numGridPoint};
 
-    periodicity<2> bc = {PERIODIC,PERIODIC};
+    periodicity<2> bc = {{PERIODIC,PERIODIC}};
 
     Ghost<2,double> g(3.0*(domain.getHigh(0) - domain.getLow(0))/numGridPoint + 0.001);
 
diff --git a/src/Grid/tests/grid_dist_id_unit_test_ext_dom.hpp b/src/Grid/tests/grid_dist_id_unit_test_ext_dom.hpp
index dcd7dbbd..add6c61d 100644
--- a/src/Grid/tests/grid_dist_id_unit_test_ext_dom.hpp
+++ b/src/Grid/tests/grid_dist_id_unit_test_ext_dom.hpp
@@ -17,7 +17,7 @@ void Test3D_extended_grid(const Box<3,float> & domain, long int k)
 	big_step = (big_step == 0)?1:big_step;
 	long int small_step = 21;
 
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if ( v_cl.getProcessingUnits() > 32 )
 		return;
diff --git a/src/Grid/tests/grid_dist_id_unit_test_unb_ghost.hpp b/src/Grid/tests/grid_dist_id_unit_test_unb_ghost.hpp
index 36551eda..f4b45aa0 100644
--- a/src/Grid/tests/grid_dist_id_unit_test_unb_ghost.hpp
+++ b/src/Grid/tests/grid_dist_id_unit_test_unb_ghost.hpp
@@ -59,7 +59,7 @@ void Test3D_unb_ghost(const Box<3,float> & domain, long int k)
 		}
 
 		// Get the virtual cluster machine
-		Vcluster & vcl = g_dist.getVC();
+		Vcluster<> & vcl = g_dist.getVC();
 
 		// reduce
 		vcl.sum(count);
@@ -118,7 +118,7 @@ void Test3D_unb_ghost(const Box<3,float> & domain, long int k)
 // Test grid periodic
 void Test3D_unb_ghost_periodic(const Box<3,float> & domain, long int k)
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if ( v_cl.getProcessingUnits() > 24 )
 		return;
@@ -187,7 +187,7 @@ void Test3D_unb_ghost_periodic(const Box<3,float> & domain, long int k)
 		}
 
 		// Get the virtual cluster machine
-		Vcluster & vcl = g_dist.getVC();
+		Vcluster<> & vcl = g_dist.getVC();
 
 		// reduce
 		vcl.sum(count);
diff --git a/src/Makefile.am b/src/Makefile.am
index 18cd8fee..70e6b590 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -3,14 +3,21 @@ LINKLIBS = $(HDF5_LDFLAGS)  $(HDF5_LIBS) $(OPENMP_LDFLAGS) $(LIBHILBERT_LIB)  $(
 FLAGS_NVCC = -Xcudafe "--display_error_number --diag_suppress=2885 --diag_suppress=2887  --diag_suppress=2888 --diag_suppress=186 --diag_suppress=111"  $(NVCCFLAGS) $(INCLUDES_PATH) $(HDF5_CPPFLAGS) $(BOOST_CPPFLAGS)  $(MPI_INC_PATH) $(PETSC_INCLUDE) $(LIBHILBERT_INCLUDE) $(PARMETIS_INCLUDE) $(METIS_INCLUDE)  -g --expt-extended-lambda
 
 noinst_PROGRAMS = pdata actual_test
-pdata_SOURCES = main.cpp Vector/cuda/vector_dist_cuda_func_test.cu Decomposition/cuda/decomposition_cuda_tests.cu Vector/vector_dist_gpu_unit_tests.cu  Grid/tests/grid_dist_id_HDF5_chckpnt_restart_test.cpp Grid/tests/grid_dist_id_unit_test.cpp Grid/tests/staggered_grid_dist_unit_test.cpp Vector/tests/vector_dist_cell_list_tests.cpp Vector/tests/vector_dist_complex_prp_unit_test.cpp Vector/tests/vector_dist_HDF5_chckpnt_restart_test.cpp Vector/tests/vector_dist_MP_unit_tests.cpp Vector/tests/vector_dist_NN_tests.cpp Vector/tests/vector_dist_unit_test.cpp  pdata_performance.cpp Decomposition/tests/CartDecomposition_unit_test.cpp Decomposition/tests/shift_vect_converter_tests.cpp Vector/performance/vector_dist_performance_util.cpp  lib/pdata.cpp test_multiple_o.cpp ../openfpm_devices/src/memory/CudaMemory.cu  ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/Memleak_check.cpp
+if BUILDCUDA
+pdata_SOURCES = initialize/initialize_wrapper_cuda.cu
+actual_test_SOURCES = initialize/initialize_wrapper_cuda.cu
+else
+pdata_SOURCES = initialize/initialize_wrapper_cpu.cpp
+actual_test_SOURCES = initialize/initialize_wrapper_cpu.cpp
+endif
+pdata_SOURCES += main.cpp Vector/cuda/vector_dist_cuda_func_test.cu Decomposition/cuda/decomposition_cuda_tests.cu Vector/cuda/vector_dist_gpu_unit_tests.cu Grid/tests/grid_dist_id_HDF5_chckpnt_restart_test.cpp Grid/tests/grid_dist_id_unit_test.cpp Grid/tests/staggered_grid_dist_unit_test.cpp Vector/tests/vector_dist_cell_list_tests.cpp Vector/tests/vector_dist_complex_prp_unit_test.cpp Vector/tests/vector_dist_HDF5_chckpnt_restart_test.cpp Vector/tests/vector_dist_MP_unit_tests.cpp Vector/tests/vector_dist_NN_tests.cpp Vector/tests/vector_dist_unit_test.cpp  pdata_performance.cpp Decomposition/tests/CartDecomposition_unit_test.cpp Decomposition/tests/shift_vect_converter_tests.cpp Vector/performance/vector_dist_performance_util.cpp  lib/pdata.cpp test_multiple_o.cpp ../openfpm_devices/src/memory/CudaMemory.cu  ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/Memleak_check.cpp
 pdata_CXXFLAGS = -Wno-unknown-pragmas $(BOOST_CPPFLAGS) $(HDF5_CPPFLAGS) $(OPENMP_CFLAGS) $(AM_CXXFLAGS) $(LIBHILBERT_INCLUDE) $(PETSC_INCLUDE) $(CUDA_CFLAGS) $(INCLUDES_PATH) $(PARMETIS_INCLUDE) $(METIS_INCLUDE) $(H5PART_INCLUDE) -DPARALLEL_IO  -Wno-unused-local-typedefs
 pdata_CFLAGS = $(CUDA_CFLAGS)
 pdata_LDADD = $(LINKLIBS) -lparmetis -lmetis
 
-actual_test_SOURCES = Vector/cuda/vector_dist_cuda_func_test.cu Vector/vector_dist_gpu_unit_tests.cu vector_ main_single.cpp  lib/pdata.cpp test_multiple_o.cpp ../openfpm_devices/src/memory/CudaMemory.cu  ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/Memleak_check.cpp
+actual_test_SOURCES += Vector/cuda/vector_dist_cuda_func_test.cu Vector/cuda/vector_dist_gpu_unit_tests.cu vector_ main_single.cpp  lib/pdata.cpp test_multiple_o.cpp ../openfpm_devices/src/memory/CudaMemory.cu  ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/Memleak_check.cpp
 actual_test_CXXFLAGS = -Wno-unknown-pragmas $(BOOST_CPPFLAGS) $(HDF5_CPPFLAGS) $(OPENMP_CFLAGS) $(AM_CXXFLAGS) $(LIBHILBERT_INCLUDE) $(PETSC_INCLUDE) $(CUDA_CFLAGS) $(INCLUDES_PATH) $(PARMETIS_INCLUDE) $(METIS_INCLUDE) $(H5PART_INCLUDE) -DPARALLEL_IO  -Wno-unused-local-typedefs
-actual_test_CFLAGS = $(CUDA_CFLAGS)
+actual_test_CFLAGS = $(CUDA_CFLAGS) -IDIOCANE
 actual_test_LDADD = $(LINKLIBS) -lparmetis -lmetis
 
 
diff --git a/src/Vector/cuda/vector_dist_cuda_func_test.cu b/src/Vector/cuda/vector_dist_cuda_func_test.cu
index 3692e45a..6b3454e7 100644
--- a/src/Vector/cuda/vector_dist_cuda_func_test.cu
+++ b/src/Vector/cuda/vector_dist_cuda_func_test.cu
@@ -9,13 +9,16 @@ BOOST_AUTO_TEST_SUITE( vector_dist_gpu_util_func_test )
 
 BOOST_AUTO_TEST_CASE( vector_dist_gpu_find_buffer_offsets_test )
 {
-	openfpm::vector_gpu<aggregate<int>> vgp;
+	openfpm::vector_gpu<aggregate<int,int>> vgp;
 	openfpm::vector_gpu<aggregate<int,int>> offs;
 
 	vgp.resize(200000);
 
 	for (size_t k = 0 ; k < vgp.size() ; k++)
-	{vgp.template get<0>(k) = k / 1000;}
+	{
+		vgp.template get<0>(k) = k / 1000;
+		vgp.template get<1>(k) = k / 1000;
+	}
 
 	offs.resize(220);
 
diff --git a/src/Vector/cuda/vector_dist_cuda_funcs.cuh b/src/Vector/cuda/vector_dist_cuda_funcs.cuh
index bc7ff039..7c3b3c6d 100644
--- a/src/Vector/cuda/vector_dist_cuda_funcs.cuh
+++ b/src/Vector/cuda/vector_dist_cuda_funcs.cuh
@@ -17,11 +17,11 @@ __global__  void find_buffer_offsets(vector_type vd, int * cnt, vector_type_offs
 
     if (p >= vd.size() - 1) return;
 
-    if (vd.template get<0>(p) != vd.template get<0>(p+1))
+    if (vd.template get<1>(p) != vd.template get<1>(p+1))
 	{
     	int i = atomicAdd(cnt, 1);
     	offs.template get<0>(i) = p+1;
-    	offs.template get<1>(i) = vd.template get<0>(p);
+    	offs.template get<1>(i) = vd.template get<1>(p);
 	}
 }
 
diff --git a/src/Vector/vector_dist_gpu_unit_tests.cu b/src/Vector/cuda/vector_dist_gpu_unit_tests.cu
similarity index 82%
rename from src/Vector/vector_dist_gpu_unit_tests.cu
rename to src/Vector/cuda/vector_dist_gpu_unit_tests.cu
index fd107599..26dbe011 100644
--- a/src/Vector/vector_dist_gpu_unit_tests.cu
+++ b/src/Vector/cuda/vector_dist_gpu_unit_tests.cu
@@ -3,7 +3,7 @@
 #include <boost/test/unit_test.hpp>
 #include "VCluster/VCluster.hpp"
 #include <Vector/vector_dist.hpp>
-
+#include "Vector/tests/vector_dist_util_unit_tests.hpp"
 
 BOOST_AUTO_TEST_SUITE( vector_dist_gpu_test )
 
@@ -403,13 +403,97 @@ BOOST_AUTO_TEST_CASE( vector_dist_map_on_gpu_test)
 		vd.getPos(p)[1] = (float)rand() / RAND_MAX;
 		vd.getPos(p)[2] = (float)rand() / RAND_MAX;
 
+		vd.template getProp<0>(p) = vd.getPos(p)[0] + vd.getPos(p)[1] + vd.getPos(p)[2];
+
+		vd.template getProp<1>(p)[0] = vd.getPos(p)[0];
+		vd.template getProp<1>(p)[1] = vd.getPos(p)[1];
+		vd.template getProp<1>(p)[2] = vd.getPos(p)[2];
+
+		vd.template getProp<2>(p)[0] = vd.getPos(p)[0] + vd.getPos(p)[1];
+		vd.template getProp<2>(p)[1] = vd.getPos(p)[0] + vd.getPos(p)[2];
+		vd.template getProp<2>(p)[2] = vd.getPos(p)[1] + vd.getPos(p)[2];
+
 		++it;
 	}
 
+	// move on device
+	vd.hostToDevicePos();
+	vd.hostToDeviceProp<0,1,2>();
+
 	// Ok we redistribute the particles (GPU based)
 	vd.map(MAP_ON_DEVICE);
 
+	// Reset the host part: overwrite positions and ALL properties (0, 1 and 2) with
+	// sentinel values, so the check below can only pass if the device copy is read back
+	auto it3 = vd.getDomainIterator();
+
+	while (it3.isNext())
+	{
+		auto p = it3.get();
+
+		vd.getPos(p)[0] = 1.0;
+		vd.getPos(p)[1] = 1.0;
+		vd.getPos(p)[2] = 1.0;
+
+		vd.template getProp<0>(p) = 0.0;
+
+		vd.template getProp<1>(p)[0] = 0.0;
+		vd.template getProp<1>(p)[1] = 0.0;
+		vd.template getProp<1>(p)[2] = 0.0;
+
+		vd.template getProp<2>(p)[0] = 0.0;
+		vd.template getProp<2>(p)[1] = 0.0;
+		vd.template getProp<2>(p)[2] = 0.0;
+
+		++it3;
+	}
+
+	// we move from Device to CPU
+
+	vd.deviceToHostPos();
+	vd.deviceToHostProp<0,1,2>();
+
+	// Check
+
+	auto it2 = vd.getDomainIterator();
+
+	bool match = true;
+	while (it2.isNext())
+	{
+		auto p = it2.get();
+
+		match &= vd.template getProp<0>(p) == vd.getPos(p)[0] + vd.getPos(p)[1] + vd.getPos(p)[2];
+
+		match &= vd.template getProp<1>(p)[0] == vd.getPos(p)[0];
+		match &= vd.template getProp<1>(p)[1] == vd.getPos(p)[1];
+		match &= vd.template getProp<1>(p)[2] == vd.getPos(p)[2];
+
+		match &= vd.template getProp<2>(p)[0] == vd.getPos(p)[0] + vd.getPos(p)[1];
+		match &= vd.template getProp<2>(p)[1] == vd.getPos(p)[0] + vd.getPos(p)[2];
+		match &= vd.template getProp<2>(p)[2] == vd.getPos(p)[1] + vd.getPos(p)[2];
+
+		++it2;
+	}
+
+	BOOST_REQUIRE_EQUAL(match,true);
+
+	// count local, non-local and out-of-domain particles after the map
+
+	size_t l_cnt = 0;
+	size_t nl_cnt = 0;
+	size_t n_out = 0;
+
+	// Domain + ghost box
+	Box<3,float> dom_ext = domain;
+	dom_ext.enlarge(g);
+
+	auto it5 = vd.getDomainIterator();
+	count_local_n_local<3>(vd,it5,bc,domain,dom_ext,l_cnt,nl_cnt,n_out);
+
+	BOOST_REQUIRE_EQUAL(n_out,0ul);
+	BOOST_REQUIRE_EQUAL(l_cnt,vd.size_local());
 
+	vd.write("gpu_write_test");
 }
 
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/Vector/performance/vector_dist_performance_common.hpp b/src/Vector/performance/vector_dist_performance_common.hpp
index e6e0d261..1b7240c0 100644
--- a/src/Vector/performance/vector_dist_performance_common.hpp
+++ b/src/Vector/performance/vector_dist_performance_common.hpp
@@ -136,7 +136,7 @@ template<unsigned int dim, unsigned int prp, typename T, typename V> void cross_
  * \param v_cl Global vcluster
  * \param k_int Number of particles
  */
-template<unsigned int dim, typename v_dist> void vd_initialize(v_dist & vd, Vcluster & v_cl, size_t k_int)
+template<unsigned int dim, typename v_dist> void vd_initialize(v_dist & vd, Vcluster<> & v_cl, size_t k_int)
 {
 	// The random generator engine
 	std::default_random_engine eg(v_cl.getProcessUnitID()*4313);
@@ -166,7 +166,7 @@ template<unsigned int dim, typename v_dist> void vd_initialize(v_dist & vd, Vclu
  * \param v_cl Global vcluster
  * \param k_int Number of particles
  */
-template<unsigned int dim, typename v_dist> void vd_initialize_double(v_dist & vd,v_dist & vd2, Vcluster & v_cl, size_t k_int)
+template<unsigned int dim, typename v_dist> void vd_initialize_double(v_dist & vd,v_dist & vd2, Vcluster<> & v_cl, size_t k_int)
 {
 	// The random generator engine
 	std::default_random_engine eg(v_cl.getProcessUnitID()*4313);
diff --git a/src/Vector/tests/vector_dist_HDF5_chckpnt_restart_test.cpp b/src/Vector/tests/vector_dist_HDF5_chckpnt_restart_test.cpp
index a843c42e..0edef0f5 100644
--- a/src/Vector/tests/vector_dist_HDF5_chckpnt_restart_test.cpp
+++ b/src/Vector/tests/vector_dist_HDF5_chckpnt_restart_test.cpp
@@ -120,7 +120,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_hdf5_load_test )
 {
 #ifndef SE_CLASS3
 
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	Box<dim,float> box;
 
diff --git a/src/Vector/tests/vector_dist_NN_tests.cpp b/src/Vector/tests/vector_dist_NN_tests.cpp
index d233589a..f947c164 100644
--- a/src/Vector/tests/vector_dist_NN_tests.cpp
+++ b/src/Vector/tests/vector_dist_NN_tests.cpp
@@ -16,7 +16,7 @@ extern void print_test_v(std::string test, size_t sz);
 template<typename VerletList>
 void test_full_nn(long int k)
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() > 12)
 		return;
@@ -240,7 +240,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_full_NN )
 
 BOOST_AUTO_TEST_CASE( vector_dist_particle_iteration )
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() > 12)
 		return;
@@ -321,7 +321,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_particle_iteration )
 
 BOOST_AUTO_TEST_CASE( vector_dist_particle_NN_update_with_limit )
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() > 12)
 		return;
@@ -392,7 +392,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_particle_NN_update_with_limit )
 
 BOOST_AUTO_TEST_CASE( vector_dist_particle_getCellListSym_with_div )
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() > 12)
 		return;
diff --git a/src/Vector/tests/vector_dist_cell_list_tests.cpp b/src/Vector/tests/vector_dist_cell_list_tests.cpp
index 454d96a7..7ccdd523 100644
--- a/src/Vector/tests/vector_dist_cell_list_tests.cpp
+++ b/src/Vector/tests/vector_dist_cell_list_tests.cpp
@@ -19,7 +19,7 @@ extern long int decrement(long int k, long int step);
 
 void test_reorder_sfc(reorder_opt opt)
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() > 48)
 		return;
@@ -101,7 +101,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_reorder_2d_test )
 
 BOOST_AUTO_TEST_CASE( vector_dist_cl_random_vs_hilb_forces_test )
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() > 48)
 		return;
@@ -225,7 +225,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_cl_random_vs_hilb_forces_test )
 
 BOOST_AUTO_TEST_CASE( vector_dist_cl_random_vs_reorder_forces_test )
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() > 48)
 		return;
@@ -352,7 +352,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_cl_random_vs_reorder_forces_test )
 
 BOOST_AUTO_TEST_CASE( vector_dist_symmetric_cell_list )
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() > 24)
 		return;
@@ -562,7 +562,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_cell_list )
 
 BOOST_AUTO_TEST_CASE( vector_dist_symmetric_crs_cell_list )
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() > 24)
 		return;
@@ -789,7 +789,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_crs_cell_list )
 template<typename VerletList>
 void test_vd_symmetric_verlet_list()
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() > 24)
 		return;
@@ -998,7 +998,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_verlet_list )
 template<typename VerletList>
 void vector_sym_verlet_list_nb()
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() > 24)
 		return;
@@ -1410,7 +1410,7 @@ template<typename VerletList, typename part_prop> void test_crs_full(vector_dist
 template<typename VerletList>
 void test_csr_verlet_list()
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() > 24)
 		return;
@@ -1470,7 +1470,7 @@ void test_csr_verlet_list()
 template<typename VerletList>
 void test_csr_verlet_list_override()
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() > 24)
 		return;
@@ -1558,7 +1558,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_crs_verlet_list_dec_override )
 template <typename VerletList>
 void test_vd_symmetric_crs_verlet()
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() > 24)
 		return;
@@ -1657,7 +1657,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_crs_verlet_list_partit )
 
 BOOST_AUTO_TEST_CASE( vector_dist_checking_unloaded_processors )
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() > 24)
 		return;
@@ -1746,7 +1746,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_checking_unloaded_processors )
 
 BOOST_AUTO_TEST_CASE( vector_dist_cell_list_multi_type )
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() > 24)
 		return;
@@ -1858,7 +1858,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_cell_list_multi_type )
 
 BOOST_AUTO_TEST_CASE( vector_dist_particle_NN_MP_iteration )
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() > 24)
 	{return;}
diff --git a/src/Vector/tests/vector_dist_complex_prp_unit_test.cpp b/src/Vector/tests/vector_dist_complex_prp_unit_test.cpp
index 8d2956ba..56360121 100644
--- a/src/Vector/tests/vector_dist_complex_prp_unit_test.cpp
+++ b/src/Vector/tests/vector_dist_complex_prp_unit_test.cpp
@@ -16,7 +16,7 @@ extern long int decrement(long int k, long int step);
 
 BOOST_AUTO_TEST_CASE( vector_dist_periodic_complex_prp_test_use_3d )
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() > 48)
 		return;
diff --git a/src/Vector/tests/vector_dist_unit_test.cpp b/src/Vector/tests/vector_dist_unit_test.cpp
index 5a4e2fc3..c6508999 100644
--- a/src/Vector/tests/vector_dist_unit_test.cpp
+++ b/src/Vector/tests/vector_dist_unit_test.cpp
@@ -60,7 +60,7 @@ long int decrement(long int k, long int step)
 template<unsigned int dim, template <typename> class layout>
 size_t total_n_part_lc(vector_dist<dim,float, Point_test<float>, CartDecomposition<dim,float>, HeapMemory, layout > & vd, size_t (& bc)[dim])
 {
-	Vcluster & v_cl = vd.getVC();
+	Vcluster<> & v_cl = vd.getVC();
 	auto it2 = vd.getDomainIterator();
 	const CartDecomposition<3,float> & ct = vd.getDecomposition();
 
@@ -100,7 +100,7 @@ template<typename vector>
 void Test2D_ghost(Box<2,float> & box)
 {
 	// Communication object
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	typedef Point_test<float> p;
 
@@ -291,7 +291,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_ghost_inte )
 
 BOOST_AUTO_TEST_CASE( vector_dist_iterator_test_use_2d )
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
     // set the seed
 	// create the random generator engine
@@ -366,7 +366,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_iterator_test_use_2d )
 
 BOOST_AUTO_TEST_CASE( vector_dist_iterator_test_use_3d )
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
     // set the seed
 	// create the random generator engine
@@ -443,7 +443,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_iterator_test_use_3d )
 
 BOOST_AUTO_TEST_CASE( vector_dist_iterator_fixed_dec_3d )
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
     // set the seed
 	// create the random generator engine
@@ -520,7 +520,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_iterator_fixed_dec_3d )
 
 BOOST_AUTO_TEST_CASE( vector_dist_periodic_test_use_2d )
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
     // set the seed
 	// create the random generator engine
@@ -628,7 +628,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_periodic_test_use_2d )
 
 BOOST_AUTO_TEST_CASE( vector_dist_periodic_test_use_3d )
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
     // set the seed
 	// create the random generator engine
@@ -734,7 +734,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_periodic_test_use_3d )
 
 void test_random_walk(size_t opt)
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
     // set the seed
 	// create the random generator engine
@@ -925,7 +925,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_not_periodic_map )
 
 BOOST_AUTO_TEST_CASE( vector_dist_out_of_bound_policy )
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() > 8)
 		return;
@@ -985,7 +985,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_out_of_bound_policy )
 
 void Test_interacting(Box<3,float> & box)
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() > 8)
 		return;
@@ -1150,7 +1150,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_grid_iterator )
 	// 3D test
 	for ( ; k > 8*big_step ; k-= (k > 2*big_step)?big_step:small_step )
 	{
-		Vcluster & v_cl = create_vcluster();
+		Vcluster<> & v_cl = create_vcluster();
 
 		const size_t Ng = k;
 
@@ -1219,7 +1219,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_cell_verlet_test )
 	// 3D test
 	for ( ; k > 8*big_step ; k-= (k > 2*big_step)?big_step:small_step )
 	{
-		Vcluster & v_cl = create_vcluster();
+		Vcluster<> & v_cl = create_vcluster();
 
 		const size_t Ng = k;
 
@@ -1328,7 +1328,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_cell_verlet_test )
 
 BOOST_AUTO_TEST_CASE( vector_dist_periodic_map_list )
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() > 3)
 		return;
@@ -1452,7 +1452,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_periodic_map_list )
 
 BOOST_AUTO_TEST_CASE( vector_dist_ghost_with_ghost_buffering )
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() > 3)
 		return;
@@ -1634,7 +1634,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_ghost_with_ghost_buffering )
 
 BOOST_AUTO_TEST_CASE( vector_dist_ghost_put )
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	long int k = 25*25*25*create_vcluster().getProcessingUnits();
 	k = std::pow(k, 1/3.);
@@ -1824,7 +1824,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_ghost_put )
 
 BOOST_AUTO_TEST_CASE( vector_fixing_noposition_and_keep_prop )
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() > 48)
 		return;
@@ -1888,7 +1888,7 @@ BOOST_AUTO_TEST_CASE( vector_fixing_noposition_and_keep_prop )
 
 BOOST_AUTO_TEST_CASE( vector_of_vector_dist )
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() > 48)
 		return;
diff --git a/src/Vector/tests/vector_dist_util_unit_tests.hpp b/src/Vector/tests/vector_dist_util_unit_tests.hpp
index f830e13f..3995a7cf 100644
--- a/src/Vector/tests/vector_dist_util_unit_tests.hpp
+++ b/src/Vector/tests/vector_dist_util_unit_tests.hpp
@@ -23,7 +23,7 @@
  */
 template<unsigned int dim,typename vector_dist> inline void count_local_n_local(vector_dist & vd, vector_dist_iterator & it, size_t (& bc)[dim] , Box<dim,float> & box, Box<dim,float> & dom_ext, size_t & l_cnt, size_t & nl_cnt, size_t & n_out)
 {
-	const CartDecomposition<dim,float> & ct = vd.getDecomposition();
+	auto & ct = vd.getDecomposition();
 
 	while (it.isNext())
 	{
@@ -31,8 +31,10 @@ template<unsigned int dim,typename vector_dist> inline void count_local_n_local(
 		// Check if it is in the domain
 		if (box.isInsideNP(vd.getPos(key)) == true)
 		{
+			Point<dim,typename vector_dist::stype> xp = vd.getPos(key);
+
 			// Check if local
-			if (ct.isLocalBC(vd.getPos(key),bc) == true)
+			if (ct.isLocalBC(xp,bc) == true)
 				l_cnt++;
 			else
 				nl_cnt++;
diff --git a/src/Vector/vector_dist.hpp b/src/Vector/vector_dist.hpp
index 8112a98e..05f15a77 100644
--- a/src/Vector/vector_dist.hpp
+++ b/src/Vector/vector_dist.hpp
@@ -234,7 +234,7 @@ private:
 #endif
 
 	//! Virtual cluster
-	Vcluster & v_cl;
+	Vcluster<Memory> & v_cl;
 
 	//! option used to create this vector
 	size_t opt = 0;
@@ -453,7 +453,7 @@ public:
 	 *
 	 */
 	vector_dist(const Decomposition & dec, size_t np) :
-	vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base>(dec), v_cl(create_vcluster()) SE_CLASS3_VDIST_CONSTRUCTOR
+	vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base>(dec), v_cl(create_vcluster<Memory>()) SE_CLASS3_VDIST_CONSTRUCTOR
 	{
 #ifdef SE_CLASS2
 		check_new(this,8,VECTOR_DIST_EVENT,4);
@@ -480,7 +480,7 @@ public:
 	 *
 	 */
 	vector_dist(size_t np, Box<dim, St> box, const size_t (&bc)[dim], const Ghost<dim, St> & g, size_t opt = 0, const grid_sm<dim,void> & gdist = grid_sm<dim,void>())
-	:v_cl(create_vcluster()),opt(opt) SE_CLASS3_VDIST_CONSTRUCTOR
+	:v_cl(create_vcluster<Memory>()),opt(opt) SE_CLASS3_VDIST_CONSTRUCTOR
 	{
 #ifdef SE_CLASS2
 		check_new(this,8,VECTOR_DIST_EVENT,4);
@@ -2265,7 +2265,7 @@ public:
 	 *
 	 */
 
-	Vcluster & getVC()
+	Vcluster<> & getVC()
 	{
 #ifdef SE_CLASS2
 		check_valid(this,8);
@@ -2496,7 +2496,7 @@ public:
 		 */
 		template<unsigned int ... prp> void hostToDeviceProp()
 		{
-			v_prp.template deviceToHost<prp ...>();
+			v_prp.template hostToDevice<prp ...>();
 		}
 
 		/*! \brief Move the memory from the device to host memory
@@ -2523,6 +2523,6 @@ public:
 };
 
 
-template<unsigned int dim, typename St, typename prop, typename Decomposition = CartDecomposition<dim,St>> using vector_dist_gpu = vector_dist<dim,St,prop,Decomposition,CudaMemory,memory_traits_inte>;
+template<unsigned int dim, typename St, typename prop, typename Decomposition = CartDecomposition<dim,St,CudaMemory,memory_traits_inte>> using vector_dist_gpu = vector_dist<dim,St,prop,Decomposition,CudaMemory,memory_traits_inte>;
 
 #endif /* VECTOR_HPP_ */
diff --git a/src/Vector/vector_dist_comm.hpp b/src/Vector/vector_dist_comm.hpp
index c468c915..4419fa2c 100644
--- a/src/Vector/vector_dist_comm.hpp
+++ b/src/Vector/vector_dist_comm.hpp
@@ -67,7 +67,7 @@ class vector_dist_comm
 	typedef openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> send_pos_vector;
 
 	//! VCluster
-	Vcluster & v_cl;
+	Vcluster<Memory> & v_cl;
 
 	//! Domain decomposition
 	Decomposition dec;
@@ -90,6 +90,12 @@ class vector_dist_comm
 	//! particles that must be communicated to the other processors
 	openfpm::vector<openfpm::vector<aggregate<size_t,size_t>>> g_opart;
 
+	//! Helper buffer for computation (on GPU) of local particles (position)
+	openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> v_pos_tmp;
+
+	//! Helper buffer for computation (on GPU) of local particles (properties)
+	openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> v_prp_tmp;
+
 	//! Per processor number of particle g_opart_sz.get(i) = g_opart.get(i).size()
 	openfpm::vector<size_t> g_opart_sz;
 
@@ -160,10 +166,15 @@ class vector_dist_comm
 	{
 		if (opt & MAP_ON_DEVICE)
 		{
-			for (size_t i = 0; i < prc_sz.size()-1 ; i++)
+			size_t prev_off = 0;
+			for (size_t i = 0; i < prc_sz.size() ; i++)
 			{
-				prc_r.add(prc_sz.template get<1>(i));
-				prc_sz_r.add(prc_sz.template get<0>(i+1) - prc_sz.template get<0>(i));
+				if (prc_sz.template get<1>(i) != (unsigned int)-1)
+				{
+					prc_r.add(prc_sz.template get<1>(i));
+					prc_sz_r.add(prc_sz.template get<0>(i) - prev_off);
+				}
+				prev_off = prc_sz.template get<0>(i);
 			}
 		}
 		else
@@ -554,8 +565,6 @@ class vector_dist_comm
 		template<typename T>
 		inline void operator()(T& t)
 		{
-			typedef typename boost::mpl::at<v_mpl,T>::type prp_ms;
-
 			g_send_prp.get(i).template setMemory<T::value>(hsmem.get(j));
 
 			j++;
@@ -672,7 +681,7 @@ class vector_dist_comm
 			               openfpm::vector<size_t> & prc_sz_r,
 			               openfpm::vector<openfpm::vector<Point<dim,St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base,openfpm::grow_policy_identity>> & m_pos,
 			               openfpm::vector<openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base,openfpm::grow_policy_identity>> & m_prp,
-			               size_t offset,
+			               openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,typename layout_base<aggregate<unsigned int, unsigned int>>::type,layout_base> & prc_sz,
 			               size_t opt)
 	{
 		m_prp.resize(prc_sz_r.size());
@@ -691,6 +700,23 @@ class vector_dist_comm
 		{
 #if defined(CUDA_GPU) && defined(__NVCC__)
 
+			// The first part of m_opart and prc_sz contain the local particles
+
+			v_pos_tmp.resize(prc_sz.template get<0>(0));
+			v_prp_tmp.resize(prc_sz.template get<0>(0));
+
+			auto ite = v_pos_tmp.getGPUIterator();
+
+			// fill v_pos_tmp and v_prp_tmp with the local particles
+			process_map_particles<decltype(m_opart.toKernel()),decltype(v_pos_tmp.toKernel()),decltype(v_prp_tmp.toKernel()),
+					                                           decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>
+			<<<ite.wthr,ite.thr>>>
+			(m_opart.toKernel(),v_pos_tmp.toKernel(), v_prp_tmp.toKernel(),
+					            v_pos.toKernel(),v_prp.toKernel(),0);
+
+			size_t offset = prc_sz.template get<0>(0);
+
+			// Fill the sending buffers
 			for (size_t i = 0 ; i < m_pos.size() ; i++)
 			{
 				auto ite = m_pos.get(i).getGPUIterator();
@@ -704,6 +730,10 @@ class vector_dist_comm
 				offset += prc_sz_r.size();
 			}
 
+			// swap the old local particles with the actual local particles
+			v_pos_tmp.swap(v_pos);
+			v_prp_tmp.swap(v_prp);
+
 #else
 
 			std::cout << __FILE__ << ":" << __LINE__ << " error MAP_ON_DEVICE require that you compile with NVCC, but it seem compiled with a normal compiler" << std::endl;
@@ -723,10 +753,10 @@ class vector_dist_comm
 			{
 				process_map_particle<proc_without_prp>(i,end,id_end,m_opart,p_map_req,m_pos,m_prp,v_pos,v_prp,cnt);
 			}
-		}
 
-		v_pos.resize(v_pos.size() - m_opart.size());
-		v_prp.resize(v_prp.size() - m_opart.size());
+			v_pos.resize(v_pos.size() - m_opart.size());
+			v_prp.resize(v_prp.size() - m_opart.size());
+		}
 	}
 
 
@@ -813,6 +843,7 @@ class vector_dist_comm
 
 			CudaMemory mem;
 			mem.allocate(sizeof(int));
+			mem.fill(0);
 
 			// Find the buffer bases
 			find_buffer_offsets<decltype(lbl_p.toKernel()),decltype(prc_sz.toKernel())><<<ite.wthr,ite.thr>>>
@@ -820,8 +851,14 @@ class vector_dist_comm
 
 			// Trasfer the number of offsets on CPU
 			mem.deviceToHost();
+			prc_sz.template deviceToHost<0,1>();
+			// also get the last element of lbl_p
+			lbl_p.template deviceToHost<1>(lbl_p.size()-1,lbl_p.size()-1);
 
 			int noff = *(int *)mem.getPointer();
+			prc_sz.resize(noff+1);
+			prc_sz.template get<0>(prc_sz.size()-1) = lbl_p.size();
+			prc_sz.template get<1>(prc_sz.size()-1) = lbl_p.template get<1>(lbl_p.size()-1);
 
 #else
 
@@ -978,7 +1015,7 @@ public:
 	 *
 	 */
 	vector_dist_comm(const vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base> & v)
-	:v_cl(create_vcluster()),dec(create_vcluster()),lg_m(0)
+	:v_cl(create_vcluster<Memory>()),dec(create_vcluster()),lg_m(0)
 	{
 		this->operator=(v);
 	}
@@ -990,7 +1027,7 @@ public:
 	 *
 	 */
 	vector_dist_comm(const Decomposition & dec)
-	:v_cl(create_vcluster()),dec(dec),lg_m(0)
+	:v_cl(create_vcluster<Memory>()),dec(dec),lg_m(0)
 	{
 
 	}
@@ -1001,7 +1038,7 @@ public:
 	 *
 	 */
 	vector_dist_comm(Decomposition && dec)
-	:v_cl(create_vcluster()),dec(dec),lg_m(0)
+	:v_cl(create_vcluster<Memory>()),dec(dec),lg_m(0)
 	{
 
 	}
@@ -1010,7 +1047,7 @@ public:
 	 *
 	 */
 	vector_dist_comm()
-	:v_cl(create_vcluster()),dec(create_vcluster()),lg_m(0)
+	:v_cl(create_vcluster<Memory>()),dec(create_vcluster()),lg_m(0)
 	{
 	}
 
@@ -1138,10 +1175,10 @@ public:
                 {
                 	size_t opt_ = compute_options(opt);
                 	op_ssend_gg_recv_merge opm(g_m);
-                    v_cl.SSendRecvP_op<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
+                    v_cl.template SSendRecvP_op<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
                 }
                 else
-                {v_cl.SSendRecvP<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte);}
+                {v_cl.template SSendRecvP<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte);}
 
                 // fill g_opart_sz
                 g_opart_sz.resize(prc_g_opart.size());
@@ -1161,13 +1198,13 @@ public:
 			if (opt & SKIP_LABELLING)
 			{
             	size_t opt_ = compute_options(opt);
-				v_cl.SSendRecv<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
+				v_cl.template SSendRecv<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get,opt_);
 			}
 			else
 			{
 				prc_recv_get.clear();
 				recv_sz_get.clear();
-				v_cl.SSendRecv<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get);
+				v_cl.template SSendRecv<send_pos_vector,decltype(v_pos),layout_base>(g_pos_send,v_pos,prc_g_opart,prc_recv_get,recv_sz_get);
 			}
 
             // fill g_opart_sz
@@ -1257,7 +1294,7 @@ public:
 		fill_send_map_buf_list<prp_object,prp...>(v_pos,v_prp,prc_sz_r, m_pos, m_prp);
 
 		v_cl.SSendRecv(m_pos,v_pos,prc_r,prc_recv_map,recv_sz_map,opt);
-		v_cl.SSendRecvP<openfpm::vector<prp_object>,decltype(v_prp),layout_base,prp...>(m_prp,v_prp,prc_r,prc_recv_map,recv_sz_map,opt);
+		v_cl.template SSendRecvP<openfpm::vector<prp_object>,decltype(v_prp),layout_base,prp...>(m_prp,v_prp,prc_r,prc_recv_map,recv_sz_map,opt);
 
 		// mark the ghost part
 
@@ -1304,17 +1341,29 @@ public:
 		//! properties vector
 		openfpm::vector<openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base,openfpm::grow_policy_identity>> m_prp;
 
-		fill_send_map_buf(v_pos,v_prp, prc_sz_r, m_pos, m_prp,prc_sz_r.get(0),opt);
+		fill_send_map_buf(v_pos,v_prp, prc_sz_r, m_pos, m_prp,prc_sz,opt);
+
+		size_t opt_ = 0;
+		if (opt & MAP_ON_DEVICE)
+		{
+#if defined(CUDA_GPU) && defined(__NVCC__)
+			// Before doing the communication on MAP_ON_DEVICE we have to be sure that the previous kernels complete
+			cudaDeviceSynchronize();
+			opt_ |= MPI_GPU_DIRECT;
+#else
+			std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option MAP_ON_DEVICE you must compile with NVCC" << std::endl;
+#endif
+		}
 
-		v_cl.SSendRecv<openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base,openfpm::grow_policy_identity>,
+		v_cl.template SSendRecv<openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base,openfpm::grow_policy_identity>,
 					   openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base>,
 					   layout_base>
-					   (m_pos,v_pos,prc_r,prc_recv_map,recv_sz_map,opt);
+					   (m_pos,v_pos,prc_r,prc_recv_map,recv_sz_map,opt_);
 
-		v_cl.SSendRecv<openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base,openfpm::grow_policy_identity>,
+		v_cl.template SSendRecv<openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base,openfpm::grow_policy_identity>,
 					   openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base>,
 					   layout_base>
-					   (m_prp,v_prp,prc_r,prc_recv_map,recv_sz_map,opt);
+					   (m_prp,v_prp,prc_r,prc_recv_map,recv_sz_map,opt_);
 
 		// mark the ghost part
 
@@ -1401,12 +1450,12 @@ public:
 			size_t opt_ = compute_options(opt);
 
 			op_ssend_recv_merge<op> opm(g_opart);
-			v_cl.SSendRecvP_op<op_ssend_recv_merge<op>,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_recv_get,opm,prc_g_opart,g_opart_sz,opt_);
+			v_cl.template SSendRecvP_op<op_ssend_recv_merge<op>,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_recv_get,opm,prc_g_opart,g_opart_sz,opt_);
 		}
 		else
 		{
 			op_ssend_recv_merge<op> opm(g_opart);
-			v_cl.SSendRecvP_op<op_ssend_recv_merge<op>,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_recv_get,opm,prc_recv_put,recv_sz_put);
+			v_cl.template SSendRecvP_op<op_ssend_recv_merge<op>,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_recv_get,opm,prc_recv_put,recv_sz_put);
 		}
 
 		// process also the local replicated particles
diff --git a/src/Vector/vector_dist_dlb_test.hpp b/src/Vector/vector_dist_dlb_test.hpp
index f753ca3a..428e30ce 100644
--- a/src/Vector/vector_dist_dlb_test.hpp
+++ b/src/Vector/vector_dist_dlb_test.hpp
@@ -16,7 +16,7 @@ BOOST_AUTO_TEST_SUITE( vector_dist_dlb_test )
 template<typename vector_type>
 void mp_test_template(vector_type & vd0, vector_type & vd1, vector_type & vd2, vector_type & vd3)
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	// Only processor 0 initialy add particles on a corner of a domain
 
@@ -213,7 +213,7 @@ void mp_test_template(vector_type & vd0, vector_type & vd1, vector_type & vd2, v
 
 template<typename vector_type> void test_dlb_vector()
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() > 8)
 		return;
@@ -314,7 +314,7 @@ template<typename vector_type> void test_dlb_vector()
 
 template<typename vector_type> void test_dlb_multi_phase_vector()
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() > 8)
 		return;
@@ -335,7 +335,7 @@ template<typename vector_type> void test_dlb_multi_phase_vector()
 
 template<typename vector_type> void test_dlb_multi_phase_v_vector()
 {
-	Vcluster & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
 	if (v_cl.getProcessingUnits() > 8)
 		return;
diff --git a/src/initialize/initialize_wrapper.hpp b/src/initialize/initialize_wrapper.hpp
new file mode 100644
index 00000000..79d8577c
--- /dev/null
+++ b/src/initialize/initialize_wrapper.hpp
@@ -0,0 +1,20 @@
+/*
+ * initialize_wrapper.hpp
+ *
+ *  Created on: Aug 21, 2018
+ *      Author: i-bird
+ */
+
+#ifndef INITIALIZE_VCL_HPP_
+#define INITIALIZE_VCL_HPP_
+
+/*! \brief If OpenFPM has to work on GPU, openfpm_init must be called from a file compiled with NVCC
+ *
+ * There are two implementations: initialize_wrapper_cpu.cpp and initialize_wrapper_cuda.cu. At the configuration
+ * stage the second implementation is chosen if the tests have to run on GPU
+ *
+ */
+void openfpm_init_wrapper(int * argc, char *** argv);
+void openfpm_finalize_wrapper();
+
+#endif /* INITIALIZE_VCL_HPP_ */
diff --git a/src/initialize/initialize_wrapper_cpu.cpp b/src/initialize/initialize_wrapper_cpu.cpp
new file mode 100644
index 00000000..4d75490c
--- /dev/null
+++ b/src/initialize/initialize_wrapper_cpu.cpp
@@ -0,0 +1,13 @@
+#include "initialize_wrapper.hpp"
+#include "VCluster/VCluster.hpp"
+
+//! Forward argc and argv unchanged to openfpm_init
+void openfpm_init_wrapper(int * argc, char *** argv)
+{
+	openfpm_init(argc,argv);
+}
+//! Forward to openfpm_finalize
+void openfpm_finalize_wrapper()
+{
+	openfpm_finalize();
+}
diff --git a/src/initialize/initialize_wrapper_cuda.cu b/src/initialize/initialize_wrapper_cuda.cu
new file mode 100644
index 00000000..74a8473d
--- /dev/null
+++ b/src/initialize/initialize_wrapper_cuda.cu
@@ -0,0 +1,12 @@
+#include "initialize_wrapper.hpp"
+#include "VCluster/VCluster.hpp"
+//! Forward argc and argv unchanged to openfpm_init (this copy lives in the .cu source)
+void openfpm_init_wrapper(int * argc, char *** argv)
+{
+	openfpm_init(argc,argv);
+}
+//! Forward to openfpm_finalize (this copy lives in the .cu source)
+void openfpm_finalize_wrapper()
+{
+	openfpm_finalize();
+}
diff --git a/src/unit_test_init_cleanup.hpp b/src/unit_test_init_cleanup.hpp
index 96e7743f..1c953dad 100644
--- a/src/unit_test_init_cleanup.hpp
+++ b/src/unit_test_init_cleanup.hpp
@@ -8,7 +8,7 @@
 #ifndef UNIT_TEST_INIT_CLEANUP_HPP_
 #define UNIT_TEST_INIT_CLEANUP_HPP_
 
-#include "VCluster/VCluster.hpp"
+#include "initialize/initialize_wrapper.hpp"
 
 const char * test_dir;
 
@@ -19,7 +19,7 @@ struct ut_start
     {
     	BOOST_TEST_MESSAGE("Initialize global VCluster");
 
-    	openfpm_init(&boost::unit_test::framework::master_test_suite().argc,&boost::unit_test::framework::master_test_suite().argv);
+    	openfpm_init_wrapper(&boost::unit_test::framework::master_test_suite().argc,&boost::unit_test::framework::master_test_suite().argv);
 
 #ifdef PERFORMANCE_TEST
     	test_dir = getenv("OPENFPM_PERFORMANCE_TEST_DIR");
@@ -35,7 +35,7 @@ struct ut_start
     ~ut_start()
     {
     	BOOST_TEST_MESSAGE("Delete global VClster");
-    	openfpm_finalize();
+    	openfpm_finalize_wrapper();
     }
 };
 
-- 
GitLab