diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0f25e2668ba27e464683408fd60b065c23ccb3d4..f7b13ee3269a818ae03faebbc81dd9692b38164e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,19 +1,25 @@
 # Change Log
 All notable changes to this project will be documented in this file.
 
-## [1.1.0]
+## [1.1.0] February 2018
 
 ### Added
 
 - Interface for Multi-vector dynamic load balancing
+- Increased performance for grid ghost get
+- Introduced forms to increase the performance of the grid iterator in case of stencil code (see example 5_GrayScott)
+- EMatrix: wrapped Eigen matrices compatible with vector_dist_id
+- General tuning for high dimension vector_dist_id (up to 50 dimensions)
+- Added Discrete element Method example (8_DEM)
 
 ### Fixed
 
 - Installation/detection of PETSC
-- 2D Fixing IO in binary for vector
 - CRITICAL-BUG scalar product in combination with vector product is broken (it return 0)
+- Fixing 2D IO in binary for vector
+- Fixing 1D grid writer in ASCII mode
 
-## [1.0.0] 13 September 2017
+## [1.0.0] 13 September 2017 (Codename: Vortex)
 
 ### Added
 - Introduced getDomainIterator for Cell-list
diff --git a/configure.ac b/configure.ac
index 519f74fcbea28eb455cb7b4502eee2aee3d061bc..75e0932e20c6b990155e39add389456d6f622531 100644
--- a/configure.ac
+++ b/configure.ac
@@ -64,8 +64,7 @@ INCLUDES_PATH=" "
 echo "$base" > install_dir
 
 # Needed for build library
-AC_PROG_RANLIB
-AM_PROG_AR
+LT_INIT
 
 # Checks for programs.
 AC_PROG_CXX
diff --git a/dep_dir b/dep_dir
new file mode 100644
index 0000000000000000000000000000000000000000..1f7391f92b6a3792204e07e99f71f643cc35e7e1
--- /dev/null
+++ b/dep_dir
@@ -0,0 +1 @@
+master
diff --git a/example/Grid/0_simple/main.cpp b/example/Grid/0_simple/main.cpp
index 30ae5a9472b27365b89de2a6e0e4a5147fdcdafa..339d420e1d5b4d7738ec5c0d259ec0f5b6e8b232 100644
--- a/example/Grid/0_simple/main.cpp
+++ b/example/Grid/0_simple/main.cpp
@@ -9,6 +9,7 @@
  * \subpage Grid_2_solve_eq
  * \subpage Grid_3_gs
  * \subpage Grid_3_gs_3D
+ * \subpage Grid_3_gs_3D_vector
  *
  */
 
diff --git a/example/Grid/3_gray_scott_3d/Makefile b/example/Grid/3_gray_scott_3d/Makefile
index 170e428569ba200362dfa7bbc4a5d533f4006696..04db7e20fa78c9a519f254e71d61f27e47df69f8 100644
--- a/example/Grid/3_gray_scott_3d/Makefile
+++ b/example/Grid/3_gray_scott_3d/Makefile
@@ -7,7 +7,7 @@ LDIR =
 OBJ = main.o
 
 %.o: %.cpp
-	$(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+	$(CC) -O3 -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
 
 gray_scott: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
diff --git a/example/Grid/3_gray_scott_3d/main.cpp b/example/Grid/3_gray_scott_3d/main.cpp
index 88e814cba1bc08693a77912e74c9049be081d104..7c7e8f4f2b842b709bd80b4413273f0d3943881e 100644
--- a/example/Grid/3_gray_scott_3d/main.cpp
+++ b/example/Grid/3_gray_scott_3d/main.cpp
@@ -6,6 +6,8 @@
  *
  * \page Grid_3_gs_3D Gray Scott in 3D
  *
+ * [TOC]
+ *
  * # Solving a gray scott-system in 3D # {#e3_gs_gray_scott}
  *
  * This example is just an extension of the 2D Gray scott example.
@@ -17,9 +19,25 @@
  * <img src="http://ppmcore.mpi-cbg.de/web/images/examples/gray_scott_3d/gs_alpha.png"/>
  * \endhtmlonly
  *
+ * This example is essentially an adaptation of the previous example to 3D,
+ * with the improvement of using a stencil iterator.
+ *
+ * ## Stencil iterator {#e3_gs_grat_scott_si}
+ *
+ * A stencil iterator requires that you define a stencil,
+ *
+ * \snippet Grid/3_gray_scott_3d/main.cpp stencil def
+ *
+ * once it is defined, it is
+ * possible to get and use a stencil iterator
+ *
+ * \snippet Grid/3_gray_scott_3d/main.cpp stencil get and use
+ *
+ * The rest of the example remains the same, with the exception
+ * that the code has been extended to 3D.
+ *
  * \see \ref Grid_2_solve_eq
  *
- * \snippet Grid/3_gray_scott/main.cpp constants
  * 
  */
 
@@ -110,26 +128,10 @@ int main(int argc, char* argv[])
         double K = 0.053;
         double F = 0.014;
 
-	//! \cond [init lib] \endcond
-
-	/*!
-	 * \page Grid_3_gs_3D Gray Scott in 3D
-	 *
-	 * Here we create 2 distributed grid in 2D Old and New. In particular because we want that
-	 * the second grid is distributed across processors in the same way we pass the decomposition
-	 * of the Old grid to the New one in the constructor with **Old.getDecomposition()**. Doing this,
-	 * we force the two grid to have the same decomposition.
-	 *
-	 * \snippet Grid/3_gray_scott/main.cpp init grid
-	 *
-	 */
-
-	//! \cond [init grid] \endcond
-
 	grid_dist_id<3, double, aggregate<double,double>> Old(sz,domain,g,bc);
 
 	// New grid with the decomposition of the old grid
-        grid_dist_id<3, double, aggregate<double,double>> New(Old.getDecomposition(),sz,g);
+    grid_dist_id<3, double, aggregate<double,double>> New(Old.getDecomposition(),sz,g);
 
 	
 	// spacing of the grid on x and y
@@ -149,33 +151,39 @@ int main(int argc, char* argv[])
 	timer tot_sim;
 	tot_sim.start();
 
+	//! \cond [stencil def] \endcond
+
 	static grid_key_dx<3> star_stencil_3D[7] = {{0,0,0},
                                          	    {0,0,-1},
-						    {0,0,1},
-						    {0,-1,0},
-						    {0,1,0},
-						    {-1,0,0},
-						    {1,0,0}};
+												{0,0,1},
+												{0,-1,0},
+												{0,1,0},
+												{-1,0,0},
+												{1,0,0}};
+
+	//! \cond [stencil def] \endcond
 
 	for (size_t i = 0; i < timeSteps; ++i)
 	{
 		if (i % 300 == 0)
 			std::cout << "STEP: " << i << std::endl;
 
+		//! \cond [stencil get and use] \endcond
+
 		auto it = Old.getDomainIteratorStencil(star_stencil_3D);
 
 		while (it.isNext())
 		{
 			// center point
-			auto Cp = it.getStencil<0>();
+			auto Cp = it.getStencilGrid<0>();
 
 			// plus,minus X,Y,Z
-			auto mx = it.getStencil<1>();
-			auto px = it.getStencil<2>();
-			auto my = it.getStencil<3>();
-			auto py = it.getStencil<4>();
-			auto mz = it.getStencil<5>();
-			auto pz = it.getStencil<6>();
+			auto mx = it.getStencilGrid<1>();
+			auto px = it.getStencilGrid<2>();
+			auto my = it.getStencilGrid<3>();
+			auto py = it.getStencilGrid<4>();
+			auto mz = it.getStencilGrid<5>();
+			auto pz = it.getStencilGrid<6>();
 
 			// update based on Eq 2
 			New.get<U>(Cp) = Old.get<U>(Cp) + uFactor * (
@@ -206,6 +214,8 @@ int main(int argc, char* argv[])
 			++it;
 		}
 
+		//! \cond [stencil get and use] \endcond
+
 		// Here we copy New into the old grid in preparation of the new step
 		// It would be better to alternate, but using this we can show the usage
 		// of the function copy. To note that copy work only on two grid of the same
@@ -216,11 +226,11 @@ int main(int argc, char* argv[])
 		// After copy we synchronize again the ghost part U and V
 		Old.ghost_get<U,V>();
 
-		// Every 30 time step we output the configuration for
+		// Every 500 time step we output the configuration for
 		// visualization
-		if (i % 60 == 0)
+		if (i % 500 == 0)
 		{
-			Old.write_frame("output",count,VTK_WRITER | FORMAT_BINARY);
+			Old.save("output_" + std::to_string(count));
 			count++;
 		}
 	}
@@ -246,4 +256,13 @@ int main(int argc, char* argv[])
 	openfpm_finalize();
 
 	//! \cond [finalize] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D Gray Scott in 3D
+	 *
+	 * # Full code # {#code}
+	 *
+	 * \include Grid/3_gray_scott_3d/main.cpp
+	 *
+	 */
 }
diff --git a/example/Grid/3_gray_scott_3d_vectorization/Makefile b/example/Grid/3_gray_scott_3d_vectorization/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..9488bea01f7cb9eb49d5570ea692d8091ff9cfcb
--- /dev/null
+++ b/example/Grid/3_gray_scott_3d_vectorization/Makefile
@@ -0,0 +1,27 @@
+include ../../example.mk
+
+CC=mpic++
+
+LDIR =
+
+# Vc install prefix; override on the command line with: make VC_DIR=/path/to/Vc
+VC_DIR ?= /home/i-bird/VC
+OBJ = main.o update_new.o
+
+%.o: %.f90
+	mpif90 -ffree-line-length-none -fno-range-check -fno-second-underscore  -fimplicit-none  -mavx -O3 -c -g -o $@ $<
+
+%.o: %.cpp
+	$(CC) -O3 -mavx  -g -c --std=c++11 -Wno-ignored-attributes  -o  $@ $< $(INCLUDE_PATH) -I$(VC_DIR)/include
+
+gray_scott: $(OBJ)
+	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) -L$(VC_DIR)/lib  -lVc
+
+all: gray_scott
+
+run: all
+	mpirun -np 4 ./gray_scott
+
+.PHONY: clean all run
+
+clean:
+	rm -f *.o *~ core gray_scott
diff --git a/example/Grid/3_gray_scott_3d_vectorization/config.cfg b/example/Grid/3_gray_scott_3d_vectorization/config.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..1eecbac3577c765edca7f90cf5f61cfb6b9f4880
--- /dev/null
+++ b/example/Grid/3_gray_scott_3d_vectorization/config.cfg
@@ -0,0 +1,2 @@
+[pack]
+files = main.cpp Makefile update_new.f90
diff --git a/example/Grid/3_gray_scott_3d_vectorization/main.cpp b/example/Grid/3_gray_scott_3d_vectorization/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f153be0dcdf63e7403b9570a72f96d68317c532c
--- /dev/null
+++ b/example/Grid/3_gray_scott_3d_vectorization/main.cpp
@@ -0,0 +1,416 @@
+#include "Grid/grid_dist_id.hpp"
+#include "data_type/aggregate.hpp"
+#include "timer.hpp"
+#include "Vc/Vc"
+
+/*!
+ *
+ * \page Grid_3_gs_3D_vector Gray Scott in 3D fast implementation with vectorization
+ *
+ * # Solving a gray scott-system in 3D # {#e3_gs_gray_scott_vector}
+ *
+ * This example is an improved version of the previous 3D Gray-Scott example.
+ * We make the following improvements. First, we separate U and V into two grids
+ * in order to vectorize: every loop iteration now handles 4 doubles with AVX-256 and 2 doubles
+ * with SSE. Second, we avoid the function copy by alternating the use of the
+ * fields New and Old: if in the first iteration we read from Old and write to New, in
+ * the second iteration we read from New and write to Old. The last improvement is to write
+ * HDF5 rather than VTK. VTK writers are convenient but slow. HDF5
+ * files can be saved with **save()**, reloaded with **load()**, and after loading can be written
+ * to VTK with **write**; this means that HDF5 files can easily be converted into VTK at a later time.
+ * Moreover, because HDF5 files can be saved on multiple processors and reloaded on a different
+ * number of processors, you can use this method to stitch VTK files together.
+ *
+ *
+ * In figure is the final solution of the problem
+ *
+ * \htmlonly
+ * <img src="http://ppmcore.mpi-cbg.de/web/images/examples/gray_scott_3d/gs_alpha.png"/>
+ * \endhtmlonly
+ *
+ * \see \ref Grid_2_solve_eq
+ *
+ * \snippet Grid/3_gray_scott_3d_vectorization/main.cpp constants
+ * 
+ */
+
+//! \cond [constants] \endcond
+
+//#define FORTRAN_UPDATE
+
+constexpr int x = 0; // index of the x direction (used below as spacing[x])
+constexpr int y = 1; // index of the y direction
+constexpr int z = 2; // index of the z direction
+
+extern "C" void update_new(const int* lo, const int* hi, // kernel implemented in update_new.f90
+                  double* u, const int* ulo, const int* uhi,
+                  double* v, const int* vlo, const int* vhi,
+                  double* flu, const int* fulo, const int* fuhi, // receives the updated u (u_new in update_new.f90)
+                  double* flv, const int* fvlo, const int* fvhi, // receives the updated v (v_new in update_new.f90)
+                  const double * dt, const double * uFactor, const double * vFactor, const double * F,
+                  const double * K);
+
+
+//! \cond [constants] \endcond
+
+// Set U = 1 and V = 0 on the whole domain (New fields set to 0), then perturb U and V with random
+// noise in the sub-box of grid indexes [size*1.55/high, size*1.85/high] taken in each direction
+void init(grid_dist_id<3,double,aggregate<double> > & OldU,
+		  grid_dist_id<3,double,aggregate<double> > & OldV,
+		  grid_dist_id<3,double,aggregate<double> > & NewU,
+		  grid_dist_id<3,double,aggregate<double> > & NewV,
+		  Box<3,double> & domain)
+{
+	auto it = OldU.getDomainIterator();
+	while (it.isNext())
+	{
+		// Get the local grid key
+		auto key = it.get();
+
+		// Old values U and V
+		OldU.get(key) = 1.0;
+		OldV.get(key) = 0.0;
+
+		// New values U and V
+		NewU.get(key) = 0.0;
+		NewV.get(key) = 0.0;
+
+		++it;
+	}
+
+	long int x_start = OldU.size(0)*1.55f/domain.getHigh(0);
+	long int y_start = OldU.size(1)*1.55f/domain.getHigh(1);
+	long int z_start = OldU.size(2)*1.55f/domain.getHigh(2);
+
+	long int x_stop = OldU.size(0)*1.85f/domain.getHigh(0);
+	long int y_stop = OldU.size(1)*1.85f/domain.getHigh(1);
+	long int z_stop = OldU.size(2)*1.85f/domain.getHigh(2);
+
+	grid_key_dx<3> start({x_start,y_start,z_start});
+	grid_key_dx<3> stop ({x_stop,y_stop,z_stop});
+	auto it_init = OldU.getSubDomainIterator(start,stop);
+
+	while (it_init.isNext())
+	{
+		auto key = it_init.get();
+		OldU.get(key) = 0.5 + (((double)std::rand())/RAND_MAX -0.5)/10.0;
+		OldV.get(key) = 0.25 + (((double)std::rand())/RAND_MAX -0.5)/20.0;
+
+		++it_init;
+	}
+}
+
+
+//! \cond [vectorization] \endcond
+
+// Compute one Gray-Scott time step: read OldU/OldV and store the updated values in NewU/NewV.
+// uFactor/vFactor are the diffusion prefactors, deltaT the time step, F and K the model constants.
+void step(grid_dist_id<3, double, aggregate<double>> & OldU,
+		  grid_dist_id<3, double, aggregate<double>> & OldV,
+		  grid_dist_id<3, double, aggregate<double>> & NewU,
+		  grid_dist_id<3, double, aggregate<double>> & NewV,
+		  grid_key_dx<3> (& star_stencil_3D)[7],
+		  Vc::double_v uFactor, Vc::double_v vFactor, double deltaT, double F, double K)
+{
+#ifndef FORTRAN_UPDATE
+	//! \cond [cpp_update] \endcond
+
+	WHILE_M(OldU,star_stencil_3D)
+			auto & U_old = GET_GRID_M(OldU);
+			auto & V_old = GET_GRID_M(OldV);
+
+			auto & U_new = GET_GRID_M(NewU);
+			auto & V_new = GET_GRID_M(NewV);
+	ITERATE_3D_M(Vc::double_v::Size)
+
+			// center point
+			auto Cp = it.getStencil<0>();
+
+			// plus,minus X,Y,Z
+			auto mx = it.getStencil<1>();
+			auto px = it.getStencil<2>();
+			auto my = it.getStencil<3>();
+			auto py = it.getStencil<4>();
+			auto mz = it.getStencil<5>();
+			auto pz = it.getStencil<6>();
+
+			// load the stencil points of U and V into vector registers (unaligned loads)
+			Vc::double_v u_c(&U_old.get<0>(Cp),Vc::Unaligned);
+			Vc::double_v u_mz(&U_old.get<0>(mz),Vc::Unaligned);
+			Vc::double_v u_pz(&U_old.get<0>(pz),Vc::Unaligned);
+			Vc::double_v u_my(&U_old.get<0>(my),Vc::Unaligned);
+			Vc::double_v u_py(&U_old.get<0>(py),Vc::Unaligned);
+			Vc::double_v u_mx(&U_old.get<0>(mx),Vc::Unaligned);
+			Vc::double_v u_px(&U_old.get<0>(px),Vc::Unaligned);
+
+			Vc::double_v v_c(&V_old.get<0>(Cp),Vc::Unaligned);
+			Vc::double_v v_mz(&V_old.get<0>(mz),Vc::Unaligned);
+			Vc::double_v v_pz(&V_old.get<0>(pz),Vc::Unaligned);
+			Vc::double_v v_my(&V_old.get<0>(my),Vc::Unaligned);
+			Vc::double_v v_py(&V_old.get<0>(py),Vc::Unaligned);
+			Vc::double_v v_mx(&V_old.get<0>(mx),Vc::Unaligned);
+			Vc::double_v v_px(&V_old.get<0>(px),Vc::Unaligned);
+
+			Vc::double_v out1 = u_c + uFactor * (u_mz + u_pz +
+												u_my + u_py +
+												u_mx + u_px +
+												- 6.0 * u_c) +
+												- deltaT * u_c * v_c * v_c
+												- deltaT * F * (u_c - 1.0);
+
+			Vc::double_v out2 = v_c + vFactor * (v_mz + v_pz +
+												v_my + v_py +
+												v_mx + v_px +
+												- 6.0 * v_c ) +
+												deltaT * u_c * v_c * v_c +
+												- deltaT * (F+K) * v_c;
+
+			out1.store(&U_new.get<0>(Cp),Vc::Unaligned);
+			out2.store(&V_new.get<0>(Cp),Vc::Unaligned);
+	END_LOOP_M
+
+	//! \cond [cpp_update] \endcond
+
+#else
+
+	//! \cond [fort_update] \endcond
+
+	double uFactor_s = uFactor[0];
+	double vFactor_s = vFactor[0];
+
+	auto & ginfo = OldU.getLocalGridsInfo();
+
+	for (size_t i = 0 ; i < OldU.getN_loc_grid() ; i++)
+	{
+		auto & U_old = OldU.get_loc_grid(i);
+		auto & V_old = OldV.get_loc_grid(i);
+
+		auto & U_new = NewU.get_loc_grid(i);
+		auto & V_new = NewV.get_loc_grid(i);
+
+		int lo[3] = {(int)ginfo.get(i).Dbox.getLow(0),(int)ginfo.get(i).Dbox.getLow(1),(int)ginfo.get(i).Dbox.getLow(2)};
+		int hi[3] = {(int)ginfo.get(i).Dbox.getHigh(0),(int)ginfo.get(i).Dbox.getHigh(1),(int)ginfo.get(i).Dbox.getHigh(2)};
+
+		int ulo[3] = {0,0,0};
+		int uhi[3] = {(int)ginfo.get(i).GDbox.getHigh(0),(int)ginfo.get(i).GDbox.getHigh(1),(int)ginfo.get(i).GDbox.getHigh(2)};
+		int nulo[3] = {0,0,0};
+		int nuhi[3] = {(int)ginfo.get(i).GDbox.getHigh(0),(int)ginfo.get(i).GDbox.getHigh(1),(int)ginfo.get(i).GDbox.getHigh(2)};
+
+		int vlo[3] = {0,0,0};
+		int vhi[3] = {(int)ginfo.get(i).GDbox.getHigh(0),(int)ginfo.get(i).GDbox.getHigh(1),(int)ginfo.get(i).GDbox.getHigh(2)};
+		int nvlo[3] = {0,0,0};
+		int nvhi[3] = {(int)ginfo.get(i).GDbox.getHigh(0),(int)ginfo.get(i).GDbox.getHigh(1),(int)ginfo.get(i).GDbox.getHigh(2)};
+
+		update_new(lo,hi,
+				   (double *)U_old.getPointer(),ulo,uhi,
+				   (double *)V_old.getPointer(),vlo,vhi,
+				   (double *)U_new.getPointer(),nulo,nuhi,
+				   (double *)V_new.getPointer(),nvlo,nvhi,
+				   &deltaT, &uFactor_s, &vFactor_s,&F,&K);
+	}
+
+	//! \cond [fort_update] \endcond
+
+#endif
+}
+
+//! \cond [vectorization] \endcond
+
+// Set up the grids, run timeSteps Gray-Scott steps alternating Old and New, and write the final V field
+int main(int argc, char* argv[])
+{
+	openfpm_init(&argc,&argv);
+
+	// domain (one low and one high coordinate for each of the 3 dimensions)
+	Box<3,double> domain({0.0,0.0,0.0},{2.5,2.5,2.5});
+
+	// grid size
+	size_t sz[3] = {256,256,256};
+
+	// Define periodicity of the grid
+	periodicity<3> bc = {PERIODIC,PERIODIC,PERIODIC};
+
+	// Ghost in grid unit
+	Ghost<3,long int> g(1);
+
+	// deltaT
+	double deltaT = 1;
+
+	// Diffusion constant for specie U
+	double du = 2*1e-5;
+
+	// Diffusion constant for specie V
+	double dv = 1*1e-5;
+
+	// Number of timesteps
+	size_t timeSteps = 5000;
+
+	// K and F (Physical constant in the equation)
+	double K = 0.053;
+	double F = 0.014;
+
+	//! \cond [init lib] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_vector Gray Scott in 3D fast implementation with vectorization
+	 *
+	 * Here we create 2 distributed grid in 3D Old and New splitting U and V in two different fields.
+	 *  In particular because we want that all the grids are distributed across processors in the same
+	 *   way we pass the decomposition of the first grid.
+	 *
+	 * \snippet Grid/3_gray_scott_3d_vectorization/main.cpp init grid
+	 *
+	 */
+
+	//! \cond [init grid] \endcond
+
+	grid_dist_id<3, double, aggregate<double>> OldU(sz,domain,g,bc);
+	grid_dist_id<3, double, aggregate<double>> OldV(OldU.getDecomposition(),sz,g);
+
+	// New grid with the decomposition of the old grid
+	grid_dist_id<3, double, aggregate<double>> NewU(OldU.getDecomposition(),sz,g);
+	grid_dist_id<3, double, aggregate<double>> NewV(OldV.getDecomposition(),sz,g);
+
+	// spacing of the grid on x, y and z
+	double spacing[3] = {OldU.spacing(0),OldU.spacing(1),OldU.spacing(2)};
+
+	init(OldU,OldV,NewU,NewV,domain);
+
+	//! \cond [init grid] \endcond
+
+	// sync the ghost
+	size_t count = 0;
+	OldU.template ghost_get<0>();
+	OldV.template ghost_get<0>();
+
+	// because we assume that spacing[x] == spacing[y] == spacing[z] we use formula 2
+	// and we calculate the prefactor of Eq 2
+	Vc::double_v uFactor = deltaT * du/(spacing[x]*spacing[x]);
+	Vc::double_v vFactor = deltaT * dv/(spacing[x]*spacing[x]);
+
+	timer tot_sim;
+	tot_sim.start();
+
+	static grid_key_dx<3> star_stencil_3D[7] = {{0,0,0},
+												{0,0,-1},
+												{0,0,1},
+												{0,-1,0},
+												{0,1,0},
+												{-1,0,0},
+												{1,0,0}};
+
+	for (size_t i = 0; i < timeSteps; ++i)
+	{
+		if (i % 300 == 0)
+			std::cout << "STEP: " << i << std::endl;
+
+		/*!
+		 * \page Grid_3_gs_3D_vector Gray Scott in 3D fast implementation with vectorization
+		 *
+		 * Alternate New and Old field to run one step, switch between old and new if the iteration
+		 * is even or odd. The function step is nothing else than the implementation of Gray-Scott
+		 * 3D in the previous example but in a more optimized way.
+		 *
+		 * \snippet Grid/3_gray_scott_3d_vectorization/main.cpp alternate
+		 *
+		 * In this function we show two methods to optimize this function.
+		 *
+		 * * We can use the macro **WHILE_M** passing the stencil definition, **ITERATE_3D_M** to define the loop,
+		 *  **END_LOOP_M** to close the loop, and use the
+		 * function **getStencil<0>()** to retrieve the stencil points. Additionally we can use Vc::double_v instead
+		 *  of double to vectorize the code. This method gives the advantage of keeping all the
+		 * code in C++.
+		 *
+		 * \snippet Grid/3_gray_scott_3d_vectorization/main.cpp cpp_update
+		 *
+		 * * Another possibility is to use FORTRAN. Because FORTRAN has better
+		 *  support for multi-dimensional arrays, we can process each local grid using
+		 *  FORTRAN; this also gives us the opportunity to show hybrid code. We can switch between
+		 *   one method and the other by commenting
+		 *  and uncommenting the line #define FORTRAN_UPDATE in the code.
+		 *
+		 * \snippet Grid/3_gray_scott_3d_vectorization/main.cpp fort_update
+		 *
+		 * \include Grid/3_gray_scott_3d_vectorization/update_new.f90
+		 *
+		 */
+
+		//! \cond [alternate] \endcond
+
+		if (i % 2 == 0)
+		{
+			step(OldU,OldV,NewU,NewV,star_stencil_3D,uFactor,vFactor,deltaT,F,K);
+
+			NewU.ghost_get<0>();
+			NewV.ghost_get<0>();
+		}
+		else
+		{
+			step(NewU,NewV,OldU,OldV,star_stencil_3D,uFactor,vFactor,deltaT,F,K);
+
+			OldU.ghost_get<0>();
+			OldV.ghost_get<0>();
+		}
+
+		//! \cond [alternate] \endcond
+
+		/*!
+		 * \page Grid_3_gs_3D_vector Gray Scott in 3D fast implementation with vectorization
+		 *
+		 * Instead of using the function **write** we use the function **save** to save on HDF5
+		 *
+		 * \snippet Grid/3_gray_scott_3d_vectorization/main.cpp save hdf5
+		 *
+		 */
+
+		//! \cond [save hdf5] \endcond
+
+		// Every 2000 time step we output the configuration on hdf5
+		if (i % 2000 == 0)
+		{
+			OldU.save("output_u_" + std::to_string(count));
+			OldV.save("output_v_" + std::to_string(count));
+			count++;
+		}
+
+		//! \cond [save hdf5] \endcond
+	}
+
+	tot_sim.stop();
+
+	if (create_vcluster().rank() == 0)
+	{std::cout << "Total simulation: " << tot_sim.getwct() << std::endl;}
+
+	// We write the final configuration
+	OldV.write("final");
+
+	//! \cond [time stepping] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_vector Gray Scott in 3D fast implementation with vectorization
+	 *
+	 * ## Finalize ##
+	 *
+	 * Deinitialize the library
+	 *
+	 * \snippet Grid/3_gray_scott_3d_vectorization/main.cpp finalize
+	 *
+	 */
+
+	//! \cond [finalize] \endcond
+
+	openfpm_finalize();
+
+	//! \cond [finalize] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_vector Gray Scott in 3D fast implementation with vectorization
+	 *
+	 * # Full code # {#code}
+	 *
+	 * \include Grid/3_gray_scott_3d_vectorization/main.cpp
+	 *
+	 */
+}
+
+
diff --git a/example/Grid/3_gray_scott_3d_vectorization/update_new.f90 b/example/Grid/3_gray_scott_3d_vectorization/update_new.f90
new file mode 100644
index 0000000000000000000000000000000000000000..5728a019ea7043dd6bc324287a77b7b48ada4079
--- /dev/null
+++ b/example/Grid/3_gray_scott_3d_vectorization/update_new.f90
@@ -0,0 +1,48 @@
+subroutine update_new ( &
+                              lo, hi, &
+                               u, ulo, uhi, &
+                               v, vlo, vhi, &
+                               u_new, nulo, nuhi, &
+                               v_new, nvlo, nvhi, &
+                               dt, uFactor, vFactor,F,Kf) bind(C, name="update_new")
+
+  implicit none
+
+  integer, intent(in) :: lo(3), hi(3)   ! inclusive index range covered by the loops below
+  integer, intent(in) :: ulo(3), uhi(3) ! index bounds of array u
+  integer, intent(in) :: vlo(3), vhi(3) ! index bounds of array v
+  integer, intent(in) :: nulo(3), nuhi(3), nvlo(3), nvhi(3) ! index bounds of u_new and v_new
+  real*8, intent(in)    :: u  (ulo(1):uhi(1),ulo(2):uhi(2),ulo(3):uhi(3))
+  real*8, intent(in)    :: v  (vlo(1):vhi(1),vlo(2):vhi(2),vlo(3):vhi(3))
+  real*8, intent(inout) :: u_new( nulo(1): nuhi(1), nulo(2): nuhi(2), nulo(3): nuhi(3))
+  real*8, intent(inout) :: v_new( nvlo(1): nvhi(1), nvlo(2): nvhi(2), nvlo(3): nvhi(3))
+  real*8, intent(in) :: dt, F, Kf, uFactor, vFactor
+
+  ! local variables (loop indexes)
+  integer i,j,k
+
+  ! update u_new and v_new from u and v at every point of lo:hi using the 6 neighbours of the point
+  do       k = lo(3), hi(3)
+     do    j = lo(2), hi(2)
+        do i = lo(1), hi(1)
+           u_new(i,j,k) = u(i,j,k) + uFactor * ( u(i+1,j,k) + u(i-1,j,k) + &
+                                    u(i,j+1,k) + u(i,j-1,k) + &
+                                    u(i,j,k-1) + u(i,j,k+1) - &
+                                    6.0*u(i,j,k) ) - &
+                                    dt * u(i,j,k)*v(i,j,k)*v(i,j,k) - &
+                                    dt * F * (u(i,j,k) - 1.0)
+
+
+           v_new(i,j,k) = v(i,j,k) + vFactor * ( v(i+1,j,k) + v(i-1,j,k) + &
+                                    v(i,j+1,k) + v(i,j-1,k) + &
+                                    v(i,j,k-1) + v(i,j,k+1) - &
+                                    6.0*v(i,j,k) ) + &
+                                    dt * u(i,j,k)*v(i,j,k)*v(i,j,k) - &
+                                    dt * (F+Kf) * v(i,j,k)
+        end do
+     end do
+  end do
+
+
+end subroutine update_new
+
diff --git a/example/Numerics/PS-CMA-ES/Makefile b/example/Numerics/PS-CMA-ES/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..ea9ed12dd4edc5b71386367cc308ef374cb5d3fa
--- /dev/null
+++ b/example/Numerics/PS-CMA-ES/Makefile
@@ -0,0 +1,24 @@
+include ../../example.mk
+
+CC=mpic++
+
+LDIR =
+
+OBJ = main.o
+
+%.o: %.cpp
+	$(CC) -O3 -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
+
+ps_cma_es: $(OBJ)
+	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
+
+all: ps_cma_es
+
+run: all
+	mpirun -np 2 ./ps_cma_es
+
+.PHONY: clean all run
+
+clean:
+	rm -f *.o *~ core ps_cma_es
+
diff --git a/example/Numerics/PS-CMA-ES/f15_cec_const.hpp b/example/Numerics/PS-CMA-ES/f15_cec_const.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a0b94ab621cb893d09d7a51963a0adbb633ab7af
--- /dev/null
+++ b/example/Numerics/PS-CMA-ES/f15_cec_const.hpp
@@ -0,0 +1,26 @@
+#ifndef F15_CEC_CONST
+#define F15_CEC_CONST
+
+#include "Eigen/Dense"
+
+const double f15_const[10][100] = {3.3253000e+000,-1.2835000e+000,1.8984000e+000,-4.0950000e-001,8.8100000e-002,2.7580000e+000,9.7760000e-001,-1.8090000e+000,-2.4957000e+000,2.7367000e+000,-2.8961000e+000,-2.9413000e+000,3.2682000e+000,3.6495000e+000,5.0310000e-001,-4.2187000e+000,-7.8880000e-001,3.8384000e+000,1.3397000e+000,2.9312000e+000,-3.2869000e+000,3.6213000e+000,3.4834000e+000,-4.2260000e+000,-1.0784000e+000,-4.4730000e-001,2.7601000e+000,4.6200000e-002,4.3606000e+000,-2.0347000e+000,-3.5089000e+000,-2.2028000e+000,-1.6626000e+000,-1.7293000e+000,1.8300000e-002,-4.0478000e+000,-1.4034000e+000,-1.7866000e+000,-2.4212000e+000,-7.4340000e-001,-1.5438000e+000,-3.0272000e+000,2.6055000e+000,-3.1007000e+000,2.8010000e+000,1.4532000e+000,2.9887000e+000,1.2490000e-001,3.0169000e+000,-1.3140000e+000,3.4259000e+000,2.4992000e+000,9.7940000e-001,-1.8358000e+000,-3.8253000e+000,2.7079000e+000,1.9604000e+000,1.3197000e+000,-2.4128000e+000,-4.7550000e-001,2.1681000e+000,3.2344000e+000,3.3590000e+000,-1.4400000e-002,-1.8732000e+000,-1.9496000e+000,4.7110000e-001,8.2110000e-001,-1.9980000e-001,3.7350000e+000,-3.3720000e-001,-1.7267000e+000,-6.5950000e-001,-3.0531000e+000,-4.1052000e+000,2.3991000e+000,-3.2670000e-001,1.3279000e+000,3.7907000e+000,-1.8800000e+000,2.9056000e+000,1.8643000e+000,-5.2800000e-001,-1.9298000e+000,-8.5830000e-001,6.0570000e-001,-1.4152000e+000,4.0411000e+000,2.1519000e+000,3.6699000e+000,-1.3395000e+000,1.5661000e+000,-2.4511000e+000,-2.3292000e+000,1.2750000e-001,-3.0202000e+000,-1.0900000e+000,-3.6965000e+000,-1.1674000e+000,1.5488000e+000\
+,-2.2465000e+000,3.9382000e+000,-7.9990000e-001,-4.2205000e+000,-2.9393000e+000,-4.3433000e+000,1.6348000e+000,3.1011000e+000,-9.2690000e-001,-2.5328000e+000,1.1839000e+000,-2.9094000e+000,2.8157000e+000,-2.2581000e+000,-1.2020000e-001,4.1264000e+000,-3.5397000e+000,9.9270000e-001,-3.5492000e+000,3.5247000e+000,1.3420000e-001,-3.6413000e+000,1.8572000e+000,-3.8112000e+000,-5.1770000e-001,-4.3274000e+000,-3.9499000e+000,1.6129000e+000,-3.6689000e+000,3.7438000e+000,-2.7323000e+000,-3.7832000e+000,-8.9120000e-001,2.7046000e+000,4.3176000e+000,1.6330000e+000,-7.3310000e-001,-1.7864000e+000,7.8770000e-001,-7.4840000e-001,-4.2517000e+000,1.9060000e+000,-2.8498000e+000,-1.5533000e+000,4.6900000e-001,-2.4370000e+000,-1.4181000e+000,2.5617000e+000,2.5139000e+000,1.4288000e+000,-4.4704000e+000,1.7249000e+000,-3.3660000e+000,-3.3114000e+000,3.7927000e+000,4.3542000e+000,-4.1577000e+000,-2.0797000e+000,-4.1467000e+000,9.2950000e-001,-3.0638000e+000,3.2160000e-001,-3.3978000e+000,-1.0110000e+000,2.9571000e+000,-2.8415000e+000,3.9917000e+000,-8.3070000e-001,-2.4358000e+000,-4.4600000e-001,-4.3041000e+000,-1.6925000e+000,2.2654000e+000,3.4818000e+000,4.1474000e+000,-2.9950000e-001,-3.9534000e+000,4.3495000e+000,-1.9537000e+000,4.3235000e+000,1.8070000e+000,1.8991000e+000,4.3600000e-001,-2.7083000e+000,-2.8008000e+000,-3.8717000e+000,-2.8596000e+000,3.3523000e+000,1.0176000e+000,3.0750000e-001,1.0646000e+000,-2.1500000e+000,-2.8102000e+000,-4.4798000e+000,5.4910000e-001,-3.5628000e+000,-2.9116000e+000,4.2015000e+000,-4.0054000e+000,-1.7860000e+000\
+,1.7378000e+000,-4.4943000e+000,3.1910000e-001,-3.2414000e+000,1.2388000e+000,2.4878000e+000,-4.1218000e+000,-2.6083000e+000,-2.8100000e-001,3.4336000e+000,-9.1790000e-001,1.8598000e+000,3.0641000e+000,-3.6210000e-001,5.5760000e-001,-2.4441000e+000,-2.2366000e+000,-5.2660000e-001,4.3657000e+000,1.9230000e+000,4.1786000e+000,-4.0957000e+000,-3.1427000e+000,3.2389000e+000,1.5794000e+000,-4.2122000e+000,3.6131000e+000,-3.7048000e+000,-1.1209000e+000,-1.3074000e+000,3.3170000e-001,-7.8830000e-001,-4.1121000e+000,3.4491000e+000,8.0040000e-001,4.6400000e-001,-2.3190000e+000,-1.5837000e+000,2.6929000e+000,-1.0506000e+000,2.1719000e+000,-1.4665000e+000,2.9197000e+000,1.1862000e+000,2.7782000e+000,-1.8744000e+000,3.7281000e+000,-1.9880000e+000,1.7553000e+000,-2.2000000e-003,1.8093000e+000,-1.8750000e-001,1.0918000e+000,1.4909000e+000,-3.6830000e-001,-2.6889000e+000,3.4836000e+000,-3.8560000e+000,2.1800000e+000,2.3080000e-001,-3.9911000e+000,1.7939000e+000,2.1553000e+000,-6.7330000e-001,3.1100000e-002,3.1810000e-001,1.1508000e+000,4.0126000e+000,3.3660000e-001,1.6810000e-001,-2.7874000e+000,-2.8053000e+000,2.0216000e+000,-1.2160000e+000,-1.0253000e+000,-4.3512000e+000,1.0470000e+000,-4.2517000e+000,1.1389000e+000,1.9587000e+000,-2.1917000e+000,-3.0772000e+000,1.7597000e+000,-1.8843000e+000,6.5440000e-001,-3.1522000e+000,-2.9200000e+000,3.8666000e+000,-3.8922000e+000,8.7910000e-001,-3.5768000e+000,-8.5390000e-001,2.5625000e+000,4.2585000e+000,-4.4818000e+000,-3.8622000e+000,3.3085000e+000,-4.3596000e+000,8.1780000e-001,9.1200000e-001\
+,-1.5504000e+000,-4.3339000e+000,8.9780000e-001,-1.3839000e+000,4.2338000e+000,2.6283000e+000,-8.5900000e-001,-1.8942000e+000,1.7407000e+000,-1.2537000e+000,1.3161000e+000,-3.1738000e+000,2.8559000e+000,5.5400000e-002,4.4044000e+000,-9.9420000e-001,-4.2987000e+000,3.1463000e+000,2.7438000e+000,5.7790000e-001,-1.4390000e-001,1.2750000e-001,6.6570000e-001,-4.2310000e+000,-9.9960000e-001,-5.8970000e-001,-4.5240000e-001,1.2225000e+000,-2.2496000e+000,-8.3210000e-001,3.0274000e+000,-2.5083000e+000,-2.9410000e-001,4.0227000e+000,5.5750000e-001,-1.7082000e+000,-3.4266000e+000,-1.3186000e+000,-4.4435000e+000,3.8150000e+000,-2.4026000e+000,-1.9805000e+000,-3.9941000e+000,1.7521000e+000,-2.4486000e+000,-3.5640000e+000,2.8962000e+000,-4.0694000e+000,-2.1251000e+000,-6.6470000e-001,8.6770000e-001,1.6697000e+000,-3.1888000e+000,2.2569000e+000,-4.0996000e+000,-2.2851000e+000,3.3932000e+000,-2.1120000e+000,-4.2880000e-001,-2.5799000e+000,1.4335000e+000,1.8559000e+000,3.8649000e+000,-2.8086000e+000,-4.2600000e+000,3.8483000e+000,-1.1864000e+000,3.9864000e+000,3.1897000e+000,-3.5196000e+000,5.1700000e-002,-1.6459000e+000,6.2250000e-001,1.6034000e+000,-2.5510000e-001,2.3951000e+000,-2.9564000e+000,3.4064000e+000,-2.6920000e+000,-2.0972000e+000,-4.2272000e+000,-4.2909000e+000,-1.9785000e+000,2.6433000e+000,-2.4360000e+000,-3.6061000e+000,-3.3946000e+000,1.2882000e+000,1.3625000e+000,-2.2262000e+000,-5.2730000e-001,2.1000000e-002,-2.2951000e+000,3.5702000e+000,2.3550000e+000,-3.5396000e+000,4.3945000e+000,-7.5000000e-003,1.5150000e+000,-1.8038000e+000\
+,-2.7358000e+000,4.8530000e-001,2.8932000e+000,1.7750000e-001,1.5936000e+000,2.3591000e+000,3.4123000e+000,-3.1076000e+000,3.8696000e+000,2.3145000e+000,-3.7018000e+000,-4.3772000e+000,-2.1313000e+000,-1.1746000e+000,2.2956000e+000,1.3502000e+000,3.2284000e+000,1.6749000e+000,-4.3424000e+000,-1.0206000e+000,-4.3462000e+000,-1.7030000e+000,1.6973000e+000,-1.5694000e+000,-3.0295000e+000,4.1132000e+000,-2.1307000e+000,-3.9905000e+000,-4.0260000e-001,-1.6620000e+000,-2.4780000e-001,-3.2789000e+000,-1.6622000e+000,-1.2509000e+000,3.8842000e+000,-1.5414000e+000,3.8323000e+000,3.8892000e+000,9.1310000e-001,1.0501000e+000,9.8150000e-001,-2.0173000e+000,-1.4322000e+000,-2.4151000e+000,1.0046000e+000,-3.6765000e+000,-1.1788000e+000,2.7020000e-001,-2.8760000e-001,-9.5390000e-001,2.1957000e+000,-1.2470000e-001,-4.1052000e+000,3.5400000e+000,-4.3878000e+000,2.9056000e+000,4.2666000e+000,3.9671000e+000,3.5829000e+000,3.9816000e+000,-1.4905000e+000,-1.0256000e+000,2.7029000e+000,2.6330000e-001,-1.1798000e+000,2.3091000e+000,-2.5930000e-001,-7.0060000e-001,4.3161000e+000,3.7458000e+000,4.0370000e-001,-2.0975000e+000,4.3694000e+000,2.4717000e+000,4.1478000e+000,-3.5657000e+000,-1.0330000e-001,3.7534000e+000,1.9894000e+000,3.4790000e+000,-1.8541000e+000,3.8470000e+000,1.1231000e+000,4.4799000e+000,-3.9805000e+000,1.8823000e+000,-1.0147000e+000,4.1805000e+000,4.3017000e+000,1.4542000e+000,2.0347000e+000,2.7745000e+000,-1.6727000e+000,3.5495000e+000,-1.1870000e-001,-3.1237000e+000,-3.2825000e+000,-2.3430000e-001,1.4084000e+000,2.5355000e+000\
+,-1.8717000e+000,1.8285000e+000,-3.7085000e+000,2.1660000e+000,2.7980000e-001,-2.5635000e+000,-2.1321000e+000,-2.9868000e+000,-2.5391000e+000,-1.3354000e+000,-2.7000000e-001,-3.0712000e+000,-1.7479000e+000,4.2000000e-002,-3.8396000e+000,-3.3518000e+000,3.2361000e+000,3.9031000e+000,-3.8211000e+000,-9.0720000e-001,-4.2347000e+000,-3.2265000e+000,-1.3196000e+000,-4.0690000e-001,2.2933000e+000,-1.8828000e+000,-1.7421000e+000,-1.3624000e+000,2.2034000e+000,-1.5554000e+000,3.9148000e+000,3.0060000e+000,-2.7808000e+000,3.1430000e+000,-4.3288000e+000,3.1607000e+000,2.9885000e+000,3.4770000e+000,-3.2540000e+000,3.5659000e+000,-2.3289000e+000,-6.3320000e-001,-1.3463000e+000,1.4149000e+000,-3.1376000e+000,2.8234000e+000,-1.7904000e+000,1.4993000e+000,-2.8887000e+000,2.1925000e+000,-2.6683000e+000,-4.2745000e+000,3.9368000e+000,3.6760000e+000,3.2610000e-001,1.3481000e+000,-2.0259000e+000,-3.5200000e-001,4.5020000e-001,1.2222000e+000,-1.7436000e+000,3.0571000e+000,1.5054000e+000,5.9900000e-001,-3.7241000e+000,3.8567000e+000,-1.6305000e+000,-1.7861000e+000,-4.2061000e+000,3.1688000e+000,-2.5142000e+000,1.4365000e+000,-4.2584000e+000,-2.9095000e+000,-2.7201000e+000,-3.9780000e+000,-3.8197000e+000,3.2116000e+000,-2.0274000e+000,3.3790000e+000,2.0176000e+000,3.7376000e+000,-1.7626000e+000,3.1218000e+000,1.6559000e+000,3.3415000e+000,1.5242000e+000,1.7250000e+000,2.0883000e+000,3.4093000e+000,-1.7778000e+000,-2.4508000e+000,-1.7771000e+000,2.9953000e+000,-9.4000000e-003,-1.5103000e+000,4.3057000e+000,-1.2379000e+000,2.5305000e+000,-5.9280000e-001\
+,-3.9243000e+000,-2.7541000e+000,-2.9730000e-001,1.3850000e-001,-1.8557000e+000,-2.6559000e+000,-3.6582000e+000,-7.3010000e-001,-1.1095000e+000,3.3570000e+000,3.0578000e+000,-9.7650000e-001,3.6632000e+000,4.0256000e+000,-3.2321000e+000,-1.0683000e+000,-2.4430000e-001,-2.6259000e+000,-3.6503000e+000,-2.2150000e-001,3.5501000e+000,1.5428000e+000,-2.3871000e+000,3.3152000e+000,-3.1084000e+000,-6.1890000e-001,1.6179000e+000,-4.4262000e+000,3.7020000e-001,2.6440000e+000,-2.1320000e-001,-3.0420000e+000,3.9154000e+000,1.1186000e+000,-4.2283000e+000,-6.9900000e-002,-2.6670000e+000,-2.9021000e+000,8.3800000e-002,5.8690000e-001,4.4932000e+000,-2.5687000e+000,2.3839000e+000,-5.0720000e-001,-2.3617000e+000,-2.2267000e+000,-1.8803000e+000,4.0205000e+000,-4.3917000e+000,-1.3007000e+000,2.2299000e+000,-3.8293000e+000,-2.6600000e-002,-4.1760000e-001,3.4822000e+000,-4.3770000e+000,1.7739000e+000,-3.0710000e+000,-1.9767000e+000,2.6253000e+000,4.2554000e+000,-2.3100000e-002,9.2870000e-001,1.5838000e+000,-3.7755000e+000,-1.7040000e-001,-3.5334000e+000,1.5598000e+000,1.9987000e+000,8.3790000e-001,-4.3789000e+000,-9.0710000e-001,-2.3975000e+000,-3.0816000e+000,-2.6495000e+000,-1.1391000e+000,1.1763000e+000,-4.3940000e-001,-2.1198000e+000,-1.2030000e-001,3.7726000e+000,2.3534000e+000,-3.5806000e+000,4.4012000e+000,-2.5145000e+000,-3.7623000e+000,-2.4466000e+000,-1.0585000e+000,-3.6637000e+000,-3.1811000e+000,-2.3190000e-001,-3.0277000e+000,4.3686000e+000,-3.8050000e-001,5.2390000e-001,-3.1020000e+000,3.6013000e+000,-3.2767000e+000,8.7490000e-001,3.7456000e+000\
+,-4.4362000e+000,-1.0142000e+000,2.6180000e-001,2.5649000e+000,9.5110000e-001,-1.2742000e+000,-2.0990000e+000,2.0800000e-001,-2.2216000e+000,1.9331000e+000,-3.0628000e+000,2.5846000e+000,-3.4871000e+000,-1.4314000e+000,2.9744000e+000,-1.6853000e+000,1.7319000e+000,1.1900000e-002,-2.2333000e+000,-3.6649000e+000,6.9610000e-001,-1.3300000e+000,-4.0811000e+000,2.3000000e-001,1.0150000e+000,2.4349000e+000,2.6735000e+000,-4.0300000e+000,6.6320000e-001,-4.3627000e+000,3.9411000e+000,-2.8084000e+000,-1.2894000e+000,3.2805000e+000,8.7000000e-002,7.1320000e-001,1.8993000e+000,2.5630000e+000,-3.2035000e+000,1.8894000e+000,-1.0149000e+000,4.3570000e+000,2.6677000e+000,-3.4265000e+000,-2.6617000e+000,1.4306000e+000,-3.2000000e-001,3.4067000e+000,-6.8620000e-001,-2.6620000e-001,-2.2974000e+000,6.6110000e-001,-1.1390000e-001,-2.7930000e-001,-4.0534000e+000,-2.9895000e+000,2.4600000e+000,4.0680000e+000,6.1130000e-001,-2.1785000e+000,3.5749000e+000,4.2679000e+000,7.2050000e-001,-3.4930000e-001,-2.4683000e+000,2.8913000e+000,-2.7926000e+000,-4.4259000e+000,1.6203000e+000,-7.0780000e-001,6.6900000e-002,3.3265000e+000,-8.9650000e-001,3.8144000e+000,1.8644000e+000,-3.8740000e+000,-4.4329000e+000,-2.7533000e+000,1.5199000e+000,2.4951000e+000,-4.0784000e+000,-1.5782000e+000,2.4547000e+000,1.3106000e+000,-1.6517000e+000,-2.7385000e+000,-1.8217000e+000,2.5841000e+000,3.5956000e+000,-4.4051000e+000,1.9856000e+000,-1.0218000e+000,1.1335000e+000,3.2501000e+000,-1.1250000e-001,-3.3356000e+000,7.3370000e-001,8.1000000e-001,-3.4267000e+000,-1.7573000e+000\
+,-1.4198000e+000,3.6078000e+000,2.0771000e+000,2.7252000e+000,-3.3609000e+000,-2.0665000e+000,-3.8042000e+000,-3.9882000e+000,1.9212000e+000,2.7170000e+000,-3.6992000e+000,3.0584000e+000,3.1365000e+000,2.5994000e+000,-2.2861000e+000,3.2691000e+000,-4.3166000e+000,1.5112000e+000,-3.4301000e+000,-3.1168000e+000,-4.8610000e-001,-2.1520000e-001,-4.4595000e+000,-4.4585000e+000,2.7490000e+000,-3.6245000e+000,-4.0940000e+000,3.0953000e+000,1.8911000e+000,3.3585000e+000,1.4740000e-001,5.5140000e-001,-4.1774000e+000,2.6014000e+000,-1.8385000e+000,-3.6295000e+000,-2.1696000e+000,-8.1170000e-001,2.7015000e+000,-3.0258000e+000,-3.0333000e+000,-8.4910000e-001,-2.6945000e+000,2.5817000e+000,-3.9572000e+000,1.5827000e+000,4.4099000e+000,-1.1827000e+000,2.0778000e+000,-4.1023000e+000,-8.0790000e-001,-4.3809000e+000,-3.5968000e+000,3.3155000e+000,-2.2298000e+000,3.5023000e+000,-1.4810000e-001,1.9668000e+000,-2.9158000e+000,2.4857000e+000,4.1099000e+000,-4.1160000e+000,-1.6791000e+000,-3.2401000e+000,1.3775000e+000,9.0580000e-001,3.7376000e+000,2.2668000e+000,3.8236000e+000,-9.9970000e-001,1.4778000e+000,1.6980000e-001,-1.4464000e+000,7.0470000e-001,-2.8071000e+000,-8.7970000e-001,-1.8638000e+000,1.7548000e+000,-1.9996000e+000,1.5675000e+000,2.9245000e+000,-1.1090000e-001,-2.0504000e+000,-1.2071000e+000,-2.4366000e+000,-3.8301000e+000,4.1004000e+000,-4.1967000e+000,-9.3300000e-002,-1.1239000e+000,2.9965000e+000,9.2280000e-001,-1.4416000e+000,-9.5000000e-003,-3.3145000e+000,3.4467000e+000,-9.8670000e-001,2.5468000e+000,3.5220000e-001,3.4812000e+000\
+,-2.4369000e+000,-2.3015000e+000,1.1539000e+000,1.5121000e+000,-2.4170000e-001,2.0860000e-001,-1.4078000e+000,3.2117000e+000,1.6180000e-001,-2.6028000e+000,-5.2800000e-001,-1.7132000e+000,-4.4610000e-001,-3.6180000e+000,2.4411000e+000,1.5120000e+000,6.9000000e-002,-1.1301000e+000,-1.9017000e+000,-2.7420000e+000,-2.6130000e+000,-3.4728000e+000,2.7522000e+000,4.2066000e+000,-6.1710000e-001,-2.9867000e+000,3.4816000e+000,3.3065000e+000,-3.1555000e+000,2.4976000e+000,1.4455000e+000,-1.9139000e+000,-2.4110000e+000,-3.8611000e+000,4.4870000e+000,3.3597000e+000,1.9467000e+000,-3.0049000e+000,2.6107000e+000,3.7599000e+000,-2.3608000e+000,-4.4512000e+000,-2.7817000e+000,3.6587000e+000,-4.3400000e-002,1.3095000e+000,-4.2113000e+000,1.0998000e+000,1.5087000e+000,1.8103000e+000,2.7342000e+000,1.6980000e-001,4.1891000e+000,-4.2409000e+000,4.1945000e+000,-1.4423000e+000,2.0603000e+000,3.5687000e+000,2.2500000e-001,2.9241000e+000,3.2211000e+000,2.0296000e+000,2.3792000e+000,2.8299000e+000,1.1196000e+000,3.5512000e+000,-3.7236000e+000,4.1712000e+000,-7.0130000e-001,-3.1732000e+000,3.7700000e-002,4.3992000e+000,-1.0487000e+000,-3.2646000e+000,4.1234000e+000,-1.2268000e+000,3.0004000e+000,2.2522000e+000,-1.4002000e+000,2.0008000e+000,2.7492000e+000,3.2190000e-001,2.3040000e-001,1.9087000e+000,6.6720000e-001,9.4710000e-001,-1.3234000e+000,-1.1531000e+000,1.6404000e+000,3.7113000e+000,1.1623000e+000,2.8554000e+000,-2.2389000e+000,-1.4650000e+000,-3.6592000e+000,-7.3410000e-001,-2.8758000e+000,2.2515000e+000,1.5780000e+000,-3.7634000e+000};
+
+const double f15_cec_bias = 120; // presumably the global offset of F15; verify against hybrid_composition()
+// Per-function scaling: hybrid_composition() divides the shifted input of function i by job_lambda[i]
+const double job_lambda[10] = {1.0,1.0,10.0,10.0,5.0/60.0,5.0/60.0,5.0/32.0,5.0/32.0,5.0/100.0,5.0/100.0};
+// Per-function shift vectors, filled from f15_const by prepare_f15()
+Eigen::VectorXd f15_o[10];
+// Per-function normalisation: |Job15(j, x)| with every x(i) = 5/job_lambda[j], filled by prepare_f15()
+double f15_max[10];
+double bias[10] = {0,100,200,300,400,500,600,700,800,900}; // per-function offset added in hybrid_composition()
+
+#endif
diff --git a/example/Numerics/PS-CMA-ES/f15_cec_fun.hpp b/example/Numerics/PS-CMA-ES/f15_cec_fun.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5cd0578b9374ead42c1864cb892fe95fa38b5b2e
--- /dev/null
+++ b/example/Numerics/PS-CMA-ES/f15_cec_fun.hpp
@@ -0,0 +1,189 @@
+/*
+ * f15_cec_fun.hpp
+ *
+ *  Created on: Jan 14, 2018
+ *      Author: i-bird
+ */
+
+#ifndef EXAMPLE_NUMERICS_PS_CMA_ES_F15_CEC_FUN_HPP_
+#define EXAMPLE_NUMERICS_PS_CMA_ES_F15_CEC_FUN_HPP_
+
+#include "f15_cec_const.hpp"
+#include <limits>
+#include <math.h>
+
+template<unsigned int dim>
+void Job15(int funcnr,Eigen::VectorXd & vars,double & res)
+{
+      // Evaluate basic function number funcnr of the F15 composition at vars and store
+      // the value in res: funcnr < 2 rastrigin, < 4 weierstrass, < 6 griewank, < 8 ackley,
+      // <= 10 sphere. For funcnr > 10 res is left untouched.
+      // weierstrass constants (unsigned Kmax avoids a signed/unsigned loop comparison)
+      const unsigned int Kmax = 20;
+      const double a_c = 0.5;
+      const double b_c = 3.0;
+
+
+      if (funcnr < 2)
+      {
+        // rastrigin
+         double sum = 10.0 * dim;
+         for (size_t i = 0 ; i < dim ; i++)
+         {
+            sum += vars(i)*vars(i);
+            sum -= 10.0*cos(2*M_PI*vars(i));
+         }
+
+         res = sum;
+      }
+      else if (funcnr < 4)
+      {
+        // weierstrass
+         double sum1 = 0.0;
+         double sum2 = 0.0;
+         double a_k = 1.0;
+         double b_k = 1.0;
+         for (size_t i = 0 ; i < dim ; i++)
+         {
+            a_k = 1.0;
+            b_k = 1.0;
+            for (unsigned int j = 0 ; j <= Kmax ; j++, a_k *= a_c,b_k *= b_c)
+            {
+                sum1 = sum1 + a_k * cos((M_PI)*2.0 * b_k * (vars(i)+0.5));
+            }
+         }
+         a_k = 1.0;
+         b_k = 1.0;
+         for (unsigned int j = 0 ; j <= Kmax ; j++, a_k *= a_c, b_k *= b_c)
+         {
+            sum2 = sum2 + a_k * cos((M_PI)*2.0 * b_k * (0.5));
+         }
+         res = sum1 - sum2*dim;
+      }
+      else if (funcnr < 6)
+      {
+         // griewank
+         double prod = 1;
+         double sum = 0.0;
+         for (size_t i = 1 ; i <= dim ; i++)
+         {
+            sum = sum + (vars(i-1)*vars(i-1))/4000.0;
+            prod = prod * cos(vars(i-1)/(sqrt(double(i))));
+         }
+         res = sum-prod+1;
+      }
+      else if (funcnr < 8)
+      {
+        // ackley
+        double e1 = 0.0;
+        double e2 = 0.0;
+        for (size_t i = 0 ; i < dim ; i++)
+        {
+            e1 = e1 + vars(i)*vars(i);
+            e2 = e2 + cos(2.0*M_PI*vars(i));
+        }
+        res = exp(1.0) + 20.0 - 20*exp(-0.2*sqrt(e1/dim));
+        res = res - exp(e2/dim);
+      }
+      else if (funcnr <= 10)
+      {
+        // sphere
+        // (the 1x1 Eigen product converts to a scalar; it runs over the whole of vars)
+        res = vars.transpose() * vars;
+      }
+}
+
+template<unsigned int dim>
+double hybrid_composition(Eigen::VectorXd & vars)
+{
+	// F15 "Hybrid Composition": weighted sum of the 10 basic functions of Job15.
+	// Returns +infinity when one of the first dim entries of vars is outside [-5,5].
+	// Reads f15_o and f15_max, which prepare_f15<dim>() fills.
+
+	//local used vars
+	double t_res;
+	Eigen::VectorXd job_z[10];
+
+	for (size_t i = 0 ; i < 10 ; i++)
+	{job_z[i].resize(dim);}
+
+	double job_w[10];
+	double res = 0.0;
+
+	for (size_t i = 0 ; i < dim ; i++)
+	{
+		if (vars(i) < -5.0 || vars(i) > 5.0)
+		{return std::numeric_limits<double>::infinity();}
+	}
+
+	// get the raw weights
+	double wMax = - std::numeric_limits<double>::max();
+	for (size_t i = 0; i < 10 ; i++)
+	{
+		double sumSqr = 0.0;
+		//Shift the Input
+		job_z[i] = vars - f15_o[i];
+		sumSqr += (job_z[i].transpose() * job_z[i]);
+
+		job_w[i] = exp(-1.0 * sumSqr / (2.0 * dim));
+
+		if (wMax < job_w[i])
+		{wMax = job_w[i];}
+	}
+
+	// Modify the weights: every weight except the largest is scaled by 1 - wMax^10
+	double wSum = 0.0;
+
+	const double w1mMaxPow = 1.0 - wMax*wMax*wMax*wMax*wMax*wMax*wMax*wMax*wMax*wMax;
+	for (size_t i = 0; i < 10 ; i++)
+	{
+		if (job_w[i] != wMax)
+		{job_w[i] = job_w[i]* w1mMaxPow;}
+
+		wSum = wSum + job_w[i];
+	}
+
+	// Normalize the weights
+	for (size_t i = 0; i < 10 ; i++)
+	{job_w[i] /= wSum;}
+
+	double sumF = 0.0;
+
+	for (size_t i = 0; i < 10 ; i++)
+	{
+		job_z[i] = job_z[i] / job_lambda[i];
+
+		//calling the basic functions
+
+		Job15<dim>(i,job_z[i],t_res);
+
+		sumF = sumF + job_w[i] * (2000.0*t_res/f15_max[i] + bias[i]);
+	}
+
+	res = sumF + f15_cec_bias;
+
+	return res;
+}
+
+template<unsigned int dim>
+void prepare_f15()
+{
+	// Fill the shift vectors f15_o and the normalisation constants f15_max
+	for (size_t fun = 0 ; fun != 10 ; ++fun)
+	{
+		Eigen::VectorXd corner(dim);
+		Eigen::VectorXd & shift = f15_o[fun];
+		shift.resize(dim);
+		for (size_t c = 0 ; c != dim ; ++c)
+		{
+			shift(c) = f15_const[fun][c];
+			corner(c) = 5.0 / job_lambda[fun];
+		}
+		// basic function evaluated where every coordinate equals 5/lambda
+		double value;
+		Job15<dim>(fun,corner,value);
+		f15_max[fun] = fabs(value);
+	}
+}
+
+#endif /* EXAMPLE_NUMERICS_PS_CMA_ES_F15_CEC_FUN_HPP_ */
diff --git a/example/Numerics/PS-CMA-ES/main.cpp b/example/Numerics/PS-CMA-ES/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9b5fca8efe1f748fd8460cc31007f0b0d72fe939
--- /dev/null
+++ b/example/Numerics/PS-CMA-ES/main.cpp
@@ -0,0 +1,1169 @@
+/*!
+ *
+ * \page PS_CMA_ES Particle swarm CMA-ES Evolution strategy
+ *
+ *
+ * [TOC]
+ *
+ * # Optimization {#Opti_cma_es}
+ *
+ *
+ * In this example we show how to code PS-CMA-ES. This is just a simple variation of
+ * CMA-ES, where you have multiple CMA-ES instances running. The best solution across them is
+ * used to produce a drift velocity toward that point.
+ *
+ * ## Introduction {#ps_cme_es}
+ *
+ * In this example we try to find the global optimum of a function. In particular we are
+ * using the function F15 from the CEC 2005 benchmark test, to validate that PS-CMA-ES works.
+ * This example contains multiple files:
+ *
+ * * f15_cec_const.hpp definitions of constants for the F15 function
+ * * f15_cec_fun.hpp the function itself
+ *
+ * The function is quite complicated; for reference please refer to the function
+ *  F15 "Hybrid Composition" in the CEC 2005 test. The function can be called with
+ *  hybrid_composition<dim>(x) where dim is the dimensionality and x is the point
+ *   where the function is evaluated. The dimensionality can go from 1 to 50.
+ *
+ * Considering to have a function \f$ f \f$ from \f$ \mathbb{R}^{dim} \f$ to \f$ \mathbb{R} \f$,
+ * the algorithm use a set of particles to find in parallel the global optimum of a function.
+ *  The algorithm rather than try to find the global optimum
+ * sampling point randomly in the space, it uses a set of particles each of them having a gaussian
+ * sampling distribution \f$ e^{\sigma \cdot x^tCx} \f$ with C a \f$ dim \cdot dim \f$ matrix.
+ * At each step for each particle p **lambda** points are sampled using the sampling
+ * distribution centered on the particle position. The covariance matrix and sigma are subsequently
+ * adjusted to favor sampling around the best sampled points. In order to do this the algorithm
+ * needs the eigen-value decomposition of \f$ C = B^{t}DB \f$ where \f$ D \f$ is a diagonal
+ * matrix and \f$ B \f$ is the matrix of the eigen-vectors. In order to reduce or increase
+ * the sampling area the sigma is used instead. The algorithm uses the vector **path_s** to
+ * detect stagnation of the particle movement, and uses **path_c** (a transformed version of **path_s**)
+ * to refine the sampling covariance matrix from the fact that the particle is "moving" toward that
+ * direction. PS-CMA-ES is just a variation in which every **N_pso** CMA-ES steps the CMA-ES
+ * sampling distribution and position are biased toward the best point found across all independent
+ * CMA-ES instances.
+ *
+ * Explaining the CMA-ES algorithm in detail is beyond the scope of this tutorial example.
+ * We will briefly go through the main steps. For a full reference of the CMA-ES
+ * algorithm please refer to <a href="https://arxiv.org/abs/1604.00772">this paper</a>.
+ * For PS-CMA-ES refer to <a href="http://mosaic.mpi-cbg.de/docs/Mueller2009a.pdf">this paper</a>.
+ *
+ *
+ * ## Inclusions {#inclusions_and_constants}
+ *
+ * In this example we use a set of particles so we will use **vector_dist**, we will use
+ * Eigen dense matrix. Because the standard dense matrix are not compatible with
+ * the vector_dist we will use **EMatrix** that are simple wrapper to Eigen::Matrix
+ * but compatible with vector_dist. Because EMatrix are compatible with all the
+ * Eigen value functions we can use them in all Eigen functions. For CMA-ES algorithm
+ * we also need Eigen-value eigen-vector decomposition and Jacobi-Rotation for the
+ * particle-swarm part.
+ *
+ * \snippet Numerics/PS-CMA-ES/main.cpp ps_cma_es_inclusion
+ *
+ * PS-CMA-ES require several properties to be stored on the particles, some has been already
+ * explained. Here we explain the others.
+ *
+ * * **Zeta** contains the lambda sampled points (before applying the covariance matrix transformation)
+ *        so it contains points sampled from a gaussian of sigma=1 centered in zero
+ *
+ * * **ord** Contains the sequence to follow if we want the lambda generated points in order from the best to
+ *       the worst
+ *
+ * * **stop** Is a flag that indicates that the CMA-ES reached some stop criterion
+ *
+ * * **fithist** It contains historical information about the particles to penalize them in case they
+ *       go out of the boundary. It is 1:1 taken from cmaes.m (production version)
+ *       <a href="https://www.lri.fr/~hansen/cmaes_inmatlab.html">this paper</a> (or Google it)
+ *
+ * * **weight** Same concept of fithist other information to penalize particles going out of
+ *       the boundary
+ *
+ * * **validfit** Same concept of fithist other information to penalize particles going out of
+ *       the boundary
+ *
+ * * **xold** It contain the previous position of the particles used in several calculations
+ *
+ * * **last_restart** When the CMA-ES sigma becomes very small the CMA-ES has converged. At this point
+ *       we can do two things, one is to stop the CMA-ES, the other is to restart it to explore
+ *       the space better. In case it restarts, this parameter indicates at which iteration the
+ *       last restart happened.
+ *
+ * * **iniphase** Same concept of fithist other information to penalize particles going out of
+ *       the boundary
+ *
+ * * **xmean_st** This contains the new position of the particle; it will be stored as particle position
+ *       at the end of the CMA-ES step
+ *
+ * * **meanz_st** This contains the new position of the particle in a space where we do not apply the
+ *       covariance transformation. (In practice it is a weighted sum of the Zeta sample points)
+ *
+ * \snippet Numerics/PS-CMA-ES/main.cpp def_part_set
+ *
+ * ## Parameters {#ps_cma_par}
+ *
+ * CMA-ES and further PS-CMA-ES require some parameters in order to work. refers to the
+ * papers above to have a full explanation, but here is a short one
+ *
+ * * **dim** Dimensionality of the test function
+ *
+ * * **lambda** number of sample points taken at each iteration by CMA-ES
+ *              suggested to use \f$ 4+floor(3*log(dim)) \f$
+ *
+ * * **mu** only mu best points are considered to adapt the Covariant matrix
+ *
+ * * **psoWeight** How much the pso step bias the particle positions
+ *
+ * * **N_pso** Number of CMA-ES step before do a PSO step (200 give the possibility
+ *   to the CMA-ES to explore the neighborhood and narrow down at least a funnel)
+ *
+ * * **stopTolX** stop criterion for CMA-ES. When the sampling area is small enough
+ *   stop
+ *
+ * * **stopTolUpX** stop criterion if the sampling area becomes too big
+ *
+ * * **restart_cma** If the CMA-ES reach a stop criteria reinitialize and restart
+ *
+ * * **hist_size** size of the array fit_hist (default should be mainly fine)
+ *
+ */
+
+//! [ps_cma_es_inclusion]
+
+#define EIGEN_USE_LAPACKE
+#include "Vector/vector_dist.hpp"
+#include "DMatrix/EMatrix.hpp"
+#include <Eigen/Eigenvalues>
+#include <Eigen/Jacobi>
+#include <limits>
+#include "Vector/vector_dist.hpp"
+#include <f15_cec_fun.hpp>
+#include <boost/math/special_functions/sign.hpp>
+
+//! [ps_cma_es_inclusion]
+
+//! [parameters]
+
+// PARAMETERS
+constexpr int dim = 10; // dimensionality of the test function
+// when you set dim set also lambda to 4+std::floor(3*log(dim))
+constexpr int lambda = 7; // NOTE(review): with dim = 10 and the natural log the formula above gives 10, not 7 - confirm
+constexpr int mu = lambda/2; // only the mu best sampled points adapt the covariance matrix
+double psoWeight = 0.7; // how much the pso step biases the particle positions
+// number of cma-step before pso step
+int N_pso = 200;
+double stopTolX = 2e-11; // stop criterion: sampling area small enough
+double stopTolUpX = 2000.0; // stop criterion: sampling area too big
+int restart_cma = 1; // on a stop criterion reinitialize and restart
+size_t max_fun_eval = 30000000;
+constexpr int hist_size = 21; // size of the fithist array
+
+// Convenient global variables (Their value is set after)
+double mu_eff = 1.0; // overwritten by init_weight()
+double cs = 1.0;
+double cc = 1.0;
+double ccov = 1.0;
+double chiN;
+double d_amps = 1.0;
+double stop_fitness = 1.0;
+int eigeneval = 0;
+double t_c = 0.1; // threshold factor read by updatePso()
+double b = 0.1;
+
+//! [parameters]
+
+//! [def_part_set]
+
+//////////// definitions of the particle set
+
+constexpr int sigma = 0;
+constexpr int Cov_m = 1;
+constexpr int B = 2;
+constexpr int D = 3;
+constexpr int Zeta = 4;
+constexpr int path_s = 5;
+constexpr int path_c = 6;
+constexpr int ord = 7;
+constexpr int stop = 8;
+constexpr int fithist = 9;
+constexpr int weight = 10;
+constexpr int validfit = 11;
+constexpr int xold = 12;
+constexpr int last_restart = 13;
+constexpr int iniphase = 14;
+constexpr int xmean_st = 15;
+constexpr int meanz_st = 16;
+// The k-th type of the aggregate below pairs with the constant of value k (presumably the property index; TODO confirm)
+typedef vector_dist<dim,double, aggregate<double, // sigma
+										 Eigen::MatrixXd, // Cov_m
+										 Eigen::MatrixXd, // B
+										 Eigen::DiagonalMatrix<double,Eigen::Dynamic>, // D
+										 Eigen::VectorXd[lambda], // Zeta
+										 Eigen::VectorXd, // path_s
+										 Eigen::VectorXd, // path_c
+										 int[lambda], // ord
+										 int, // stop
+										 double [hist_size], // fithist
+										 double [dim], // weight
+										 double, // validfit
+										 Eigen::VectorXd, // xold
+										 int, // last_restart
+										 bool, // iniphase
+										 Eigen::VectorXd, // xmean_st
+										 Eigen::VectorXd> > particle_type; // meanz_st
+
+//! [def_part_set]
+
+
+double generateGaussianNoise(double mu, double sigma)
+{
+	// Box-Muller transform: every second call draws two uniforms and produces two
+	// normal samples; the second one (z1) is cached and returned by the next call.
+	static const double epsilon = std::numeric_limits<double>::min();
+	static const double two_pi = 2.0*3.14159265358979323846;
+	thread_local double z1 = 0.0;
+	thread_local bool generate = false;
+	generate = !generate;
+	if (!generate)
+	{return z1 * sigma + mu;}
+
+	double u1, u2;
+	do
+	{
+	   u1 = rand() * (1.0 / RAND_MAX);
+	   u2 = rand() * (1.0 / RAND_MAX);
+	}
+	// u2 is the argument of log(): reject 0 so the radius below stays finite
+	while ( u2 <= epsilon );
+	const double radius = sqrt(-2.0 * log(u2));
+	const double z0 = radius * cos(two_pi * u1);
+	z1 = radius * sin(two_pi * u1);
+	return z0 * sigma + mu;
+}
+
+template<unsigned int dim>
+Eigen::VectorXd generateGaussianVector()
+{
+	// dim values, each drawn with generateGaussianNoise(0,1)
+	Eigen::VectorXd sample(dim);
+	unsigned int k = 0;
+	while (k != dim)
+	{
+		sample(k) = generateGaussianNoise(0,1);
+		++k;
+	}
+	return sample;
+}
+
+template<unsigned int dim>
+void fill_vector(double (& f)[dim], Eigen::VectorXd & ev)
+{
+	// copy the dim entries of the C array into the leading entries of ev
+	for (size_t k = 0 ; k != dim ; ++k) {ev(k) = f[k];}
+}
+
+void fill_vector(const double * f, Eigen::VectorXd & ev)
+{
+	// copy ev.size() values starting at f into ev (f must provide at least that many)
+	for (size_t k = 0 ; k < ev.size() ; ++k) {ev(k) = f[k];}
+}
+
+//! Pairs a function value f with an integer id so that sorting by value keeps track of the id
+struct fun_index
+{
+	double f;	//!< function value (the sort key)
+	int id;		//!< identifier carried along with the value
+
+	//! Order by function value only; const so that const objects and std::less can use it
+	bool operator<(const fun_index & tmp) const
+	{return f < tmp.f;}
+};
+
+//! Weights of the mu best samples, filled by init_weight()
+double wm[mu];
+
+//! Fill wm with log(mu+1)-log(i+1) normalised to unit sum and derive mu_eff from it
+void init_weight()
+{
+	// raw (un-normalised) weights and their total
+	double total = 0.0;
+	for (int k = 0 ; k < mu ; ++k)
+	{
+		wm[k] = log(double(mu)+1.0) - log(double(k)+1.0);
+		total += wm[k];
+	}
+
+	// normalise, accumulating the sum and the sum of squares on the way
+	double s1 = 0.0;
+	double s2 = 0.0;
+	for (int k = 0 ; k < mu ; ++k)
+	{
+		wm[k] /= total;
+		s1 += wm[k];
+		s2 += wm[k]*wm[k];
+	}
+	// also set mu_eff = (sum of weights)^2 / (sum of squared weights)
+	mu_eff = s1*s1/s2;
+}
+//! Return entry i of the weight table wm (no range check is done on i)
+double weight_sample(int i)
+{
+	return wm[i];
+}
+//! NOTE(review): appears to build in R the rotation that turns the direction of S into the
+//! direction of T, as a product of Givens rotations; R is expected to be dim x dim on entry
+void create_rotmat(Eigen::VectorXd & S,Eigen::VectorXd & T, Eigen::MatrixXd & R)
+{
+	Eigen::VectorXd S_work(dim);
+	Eigen::VectorXd T_work(dim);
+	Eigen::VectorXd S_sup(dim);
+	Eigen::VectorXd T_sup(dim);
+	// NOTE(review): S_sup, T_sup, R_sup, G_S, G_C and i are declared here but never used
+	Eigen::MatrixXd R_tar(dim,dim);
+	Eigen::MatrixXd R_tmp(dim,dim);
+	Eigen::MatrixXd R_sup(dim,dim);
+	double G_S,G_C;
+	Eigen::MatrixXd S_tmp(2,2);
+	Eigen::MatrixXd T_tmp(2,2);
+	int p,q,i;
+	// work on copies so that S and T themselves are not modified
+	S_work = S;
+	T_work = T;
+	// R accumulates the rotations applied to S_work, R_tar the ones applied to T_work
+	R.setIdentity();
+	R_tar = R;
+	R_tmp = R;
+	// visit every coordinate pair (p,q) with q > p, starting from the last coordinates
+	for (p = dim - 2; p >= 0 ; p -= 1)
+	{
+
+		for (q = dim - 1 ; q >= p+1 ; q-= 1)
+		{
+			T_tmp(0) = T_work(p);
+			T_tmp(1) = T_work(q);
+			S_tmp(0) = S_work(p);
+			S_tmp(1) = S_work(q);
+			// (only the first two coefficients of the 2x2 temporaries are used)
+			// Perform Givens Rotation on start vector
+
+			Eigen::JacobiRotation<double> G;
+			double z;
+			G.makeGivens(S_tmp(0), S_tmp(1),&z);
+
+			// Check direction of rotation
+			double sign = 1.0;
+			if (z < 0.0)
+			{sign = -1.0;}
+
+			// Build a Rotation Matrix out of G.c() and G.s(), flipped when z is negative
+			R_tmp.setIdentity();
+			R_tmp(p,p) = sign*G.c();
+			R_tmp(q,q) = sign*G.c();
+			R_tmp(p,q) = sign*-G.s();
+			R_tmp(q,p) = sign*G.s();
+
+			// Rotate start vector and update R
+			// S_work = R_tmp*S_work
+
+			S_work = R_tmp*S_work;
+			// R = R_tmp*R
+			R = R_tmp*R;
+
+			// Perform Givens Rotation on target vector
+
+			G.makeGivens(T_tmp(0), T_tmp(1),&z);
+
+			sign = 1.0;
+			if (z < 0.0)
+			{sign = -1.0;}
+
+			R_tmp.setIdentity();
+			R_tmp(p,p) = sign*G.c();
+			R_tmp(q,q) = sign*G.c();
+			R_tmp(p,q) = sign*-G.s();
+			R_tmp(q,p) = sign*G.s();
+
+			// Rotate target vector and update R_tar
+
+			T_work = R_tmp*T_work;
+			R_tar = R_tmp*R_tar;
+		}
+	}
+	// combine: the rotations accumulated for S followed by the transpose of those accumulated for T
+	R = R_tar.transpose()*R;
+
+	// Check the rotation
+	// NOTE(review): the product below is computed but its result is never inspected
+	Eigen::VectorXd Check(dim);
+	Check = R*S;
+}
+
+// Pull the CMA mean toward the swarm best solution and, when psoWeight < 1,
+// build in C_pso a covariance whose axes are B rotated by create_rotmat.
+// xmean is updated in place; C_pso is written only when psoWeight < 1.
+void updatePso(openfpm::vector<double> & best_sol,
+			   double sigma,
+			   Eigen::VectorXd & xmean,
+			   Eigen::VectorXd & xold,
+			   Eigen::MatrixXd & B,
+			   Eigen::DiagonalMatrix<double,Eigen::Dynamic> & D,
+			   Eigen::MatrixXd & C_pso)
+{
+	// swarm best as an Eigen vector, and its offset from the current mean
+	Eigen::VectorXd best_sol_ei(dim);
+	fill_vector(&best_sol.get(0),best_sol_ei);
+	Eigen::VectorXd gb_vec = best_sol_ei-xmean;
+	double gb_vec_length = sqrt(gb_vec.transpose() * gb_vec);
+
+	// last column of B, used below as the start vector of the rotation
+	Eigen::VectorXd b_main = B.col(dim-1);
+
+	// the bias stays zero unless the best lies farther than sigma from the mean
+	Eigen::VectorXd bias(dim);
+	bias.setZero();
+	if (gb_vec_length > 0.0 && sigma < gb_vec_length)
+	{
+		if (sigma/gb_vec_length <= t_c*gb_vec_length)
+		{bias = 0.5*gb_vec;}
+		else
+		{bias = sigma*gb_vec/gb_vec_length;}
+	}
+
+	xmean = xmean + bias;
+
+	if (!(psoWeight < 1.0))
+	{return;}
+
+	// Rotation Matrix
+	Eigen::MatrixXd R(dim,dim);
+	Eigen::VectorXd gb_vec_old = best_sol_ei - xold;
+	create_rotmat(b_main,gb_vec_old,R);
+
+	Eigen::MatrixXd B_rot(dim,dim);
+	Eigen::DiagonalMatrix<double,Eigen::Dynamic> D_square(dim);
+	for (size_t k = 0 ; k < dim ; k++)
+	{
+		B_rot.col(k) = R*B.col(k);
+		D_square.diagonal()[k] = D.diagonal()[k] * D.diagonal()[k];
+	}
+	C_pso = B_rot * D_square * B_rot.transpose();
+
+	// enforce symmetry: mirror the upper triangle onto the lower one
+	Eigen::MatrixXd trUp = C_pso.triangularView<Eigen::Upper>();
+	Eigen::MatrixXd trDw = C_pso.triangularView<Eigen::StrictlyUpper>();
+	C_pso = trUp + trDw.transpose();
+}
+
+// Agree across processors on the best objective value and its location.
+//
+// best_sample / best_sample_sol are this processor's candidate; best and
+// best_sol are overwritten only when the global minimum of best_sample is not
+// worse than best. Ties between owners go to the lowest processor id.
+// vd is not read.
+void broadcast_best_solution(particle_type & vd,
+							 openfpm::vector<double> & best_sol,
+							 double & best,
+							 double best_sample,
+							 openfpm::vector<double> & best_sample_sol)
+{
+	best_sol.resize(dim);
+	auto & v_cl = create_vcluster();
+
+	// keep the local value, then reduce best_sample to the global minimum
+	const double local_best = best_sample;
+	v_cl.min(best_sample);
+	v_cl.execute();
+
+	// The old solution remain the best
+	if (best < best_sample)
+	{return;}
+
+	best = best_sample;
+
+	// processors holding the minimum propose their own id, all the others
+	// propose the largest size_t so that they can never win the reduction
+	const bool owner = (local_best == best_sample);
+	size_t rank = std::numeric_limits<size_t>::max();
+	if (owner)
+	{rank = v_cl.getProcessUnitID();}
+
+	v_cl.min(rank);
+	v_cl.execute();
+
+	// the elected owner stages its sample as the solution to send
+	if (owner && rank == v_cl.getProcessUnitID())
+	{
+		for (size_t k = 0 ; k < dim ; k++)
+		{best_sol.get(k) = best_sample_sol.get(k);}
+	}
+
+	// now we broad cast the best solution across processors
+
+	v_cl.Bcast(best_sol,rank);
+	v_cl.execute();
+}
+
+// Linearly interpolated percentiles of the first lambda values of f_obj:
+// sorted sample j (0-based) sits at percentile 100*(j+0.5)/lambda, requests
+// outside that range clamp to the extreme samples. perc holds the two
+// requests (0-100 scale) and res[i] receives the value for perc[i].
+void cmaes_myprctile(openfpm::vector<fun_index> & f_obj, double (& perc)[2], double (& res)[2])
+{
+	double sar[lambda];
+	double availablepercentiles[lambda];
+
+	// the percentile of each sorted sample does not depend on the request
+	for (size_t j = 0 ; j < lambda ; j++)
+	{
+		availablepercentiles[j] = 100.0 * ((double(j)+1.0)-0.5) / lambda;
+		sar[j] = f_obj.get(j).f;
+	}
+	std::sort(&sar[0],&sar[lambda]);
+
+	for (size_t i = 0 ; i < 2 ; i++)
+	{
+		if (perc[i] <= (100.0*0.5/lambda))
+		{res[i] = sar[0];}
+		else if (perc[i] >= (100.0*(lambda-0.5)/lambda) )
+		{res[i] = sar[lambda-1];}
+		else
+		{
+			// k = last sample strictly below the request, k+1 brackets it
+			int k = 0;
+			while (k < lambda && !(availablepercentiles[k] >= perc[i])) {k++;}
+			k-=1;
+			res[i] = sar[k] + (sar[k+1]-sar[k]) * (perc[i]
+							-availablepercentiles[k]) / (availablepercentiles[k+1] - availablepercentiles[k]);
+		}
+	}
+}
+
+// Largest buf[i] among the entries with mask[i] set, never less than 0.0
+double maxval(double (& buf)[hist_size], bool (& mask)[hist_size])
+{
+	double largest = 0.0;
+	for (size_t k = 0 ; k < hist_size ; k++)
+	{
+		if (!mask[k]) {continue;}
+		largest = (buf[k] > largest)?buf[k]:largest;
+	}
+	return largest;
+}
+
+// Smallest buf[i] with mask[i] set, or the largest finite double if none is
+double minval(double (& buf)[hist_size], bool (& mask)[hist_size])
+{
+	double smallest = std::numeric_limits<double>::max();
+	for (size_t k = 0 ; k < hist_size ; k++)
+	{
+		if (!mask[k]) {continue;}
+		smallest = (buf[k] < smallest)?buf[k]:smallest;
+	}
+	return smallest;
+}
+
+// Clamp every component of x to [-5,5] into xout; idx[i] records whether
+// component i had to be clamped and idx_any whether any component was
+void cmaes_intobounds(Eigen::VectorXd & x, Eigen::VectorXd & xout,bool (& idx)[dim], bool & idx_any)
+{
+	idx_any = false;
+	for (size_t c = 0; c < dim ; c++)
+	{
+		const double v = x(c);
+		const bool below = v < -5.0;
+		const bool above = !below && v > 5.0;
+
+		if (below)
+		{xout(c) = -5.0;}
+		else if (above)
+		{xout(c) = 5.0;}
+		else
+		{xout(c) = v;}
+
+		// flag the component when it was moved onto a bound
+		idx[c] = below || above;
+		if (idx[c])
+		{idx_any = true;}
+	}
+}
+
+void cmaes_handlebounds(openfpm::vector<fun_index> & f_obj,	// sample fitness, penalties are added in place
+						double sigma,	// step size
+						double & validfit,	// 0.0 until the first usable fitness range, then set to 1
+						Eigen::VectorXd (& arxvalid)[lambda],	// samples clamped to the bounds
+						Eigen::VectorXd (& arx)[lambda],	// samples as drawn
+						Eigen::MatrixXd & C,	// covariance matrix (only read)
+						Eigen::VectorXd & xmean,	// current mean (only read)
+						Eigen::VectorXd & xold,	// previous mean (only read)
+						double (& weight)[dim],	// per-coordinate penalty weights, updated
+						double (& fithist)[hist_size],	// history of fitness ranges, negative = free slot
+						bool & iniphase,	// true while the weights are still being initialized
+						double & validfitval,	// compared with 1.0 to leave the initial phase
+						double mu_eff,
+						int step,
+						int last_restart)
+{
+	double val[2];
+	double value;
+	double diag[dim];
+	double meandiag;
+	int i,k,maxI;
+	bool mask[hist_size];
+	bool idx[dim];
+	Eigen::VectorXd tx(dim);
+	int dfitidx[hist_size];
+	double dfitsort[hist_size];
+	double prct[2] = {25.0,75.0};
+	bool idx_any;
+	// a history slot takes part in maxval/minval only when it is positive
+	for (size_t i = 0 ; i < hist_size ; i++)
+	{
+		dfitsort[i] = 0.0;
+		dfitidx[i] = 0;
+
+		if (fithist[i] > 0.0)
+		{mask[i] = true;}
+		else
+		{mask[i] = false;}
+	}
+
+	for (size_t i = 0 ; i < dim ; i++)
+	{diag[i] = C(i,i);}
+
+	maxI = 0;
+
+	meandiag = C.trace()/dim;
+	// interquartile range of the fitness, scaled by dim, mean variance and sigma^2
+	cmaes_myprctile(f_obj, prct, val);
+	value = (val[1] - val[0]) / dim / meandiag / (sigma*sigma);
+	// NOTE(review): only +inf/DBL_MAX is caught as non-finite here, NaN is not
+	if (value >= std::numeric_limits<double>::max())
+	{
+		auto & v_cl = create_vcluster();
+		std::cout << "Process " << v_cl.rank() << " warning: Non-finite fitness range" << std::endl;
+		value = maxval(fithist,mask);
+	}
+	else if(value == 0.0)
+	{
+		value = minval(fithist,mask);
+	}
+	else if (validfit == 0.0)
+	{
+		for (size_t i = 0 ; i < hist_size ; i++)
+		{fithist[i] = -1.0;}
+		validfit = 1;
+	}
+	// store value in the first free (negative) slot, or shift a full history left
+	for (size_t i = 0; i < hist_size ; i++)
+	{
+		if(fithist[i] < 0.0)
+		{
+			fithist[i] = value;
+			maxI = i;
+			break;
+		}
+		else if(i == hist_size-1)
+		{
+			for (size_t k = 0 ; k < hist_size-1 ; k++)
+			{fithist[k] = fithist[k+1];}
+			fithist[i] = value;
+			maxI = i;
+		}
+	}
+
+	cmaes_intobounds(xmean,tx,idx,idx_any);
+	// initial phase: the weights are set from the median of the stored ranges
+	if (iniphase)
+	{
+		if (idx_any)
+		{
+			if(maxI == 0)
+			{value = fithist[0];}
+			else
+			{
+				openfpm::vector<fun_index> fitsort(maxI+1);
+				for (size_t i = 0 ; i <= maxI; i++)
+				{
+					fitsort.get(i).f = fithist[i];
+					fitsort.get(i).id = i;
+				}
+
+				fitsort.sort();
+				for (size_t k = 0; k <= maxI ; k++)
+				{fitsort.get(k).f = fithist[fitsort.get(k).id];}
+
+				if ((maxI+1) % 2 == 0)
+				{value = (fitsort.get(maxI/2).f+fitsort.get(maxI/2+1).f)/2.0;}
+				else
+				{value = fitsort.get(maxI/2).f;}
+			}
+			for (size_t i = 0 ; i < dim ; i++)
+			{
+				diag[i] = diag[i]/meandiag;
+				weight[i] = 2.0002 * value / diag[i];
+			}
+			if (validfitval == 1.0 && step-last_restart > 2)
+			{
+				iniphase = false;
+			}
+		}
+	}
+	// raise the weight of clamped coordinates whose mean is far out and still moving outward
+	if(idx_any)
+	{
+		tx = xmean - tx;
+		for(size_t i = 0 ; i < dim ; i++)
+		{
+			idx[i] = (idx[i] && (fabs(tx(i)) > 3.0*std::max(1.0,sqrt(dim)/mu_eff) * sigma * sqrt(diag[i])));
+			idx[i] = (idx[i] && (std::copysign(1.0,tx(i)) == std::copysign(1.0,(xmean(i)-xold(i)))) );
+		}
+		for (size_t i = 0 ; i < dim ; i++)
+		{
+			if (idx[i] == true)
+			{
+				weight[i] = pow(1.2,(std::max(1.0,mu_eff/10.0/dim)))*weight[i];
+			}
+		}
+	}
+	double arpenalty[lambda];	// weighted squared distance of each sample from its clamped copy
+	for (size_t i = 0 ; i < lambda ; i++)
+	{
+		arpenalty[i] = 0.0;
+		for (size_t j = 0 ; j < dim ; j++)
+		{
+			arpenalty[i] += weight[j] * (arxvalid[i](j) - arx[i](j))*(arxvalid[i](j) - arx[i](j));
+		}
+		f_obj.get(i).f += arpenalty[i];
+	}
+//	fitness%sel = fitness%raw + bnd%arpenalty;
+}
+
+// Cap sigma so that sigma*sqrt(C(i,i)) does not exceed 5.0 on any coordinate
+double adjust_sigma(double sigma, Eigen::MatrixXd & C)
+{
+	for (size_t c = 0 ; c < dim ; c++)
+	{
+		const double sd = sqrt(C(c,c));
+		sigma = (sigma*sd > 5.0)?5.0/sd:sigma;
+	}
+	return sigma;
+}
+
+
+// One PS-CMA-ES iteration for every non-stopped particle of vd: sample and
+// rank lambda points, share the best across processors, then adapt paths,
+// covariance and sigma. best/best_sol/fun_eval are updated; best_i is unused.
+void cma_step(particle_type & vd, int step,  double & best,
+			  int & best_i, openfpm::vector<double> & best_sol,
+			  size_t & fun_eval)
+{
+	size_t fe = 0;
+	Eigen::VectorXd xmean(dim);
+	Eigen::VectorXd mean_z(dim);
+	Eigen::VectorXd arxvalid[lambda];
+	Eigen::VectorXd arx[lambda];
+
+	for (size_t i = 0 ; i < lambda ; i++)
+	{
+		arx[i].resize(dim);
+		arxvalid[i].resize(dim);
+	}
+
+	double best_sample = std::numeric_limits<double>::max();
+	openfpm::vector<double> best_sample_sol(dim);
+
+	openfpm::vector<fun_index> f_obj(lambda);
+
+	int counteval = step*lambda;
+
+	auto it = vd.getDomainIterator();
+	while (it.isNext())
+	{
+		auto p = it.get();
+
+		if (vd.getProp<stop>(p) == true)
+		{++it;continue;}
+
+		// fill the mean vector;
+
+		fill_vector(vd.getPos(p),xmean);
+
+		for (size_t j = 0 ; j < lambda ; j++)
+		{
+			vd.getProp<Zeta>(p)[j] = generateGaussianVector<dim>();
+			arx[j] = xmean + vd.getProp<sigma>(p)*vd.getProp<B>(p)*vd.getProp<D>(p)*vd.getProp<Zeta>(p)[j];
+
+			// sample point has to be inside -5.0 and 5.0
+			for (size_t i = 0 ; i < dim ; i++)
+			{
+				if (arx[j](i) < -5.0)
+				{arxvalid[j](i) = -5.0;}
+				else if (arx[j](i) > 5.0)
+				{arxvalid[j](i) = 5.0;}
+				else
+				{arxvalid[j](i) = arx[j](i);}
+			}
+
+			f_obj.get(j).f = hybrid_composition<dim>(arxvalid[j]);
+			f_obj.get(j).id = j;
+			fe++;
+
+			// Get the best ever
+			if (f_obj.get(j).f < best_sample)
+			{
+				best_sample = f_obj.get(j).f;
+
+			    // Copy the new mean as position of the particle
+			    for (size_t i = 0 ; i < dim ; i++)
+			    {best_sample_sol.get(i) = arxvalid[j](i);}
+			}
+		}
+
+		// Add penalities for out of bound points
+		cmaes_handlebounds(f_obj,vd.getProp<sigma>(p),
+						   vd.getProp<validfit>(p),arxvalid,
+						   arx,vd.getProp<Cov_m>(p),
+						   xmean,vd.getProp<xold>(p),vd.getProp<weight>(p),
+						   vd.getProp<fithist>(p),vd.getProp<iniphase>(p),
+						   vd.getProp<validfit>(p),mu_eff,
+						   step,vd.getProp<last_restart>(p));
+
+		f_obj.sort();
+
+		for (size_t j = 0 ; j < lambda ; j++)
+		{vd.getProp<ord>(p)[j] = f_obj.get(j).id;}
+
+		vd.getProp<xold>(p) = xmean;
+
+		// Calculate weighted mean
+
+		xmean.setZero();
+		mean_z.setZero();
+		for (size_t j = 0 ; j < mu ; j++)
+		{
+			xmean += weight_sample(j)*arx[vd.getProp<ord>(p)[j]];
+			mean_z += weight_sample(j)*vd.getProp<Zeta>(p)[vd.getProp<ord>(p)[j]];
+		}
+
+		vd.getProp<xmean_st>(p) = xmean;
+		vd.getProp<meanz_st>(p) = mean_z;
+
+		++it;
+	}
+
+	// Find the best point across processors
+	broadcast_best_solution(vd,best_sol,best,best_sample,best_sample_sol);
+
+	// bool calculate B and D
+	bool calc_bd = counteval - eigeneval > lambda/(ccov)/dim/10;
+	if (calc_bd == true)
+	{eigeneval = counteval;}
+
+	auto it2 = vd.getDomainIterator();
+	while (it2.isNext())
+	{
+		auto p = it2.get();
+
+		if (vd.getProp<stop>(p) == true)
+		{++it2;continue;}
+
+		xmean = vd.getProp<xmean_st>(p);
+		mean_z = vd.getProp<meanz_st>(p);
+
+		vd.getProp<path_s>(p) = vd.getProp<path_s>(p)*(1.0 - cs) + sqrt(cs*(2.0-cs)*mu_eff)*vd.getProp<B>(p)*mean_z;
+
+		double hsig = vd.getProp<path_s>(p).norm()/sqrt(1.0-pow((1.0-cs),(2.0*double((step-vd.getProp<last_restart>(p))))))/chiN < 1.4 + 2.0/(dim+1);
+
+		vd.getProp<path_c>(p) = (1-cc)*vd.getProp<path_c>(p) + hsig * sqrt(cc*(2-cc)*mu_eff)*(vd.getProp<B>(p)*vd.getProp<D>(p)*mean_z);
+
+		if (step % N_pso == 0)
+		{
+			Eigen::MatrixXd C_pso(dim,dim);
+			updatePso(best_sol,vd.getProp<sigma>(p),xmean,vd.getProp<xold>(p),vd.getProp<B>(p),vd.getProp<D>(p),C_pso);
+
+			// Adapt covariance matrix C
+			vd.getProp<Cov_m>(p) = (1.0-ccov+(1.0-hsig)*ccov*cc*(2.0-cc)/mu_eff)*vd.getProp<Cov_m>(p) +
+									ccov*(1.0/mu_eff)*(vd.getProp<path_c>(p)*vd.getProp<path_c>(p).transpose());
+
+			for (size_t i = 0 ; i < mu ; i++)
+			{vd.getProp<Cov_m>(p) += ccov*(1.0-1.0/mu_eff)*(vd.getProp<B>(p)*vd.getProp<D>(p)*vd.getProp<Zeta>(p)[vd.getProp<ord>(p)[i]])*weight_sample(i)*
+										  (vd.getProp<B>(p)*vd.getProp<D>(p)*vd.getProp<Zeta>(p)[vd.getProp<ord>(p)[i]]).transpose();
+			}
+
+	    	vd.getProp<Cov_m>(p) = psoWeight*vd.getProp<Cov_m>(p) + (1.0 - psoWeight)*C_pso;
+	    }
+	    else
+	    {
+			// Adapt covariance matrix C
+			vd.getProp<Cov_m>(p) = (1.0-ccov+(1.0-hsig)*ccov*cc*(2.0-cc)/mu_eff)*vd.getProp<Cov_m>(p) +
+									ccov*(1.0/mu_eff)*(vd.getProp<path_c>(p)*vd.getProp<path_c>(p).transpose());
+
+			for (size_t i = 0 ; i < mu ; i++)
+			{vd.getProp<Cov_m>(p) += ccov*(1.0-1.0/mu_eff)*(vd.getProp<B>(p)*vd.getProp<D>(p)*vd.getProp<Zeta>(p)[vd.getProp<ord>(p)[i]])*weight_sample(i)*
+				                          (vd.getProp<B>(p)*vd.getProp<D>(p)*vd.getProp<Zeta>(p)[vd.getProp<ord>(p)[i]]).transpose();
+			}
+	    }
+
+		// Numeric error
+
+		double smaller = std::numeric_limits<double>::max();
+		for (size_t i = 0 ; i < dim ; i++)
+		{
+			if (vd.getProp<sigma>(p)*sqrt(vd.getProp<D>(p).diagonal()[i]) > 5.0)
+			{
+				if (smaller > 5.0/sqrt(vd.getProp<D>(p).diagonal()[i]))
+				{smaller = 5.0/sqrt(vd.getProp<D>(p).diagonal()[i]);}
+			}
+		}
+		if (smaller != std::numeric_limits<double>::max())
+		{vd.getProp<sigma>(p) = smaller;}
+
+		//Adapt step-size sigma
+		vd.getProp<sigma>(p) = vd.getProp<sigma>(p)*exp((cs/d_amps)*(vd.getProp<path_s>(p).norm()/chiN - 1));
+
+		// Update B and D from C
+
+		if (calc_bd)
+		{
+			Eigen::MatrixXd trUp = vd.getProp<Cov_m>(p).triangularView<Eigen::Upper>();
+			Eigen::MatrixXd trDw = vd.getProp<Cov_m>(p).triangularView<Eigen::StrictlyUpper>();
+			vd.getProp<Cov_m>(p) = trUp + trDw.transpose();
+
+			// Eigen decomposition
+			Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> eig_solver;
+
+			eig_solver.compute(vd.getProp<Cov_m>(p));
+
+			for (size_t i = 0 ; i < eig_solver.eigenvalues().size() ; i++)
+			{vd.getProp<D>(p).diagonal()[i] = sqrt(eig_solver.eigenvalues()[i]);}
+			vd.getProp<B>(p) = eig_solver.eigenvectors();
+
+			// Make first component always positive
+			for (size_t i = 0 ; i < dim ; i++)
+			{
+				if (vd.getProp<B>(p)(0,i) < 0)
+				{vd.getProp<B>(p).col(i) = - vd.getProp<B>(p).col(i);}
+			}
+
+			Eigen::MatrixXd tmp = vd.getProp<B>(p).transpose();
+		}
+
+	    // Copy the new mean as position of the particle
+	    for (size_t i = 0 ; i < dim ; i++)
+	    {vd.getPos(p)[i] = xmean(i);}
+
+	    vd.getProp<sigma>(p) = adjust_sigma(vd.getProp<sigma>(p),vd.getProp<Cov_m>(p));
+
+	    // Stop conditions
+	    bool stop_tol = true;
+	    bool stop_tolX = true;
+	    for (size_t i = 0 ; i < dim ; i++)
+	    {
+	    	stop_tol &= (vd.getProp<sigma>(p)*std::max(fabs(vd.getProp<path_c>(p)(i)),sqrt(vd.getProp<Cov_m>(p)(i,i)))) < stopTolX;
+	    	stop_tolX &= vd.getProp<sigma>(p)*sqrt(vd.getProp<D>(p).diagonal()[i]) > stopTolUpX;
+	    }
+
+	    vd.getProp<stop>(p) = stop_tol | stop_tolX;
+
+	    // Escape flat fitness, or better terminate? NOTE(review): f_obj is the ranking left by the last particle of the sampling loop, not necessarily p's
+	    if (f_obj.get(0).f == f_obj.get(std::ceil(0.7*lambda)).f )
+	    {
+	    	vd.getProp<sigma>(p) = vd.getProp<sigma>(p)*exp(0.2+cs/d_amps);
+	    	std::cout << "warning: flat fitness, consider reformulating the objective" << std::endl;
+
+	    	// Stop it
+	    	vd.getProp<stop>(p) = true;
+	    }
+
+	    if (vd.getProp<stop>(p) == true)
+	    {std::cout << "Stopped" << std::endl;}
+
+	    if (restart_cma && vd.getProp<stop>(p) == true)
+	    {
+	    	std::cout << "------- Restart #" << std::endl;
+
+	    	std::cout << "---------------------------------" << std::endl;
+	    	std::cout << "Best: " << best << "   " << fun_eval << std::endl;
+	        std::cout << "---------------------------------" << std::endl;
+
+	        vd.getProp<last_restart>(p) = step;	// hsig and the bound handling count the steps elapsed since this restart
+	        vd.getProp<xold>(p).setZero();
+
+			for (size_t i = 0 ; i < vd.getProp<D>(p).diagonal().size() ; i++)
+			{vd.getProp<D>(p).diagonal()[i] = 1.0;}
+			vd.getProp<B>(p).resize(dim,dim);
+			vd.getProp<B>(p).setIdentity();
+			vd.getProp<Cov_m>(p) = vd.getProp<B>(p)*vd.getProp<D>(p)*vd.getProp<D>(p)*vd.getProp<B>(p);
+			vd.getProp<path_s>(p).resize(dim);
+			vd.getProp<path_s>(p).setZero(dim);
+			vd.getProp<path_c>(p).resize(dim);
+			vd.getProp<path_c>(p).setZero(dim);
+			vd.getProp<stop>(p) = false;
+			vd.getProp<iniphase>(p) = true;
+			vd.getProp<sigma>(p) = 2.0;
+
+			// a different point in space
+			for (size_t i = 0 ; i < dim ; i++)
+			{
+				// we define x, assign a random position between -5.0 and 5.0
+				vd.getPos(p)[i] = 10.0*(double)rand() / RAND_MAX - 5.0;
+			}
+
+			// Initialize the bound history
+
+			for (size_t i = 0 ; i < hist_size ; i++)
+			{vd.getProp<fithist>(p)[i] = -1.0;}
+			vd.getProp<fithist>(p)[0] = 1.0;
+			vd.getProp<validfit>(p) = 0.0;
+		}
+
+		++it2;
+	}
+
+	auto & v_cl = create_vcluster();
+	v_cl.sum(fe);
+	v_cl.execute();
+
+	fun_eval += fe;
+}
+
+
+// Entry point: creates the particles, each carrying its own CMA-ES state, and calls cma_step until a stop criterion is met
+int main(int argc, char* argv[])
+{
+    // initialize the library
+	openfpm_init(&argc,&argv);
+
+	auto & v_cl = create_vcluster();
+
+	// Here we define our domain: a dim-dimensional box from 0 to 1.0 on every axis
+	Box<dim,double> domain;
+
+	for (size_t i = 0 ; i < dim ; i++)
+	{
+		domain.setLow(i,0.0);
+		domain.setHigh(i,1.0);
+	}
+
+	// Here we define the boundary conditions of our problem
+	size_t bc[dim];
+	for (size_t i = 0 ; i < dim ; i++)
+    {bc[i] = NON_PERIODIC;};
+
+	prepare_f15<dim>();
+
+	// extended boundary around the domain, and the processor domain
+	Ghost<dim,double> g(0.0);
+
+    particle_type vd(16,domain,bc,g);
+
+    // Initialize constants
+
+    stop_fitness = 1e-10;
+    size_t stopeval = 1e3*dim*dim;
+
+    // Strategy parameter setting: Selection
+    init_weight();
+
+    // Strategy parameter setting: Adaptation
+    cc = 4.0 / (dim+4.0);
+    cs = (mu_eff+2.0) / (double(dim)+mu_eff+3.0);
+    ccov = (1.0/mu_eff) * 2.0/((dim+1.41)*(dim+1.41)) +
+    	   (1.0 - 1.0/mu_eff)* std::min(1.0,(2.0*mu_eff-1.0)/((dim+2.0)*(dim+2.0) + mu_eff));
+    d_amps = 1 + 2*std::max(0.0, sqrt((mu_eff-1.0)/(dim+1))-1) + cs;
+
+    chiN = sqrt(dim)*(1.0-1.0/(4.0*dim)+1.0/(21.0*dim*dim));
+
+	//! \cond [assign position] \endcond
+
+
+	// initialize the srand
+	int seed = 24756*v_cl.rank()*v_cl.rank() + time(NULL);
+	srand(seed);
+
+	auto it = vd.getDomainIterator();
+
+	while (it.isNext())
+	{
+		auto p = it.get();
+
+		for (size_t i = 0 ; i < dim ; i++)
+		{
+			// we define x, assign a random position between -5.0 and 5.0
+			vd.getPos(p)[i] = 10.0*(double)rand() / RAND_MAX - 5.0;	// NOTE(review): can fall outside the [0,1] box set on domain above - confirm
+		}
+
+		vd.getProp<sigma>(p) = 2.0;
+
+		// Initialize the covariant Matrix,B and D to identity
+
+		vd.getProp<D>(p).resize(dim);
+		for (size_t i = 0 ; i < vd.getProp<D>(p).diagonal().size() ; i++)
+		{vd.getProp<D>(p).diagonal()[i] = 1.0;}
+		vd.getProp<B>(p).resize(dim,dim);
+		vd.getProp<B>(p).setIdentity();
+		vd.getProp<Cov_m>(p) = vd.getProp<B>(p)*vd.getProp<D>(p)*vd.getProp<D>(p)*vd.getProp<B>(p);
+		vd.getProp<path_s>(p).resize(dim);
+		vd.getProp<path_s>(p).setZero(dim);
+		vd.getProp<path_c>(p).resize(dim);
+		vd.getProp<path_c>(p).setZero(dim);
+		vd.getProp<stop>(p) = false;
+		vd.getProp<iniphase>(p) = true;
+		vd.getProp<last_restart>(p) = 0;
+
+		// Initialize the bound history
+
+		for (size_t i = 0 ; i < hist_size ; i++)
+		{vd.getProp<fithist>(p)[i] = -1.0;}
+		vd.getProp<fithist>(p)[0] = 1.0;
+		vd.getProp<validfit>(p) = 0.0;
+
+		// next particle
+		++it;
+	}
+
+	if (v_cl.rank() == 0)
+	{std::cout << "Starting PS-CMA-ES" << std::endl;}
+
+	double best = 0.0;
+	int best_i = 0;
+
+	best = std::numeric_limits<double>::max();
+	openfpm::vector<double> best_sol(dim);
+	// now do several iteration
+
+	int stop_cond = 0;
+	size_t fun_eval = 0;
+	int i = 0;
+	while (fun_eval < max_fun_eval && best > 120.000001)	// 120.000001: hard-coded target - TODO confirm it matches the optimum of the objective
+	{
+		// sample offspring
+		cma_step(vd,i+1,best,best_i,best_sol,fun_eval);
+
+		i++;
+	}
+
+	if (v_cl.rank() == 0)
+	{
+		std::cout << "Best solution: " << best << " with " << fun_eval << std::endl;
+		std::cout << "at: " << std::endl;
+
+		for (size_t i = 0 ; i < best_sol.size() ; i++)
+		{
+			std::cout << best_sol.get(i) << " ";
+		}
+	}
+
+	openfpm_finalize();
+
+	//! \cond [finalize] \endcond
+
+	/*!
+	 * \page Vector_0_simple Vector 0 simple
+	 *
+	 * ## Full code ## {#code_e0_sim}
+	 *
+	 * \include Vector/0_simple/main.cpp
+	 *
+	 */
+}
diff --git a/example/Numerics/Vortex_in_cell/Makefile b/example/Numerics/Vortex_in_cell/Makefile
index 4767e9df8811a61d521b119b0d5e6016f3ddce76..9d1ae5f955fffa142fcb4032b4eddf53109dcd71 100644
--- a/example/Numerics/Vortex_in_cell/Makefile
+++ b/example/Numerics/Vortex_in_cell/Makefile
@@ -1,4 +1,4 @@
-include ../../../example.mk
+include ../../example.mk
 
 CC=mpic++
 
diff --git a/example/Numerics/Vortex_in_cell/main_vic_petsc.cpp b/example/Numerics/Vortex_in_cell/main_vic_petsc.cpp
index f9e1de78f5761012771a8a61c92facc075fbfd6f..2283fa4ac712af07030fc7f994226dc12f7ce55d 100644
--- a/example/Numerics/Vortex_in_cell/main_vic_petsc.cpp
+++ b/example/Numerics/Vortex_in_cell/main_vic_petsc.cpp
@@ -877,9 +877,9 @@ template<typename grid> void calc_rhs(grid & g_vort, grid & g_vel, grid & g_dwp)
 
 	// calculate several pre-factors for the stencil finite
 	// difference
-	float fac1 = 2.0f*nu/(g_vort.spacing(0)*g_vort.spacing(0));
-	float fac2 = 2.0f*nu/(g_vort.spacing(1)*g_vort.spacing(1));
-	float fac3 = 2.0f*nu/(g_vort.spacing(2)*g_vort.spacing(2));
+	float fac1 = 1.0f*nu/(g_vort.spacing(0)*g_vort.spacing(0));
+	float fac2 = 1.0f*nu/(g_vort.spacing(1)*g_vort.spacing(1));
+	float fac3 = 1.0f*nu/(g_vort.spacing(2)*g_vort.spacing(2));
 
 	float fac4 = 0.5f/(g_vort.spacing(0));
 	float fac5 = 0.5f/(g_vort.spacing(1));
diff --git a/example/VCluster/0_simple/main.cpp b/example/VCluster/0_simple/main.cpp
index af5dc17f0c0ae06a7bc4132c801df05261c667be..c0bab78450f7f099747519c88d7dd9ee65a9fb39 100644
--- a/example/VCluster/0_simple/main.cpp
+++ b/example/VCluster/0_simple/main.cpp
@@ -7,6 +7,7 @@
  *
  * \subpage VCluster_0_simple
  * \subpage VCluster_1_semantic
+ * \subpage VCluster_2_serial_and_parallel
  *
  */
 
diff --git a/example/Vector/7_SPH_dlb_opt/main.cpp b/example/Vector/7_SPH_dlb_opt/main.cpp
index d576c1a1d0f69647fc4ebe816c8c3585d6e0e43c..dff03e4625269cbb3f4d372fc76f765dc798dd8f 100644
--- a/example/Vector/7_SPH_dlb_opt/main.cpp
+++ b/example/Vector/7_SPH_dlb_opt/main.cpp
@@ -1118,6 +1118,7 @@ int main(int argc, char* argv[])
 		{
 			vd.deleteGhost();
 			vd.write_frame("Geometry",write,VTK_WRITER | FORMAT_BINARY);
+                        vd.getDecomposition().write("dec" + std::to_string(write));
 			vd.ghost_get<type,rho,Pressure,velocity>(SKIP_LABELLING);
 			write++;
 
diff --git a/images/vector.cpp b/images/vector.cpp
index 7b9e3361a137e05c2d546d1f5ccb87f160a7f11b..9414d2e6af4fba9846154c1212699ef32c544a36 100644
--- a/images/vector.cpp
+++ b/images/vector.cpp
@@ -34,6 +34,11 @@ public:
 
 #endif
 
+	static inline bool noPointers()	// always true; presumably declares that the type holds no pointers - confirm
+	{
+		return true;
+	}
+
 };
 
 int main(int argc, char* argv[])
@@ -58,7 +63,7 @@ int main(int argc, char* argv[])
     size_t bc[2]={PERIODIC,PERIODIC};
 	Ghost<2,float> g(0.01);
 	
-	vector_dist<2,float, Particle<float>, CartDecomposition<2,float> > vd(4096,domain,bc,g);
+	vector_dist<2,float, Particle<float> > vd(4096,domain,bc,g);
 
 	auto it = vd.getIterator();
 
diff --git a/openfpm_data b/openfpm_data
index f7ca1bc2fe8eeb9c6e3bdf34898341ffa91f8c1c..6c2a5911ac16f93ab0ae1e7ac14723c952aa5c16 160000
--- a/openfpm_data
+++ b/openfpm_data
@@ -1 +1 @@
-Subproject commit f7ca1bc2fe8eeb9c6e3bdf34898341ffa91f8c1c
+Subproject commit 6c2a5911ac16f93ab0ae1e7ac14723c952aa5c16
diff --git a/openfpm_devices b/openfpm_devices
index 2da3b22b477d8b94b60fb9eb5f1a4daacb6857b5..46e4994c5dff879a71e6ae090c50b2f23235d435 160000
--- a/openfpm_devices
+++ b/openfpm_devices
@@ -1 +1 @@
-Subproject commit 2da3b22b477d8b94b60fb9eb5f1a4daacb6857b5
+Subproject commit 46e4994c5dff879a71e6ae090c50b2f23235d435
diff --git a/openfpm_io b/openfpm_io
index fac23ddd992dc17d82904bd5083f5235416c2255..89411e76fbed1ab098d2bfec13e7759aca51a14d 160000
--- a/openfpm_io
+++ b/openfpm_io
@@ -1 +1 @@
-Subproject commit fac23ddd992dc17d82904bd5083f5235416c2255
+Subproject commit 89411e76fbed1ab098d2bfec13e7759aca51a14d
diff --git a/openfpm_numerics b/openfpm_numerics
index 4e569e3bcec0ac24ebd0b2a30a1b7bf9b602497d..d9df4f304c897432f85291233218b036c8857523 160000
--- a/openfpm_numerics
+++ b/openfpm_numerics
@@ -1 +1 @@
-Subproject commit 4e569e3bcec0ac24ebd0b2a30a1b7bf9b602497d
+Subproject commit d9df4f304c897432f85291233218b036c8857523
diff --git a/openfpm_pdata.doc b/openfpm_pdata.doc
index afd54c557ca81549f429a12fb10d2e8989dde33f..e356c1f644c459d47927b845e8cb1a8d06b43103 100644
--- a/openfpm_pdata.doc
+++ b/openfpm_pdata.doc
@@ -38,7 +38,7 @@ PROJECT_NAME           = "OpenFPM_pdata"
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER         = 1.0.0
+PROJECT_NUMBER         = 1.1.0
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
diff --git a/openfpm_vcluster b/openfpm_vcluster
index a99918127f5835c31d2df4e9020efdeb46d07d66..dba6676c2638e01dc3b9277dbbd80ea3f46ea6d0 160000
--- a/openfpm_vcluster
+++ b/openfpm_vcluster
@@ -1 +1 @@
-Subproject commit a99918127f5835c31d2df4e9020efdeb46d07d66
+Subproject commit dba6676c2638e01dc3b9277dbbd80ea3f46ea6d0
diff --git a/src/Decomposition/CartDecomposition.hpp b/src/Decomposition/CartDecomposition.hpp
index 8b91795fab6c39f9fd97465a915150c2d3b94e95..806cc50eb1ead1e5d8ec2597f1ec35760aec1b0b 100755
--- a/src/Decomposition/CartDecomposition.hpp
+++ b/src/Decomposition/CartDecomposition.hpp
@@ -41,6 +41,50 @@
 
 #define CARTDEC_ERROR 2000lu
 
+/*! \brief Spread n_sub sub-sub-domains over the first dim_r directions
+ *
+ * Each of the first dim_r directions gets round_big_2(n_sub^(1/dim_r))
+ * divisions, every remaining direction gets exactly one.
+ *
+ * \warning each division is 2^n with some n and the product of the
+ *          divisions is in general not equal to n_sub
+ *
+ * \param div number of division on each direction as output
+ * \param n_sub number of sub-domain
+ * \param dim_r number of leading directions that are actually divided
+ */
+template<unsigned int dim> static void nsub_to_div2(size_t (& div)[dim], size_t n_sub, size_t dim_r)
+{
+	for (size_t d = 0; d < dim; ++d)
+	{
+		// directions past dim_r are left undivided
+		if (d >= dim_r)
+		{div[d] = 1; continue;}
+		div[d] = openfpm::math::round_big_2(pow(n_sub, 1.0 / dim_r));
+	}
+}
+
+/*! \brief Spread n_sub sub-sub-domains over the first dim_r directions
+ *
+ * \warning each of the first dim_r directions gets floor(n_sub^(1/dim_r))
+ *          divisions, so the number of sub-sub-domain is in general not
+ *          preserved
+ *
+ * \param div number of division on each direction as output
+ * \param n_sub number of sub-domain
+ * \param dim_r number of leading directions that are divided, the others get 1
+ */
+template<unsigned int dim> static void nsub_to_div(size_t (& div)[dim], size_t n_sub, size_t dim_r)
+{
+	// pow() can land just below an exact root (64^(1/3) gives 3.99..), so
+	// round to nearest first and step down while root^dim_r exceeds n_sub
+	size_t root = (dim_r == 0)?1:(size_t)std::floor(pow(n_sub, 1.0 / dim_r) + 0.5);
+	auto overshoots = [&](){size_t p = 1; for (size_t j = 0 ; j < dim_r && p <= n_sub ; j++) {p *= root;} return p > n_sub;};
+	while (root > 1 && overshoots()) {root--;}
+
+	for (size_t i = 0; i < dim; i++)
+	{div[i] = (i < dim_r)?root:1;}
+}
+
 #define COMPUTE_SKIN_SUB 1
 
 /**
@@ -352,13 +396,13 @@ public:
 			// calculate the sub-divisions
 			size_t div[dim];
 			for (size_t i = 0; i < dim; i++)
-				div[i] = (size_t) ((bound.getHigh(i) - bound.getLow(i)) / cd.getCellBox().getP2()[i]);
+			{div[i] = (size_t) ((bound.getHigh(i) - bound.getLow(i)) / cd.getCellBox().getP2()[i]);}
 
 			// Initialize the geo_cell structure
 			ie_ghost<dim,T>::Initialize_geo_cell(bound,div);
 
 			// Initialize shift vectors
-			ie_ghost<dim,T>::generateShiftVectors(domain);
+			ie_ghost<dim,T>::generateShiftVectors(domain,bc);
 		}
 	}
 
@@ -512,9 +556,7 @@ public:
 	 \endverbatim
 
 	 *
-	 *
-	 *
-	 * \param ghost margins for each dimensions (p1 negative part) (p2 positive part)
+	 * ghost margins for each dimensions (p1 negative part) (p2 positive part)
 	 *
 	 *
 	 \verbatim
@@ -798,6 +840,9 @@ public:
 		cart.box_nn_processor = box_nn_processor;
 		cart.fine_s = fine_s;
 		cart.gr = gr;
+		cart.gr_dist = gr_dist;
+		cart.dist = dist;
+		cart.commCostSet = commCostSet;
 		cart.cd = cd;
 		cart.domain = domain;
 		for (size_t i = 0 ; i < dim ; i++)
@@ -830,11 +875,17 @@ public:
 		box_nn_processor = cart.box_nn_processor;
 		fine_s = cart.fine_s;
 		gr = cart.gr;
+		gr_dist = cart.gr_dist;
+		dist = cart.dist;
+		commCostSet = cart.commCostSet;
 		cd = cart.cd;
 		domain = cart.domain;
 
 		for (size_t i = 0 ; i < dim ; i++)
-		{spacing[i] = cart.spacing[i];};
+		{
+			spacing[i] = cart.spacing[i];
+			magn[i] = cart.magn[i];
+		};
 
 		ghost = cart.ghost;
 
@@ -863,10 +914,16 @@ public:
 		box_nn_processor.swap(cart.box_nn_processor);
 		fine_s.swap(cart.fine_s);
 		gr = cart.gr;
+		gr_dist = cart.gr_dist;
+		dist = cart.dist;
+		commCostSet = cart.commCostSet;
 		cd = cart.cd;
 		domain = cart.domain;
 		for (size_t i = 0 ; i < dim ; i++)
-		{spacing[i] = cart.spacing[i];};
+		{
+			spacing[i] = cart.spacing[i];
+			magn[i] = cart.magn[i];
+		};
 
 		ghost = cart.ghost;
 
@@ -1030,6 +1087,77 @@ public:
 		}
 	}
 
+	/*! \brief Set the best parameters for the decomposition
+	 *
+	 * Based on the number of processors and on the dimensionality it picks the
+	 * divisions of each direction and forwards everything to setParameters
+	 *
+	 * \param domain_ domain to decompose
+	 * \param bc boundary conditions
+	 * \param ghost Ghost size
+	 * \param dec_gran number of sub-sub-domain for each processor
+	 * \param sec_dist Distribution grid. The distribution grid help in reducing the underlying
+	 *                 distribution problem simplifying decomposition problem. This is done in order to
+	 *                 reduce the load/balancing dynamic load balancing problem
+	 *
+	 */
+	void setGoodParameters(::Box<dim,T> domain_,
+						   const size_t (& bc)[dim],
+						   const Ghost<dim,T> & ghost,
+						   size_t dec_gran,
+						   const grid_sm<dim,void> & sec_dist = grid_sm<dim,void>())
+	{
+		size_t div[dim];
+
+		// Create a valid decomposition of the space
+		// Get the number of processor and calculate the number of sub-domain
+		// for decomposition
+		size_t n_proc = v_cl.getProcessingUnits();
+		size_t n_sub = n_proc * dec_gran;
+
+		// Calculate the maximum number (before merging) of sub-domain on
+		// each dimension
+
+		nsub_to_div2(div,n_sub,dim);
+
+/*		for (size_t i = 0; i < dim; i++)
+		{
+			div[i] = openfpm::math::round_big_2(pow(n_sub, 1.0 / dim));
+		}*/
+		// for dim <= 3 the divisions computed by nsub_to_div2 are used as they are
+		if (dim > 3)
+		{
+			long int dim_r = dim-1;
+			do
+			{
+				// Check for adjustment
+				size_t tot_size = 1;
+				for (size_t i = 0 ; i < dim ; i++)
+				{tot_size *= div[i];}
+
+				// more than 0.75*dec_gran sub-sub-domains per processor: keep the current div
+				if (tot_size / n_proc > 0.75*dec_gran )
+				{break;}
+
+				nsub_to_div(div,n_sub,dim_r);
+
+				dim_r--;
+			} while(dim_r > 0);
+		}
+
+		setParameters(div,domain_,bc,ghost,sec_dist);
+	}
+
+	/*! \brief Copy out the number of divisions of the decomposition grid
+	 *
+	 * \param div_ on output div_[i] is the size of gr along direction i
+	 *
+	 */
+	void getParameters(size_t (& div_)[dim])
+	{
+		for (size_t d = 0 ; d != dim ; ++d)
+		{div_[d] = this->gr.size(d);}
+	}
 
 	/*! \brief Set the parameter of the decomposition
 	 *
@@ -1042,7 +1170,11 @@ public:
 	 *                 reduce the load/balancing dynamic load balancing problem
 	 *
 	 */
-	void setParameters(const size_t (& div_)[dim], ::Box<dim,T> domain_, const size_t (& bc)[dim] ,const Ghost<dim,T> & ghost, const grid_sm<dim,void> & sec_dist = grid_sm<dim,void>())
+	void setParameters(const size_t (& div_)[dim],
+					   ::Box<dim,T> domain_,
+						const size_t (& bc)[dim],
+						const Ghost<dim,T> & ghost,
+						const grid_sm<dim,void> & sec_dist = grid_sm<dim,void>())
 	{
 		// set the boundary conditions
 		for (size_t i = 0 ; i < dim ; i++)
@@ -1096,7 +1228,7 @@ public:
 		reset();
 
 		if (commCostSet == false)
-			computeCommunicationAndMigrationCosts(1);
+		{computeCommunicationAndMigrationCosts(1);}
 
 		dist.decompose();
 
@@ -1118,7 +1250,7 @@ public:
 		reset();
 
 		if (commCostSet == false)
-			computeCommunicationAndMigrationCosts(ts);
+		{computeCommunicationAndMigrationCosts(ts);}
 
 		dist.refine();
 
@@ -1140,7 +1272,7 @@ public:
 		reset();
 
 		if (commCostSet == false)
-			computeCommunicationAndMigrationCosts(ts);
+		{computeCommunicationAndMigrationCosts(ts);}
 
 		dist.redecompose();
 
diff --git a/src/Decomposition/Distribution/DistParMetisDistribution.hpp b/src/Decomposition/Distribution/DistParMetisDistribution.hpp
index f96a345c3b647eda74736ba4cfecc5735bd67053..d20e71fc302af7c14eacdc7324bd56087e14ef2c 100644
--- a/src/Decomposition/Distribution/DistParMetisDistribution.hpp
+++ b/src/Decomposition/Distribution/DistParMetisDistribution.hpp
@@ -272,9 +272,7 @@ public:
 
 	/*! \brief return number of moved vertices in all iterations so far
 	 *
-	 * \param id vertex id
-	 *
-	 * \return vector with x, y, z
+	 * \return number of moved vertices
 	 *
 	 */
 	size_t getMaxMovedV()
diff --git a/src/Decomposition/Distribution/Distribution_unit_tests.hpp b/src/Decomposition/Distribution/Distribution_unit_tests.hpp
index 37a9944f9a97d5d8276acaff2be74952dadb96cc..1c527d5b6ee2bde0f96a192aa31991513cfe22f9 100644
--- a/src/Decomposition/Distribution/Distribution_unit_tests.hpp
+++ b/src/Decomposition/Distribution/Distribution_unit_tests.hpp
@@ -422,11 +422,6 @@ BOOST_AUTO_TEST_CASE( Space_distribution_test)
 	//! [refine with dist_parmetis the decomposition]
 }
 
-void print_test_v(std::string test, size_t sz)
-{
-	if (create_vcluster().getProcessUnitID() == 0)
-		std::cout << test << " " << sz << "\n";
-}
 
 BOOST_AUTO_TEST_SUITE_END()
 
diff --git a/src/Decomposition/Distribution/ParMetisDistribution.hpp b/src/Decomposition/Distribution/ParMetisDistribution.hpp
index 9ee2435da83384ec736f57779d4e21ea8bd5b9c9..cacf912a91b66389ab380a7259ac08162fefdba2 100644
--- a/src/Decomposition/Distribution/ParMetisDistribution.hpp
+++ b/src/Decomposition/Distribution/ParMetisDistribution.hpp
@@ -304,6 +304,7 @@ public:
 	 *
 	 */
 	ParMetisDistribution(ParMetisDistribution<dim,T> && pm)
+	:v_cl(pm.v_cl)
 	{
 		this->operator=(pm);
 	}
@@ -638,6 +639,7 @@ public:
 		verticesGotWeights = dist.verticesGotWeights;
 		sub_sub_owner = dist.sub_sub_owner;
 		m2g = dist.m2g;
+		parmetis_graph = dist.parmetis_graph;
 
 		return *this;
 	}
@@ -655,6 +657,7 @@ public:
 		verticesGotWeights = dist.verticesGotWeights;
 		sub_sub_owner.swap(dist.sub_sub_owner);
 		m2g.swap(dist.m2g);
+		parmetis_graph = dist.parmetis_graph;
 
 		return *this;
 	}
diff --git a/src/Decomposition/Distribution/SpaceDistribution.hpp b/src/Decomposition/Distribution/SpaceDistribution.hpp
index 2aa6afc52a297e75f3aab6726b054170e0f62dae..26343e14b2f32cd423e86cab88d8ac743a18586d 100644
--- a/src/Decomposition/Distribution/SpaceDistribution.hpp
+++ b/src/Decomposition/Distribution/SpaceDistribution.hpp
@@ -10,6 +10,7 @@
 
 #include "util/mathutil.hpp"
 #include "NN/CellList/CellDecomposer.hpp"
+#include "Grid/grid_key_dx_iterator_hilbert.hpp"
 
 /*! \brief Class that distribute sub-sub-domains across processors using an hilbert curve
  *         to divide the space
@@ -63,6 +64,7 @@ public:
 	 *
 	 */
 	SpaceDistribution(SpaceDistribution<dim,T> && pm)
+	:v_cl(pm.v_cl)
 	{
 		this->operator=(pm);
 	}
diff --git a/src/Decomposition/Distribution/parmetis_util.hpp b/src/Decomposition/Distribution/parmetis_util.hpp
index c555326f4a20ef04b67f0e57b04b8bf93696df91..c34bc34a5e65797b548e1b2cd22073deecf6ee36 100755
--- a/src/Decomposition/Distribution/parmetis_util.hpp
+++ b/src/Decomposition/Distribution/parmetis_util.hpp
@@ -526,10 +526,11 @@ public:
 	 */
 	const Parmetis<Graph> & operator=(const Parmetis<Graph> & pm)
 	{
-		comm = pm.comm;
-		v_cl = pm.v_cl;
+		MPI_Comm_dup(pm.comm, &comm);
 		p_id = pm.p_id;
 		nc = pm.nc;
+		n_dec = pm.n_dec;
+		dist_tol = pm.dist_tol;
 
 		setDefaultParameters(pm.Mg.wgtflag[0] == 3);
 
@@ -545,10 +546,12 @@ public:
 	 */
 	const Parmetis<Graph> & operator=(Parmetis<Graph> && pm)
 	{
-		comm = pm.comm;
-		v_cl = pm.v_cl;
+		// TODO Move into VCluster
+		MPI_Comm_dup(pm.comm, &comm);
 		p_id = pm.p_id;
 		nc = pm.nc;
+		n_dec = pm.n_dec;
+		dist_tol = pm.dist_tol;
 
 		setDefaultParameters(pm.Mg.wgtflag[0] == 3);
 
diff --git a/src/Decomposition/Domain_NN_calculator_cart.hpp b/src/Decomposition/Domain_NN_calculator_cart.hpp
index 082c53d80a41423d0fb144693a23e035bedcfc67..8f795bbc04b61cc73b38d79edf20173b08d74429 100644
--- a/src/Decomposition/Domain_NN_calculator_cart.hpp
+++ b/src/Decomposition/Domain_NN_calculator_cart.hpp
@@ -121,7 +121,7 @@ class domain_nn_calculator_cart
 		// +2 is padding
 
 		for (size_t j = 0 ; j < dim ; j++)
-			sz[j] = proc_box.getHigh(j) - proc_box.getLow(j) + 2 + 1;
+		{sz[j] = proc_box.getHigh(j) - proc_box.getLow(j) + 2 + 1;}
 
 		gs.setDimensions(sz);
 
@@ -130,7 +130,7 @@ class domain_nn_calculator_cart
 		g.setMemory();
 
 		for (size_t i = 0 ; i < dim ; i++)
-			one.set_d(i,1);
+		{one.set_d(i,1);}
 
 		// Calculate the csr neighborhood
 		openfpm::vector<std::pair<grid_key_dx<dim>,grid_key_dx<dim>>> csr;
@@ -191,7 +191,7 @@ class domain_nn_calculator_cart
 				sub_keys.last().NN_subsub.resize(g.template get<0>(key).size());
 
 				for (size_t i = 0 ; i < g.template get<0>(key).size() ; i++)
-					sub_keys.last().NN_subsub.get(i) = g.template get<0>(key).get(i) - one;
+				{sub_keys.last().NN_subsub.get(i) = g.template get<0>(key).get(i) - one;}
 			}
 
 			++it;
diff --git a/src/Decomposition/ORB.hpp b/src/Decomposition/ORB.hpp
index d8efe63aabc831ab17bede98c4bd5b7bf847eecd..e9422298e8402de9d23c2433946798b959dd4d5a 100755
--- a/src/Decomposition/ORB.hpp
+++ b/src/Decomposition/ORB.hpp
@@ -8,7 +8,6 @@
 #ifndef ORB_HPP_
 #define ORB_HPP_
 
-#include "data_type/scalar.hpp"
 #include "util/mathutil.hpp"
 
 /*! \brief this class is a functor for "for_each" algorithm
@@ -77,7 +76,7 @@ struct do_when_dim_gr_i<dim,i,ORB,typename boost::enable_if< boost::mpl::bool_<(
  *
  */
 
-template<typename T> class ORB_node : public scalar<T>
+template<typename T> class ORB_node : public aggregate<T>
 {
 public:
 
diff --git a/src/Decomposition/common.hpp b/src/Decomposition/common.hpp
index f9cdfeb61108b4d7c52edd10bbe14544b7c6399a..5a920679a1f1ea284d3e5a2d0f77577c71957196 100755
--- a/src/Decomposition/common.hpp
+++ b/src/Decomposition/common.hpp
@@ -83,6 +83,12 @@ struct Box_sub
 
 	//! see ie_ghost follow sector explanation
 	comb<dim> cmb;
+
+	//! Default constructor, it zeroes the sector combination cmb
+	Box_sub()
+	{
+		cmb.zero();
+	}
 };
 
 //! Particular case for local internal ghost boxes
diff --git a/src/Decomposition/ie_ghost.hpp b/src/Decomposition/ie_ghost.hpp
index 2847d8c747287fcffcef0cfc5dca99c90a2eec7e..7863bdb8092a97084ff4926b7b9b56e18bac26ab 100755
--- a/src/Decomposition/ie_ghost.hpp
+++ b/src/Decomposition/ie_ghost.hpp
@@ -10,6 +10,8 @@
 
 #include "common.hpp"
 #include "nn_processor.hpp"
+#include "Decomposition/shift_vect_converter.hpp"
+
 
 /*! \brief structure that store and compute the internal and external local ghost box
  *
@@ -38,7 +40,7 @@ class ie_ghost
 	openfpm::vector<p_box<dim,T> > vb_int;
 
 	//! Cell-list that store the geometrical information of the internal ghost boxes
-	CellList<dim,T,Mem_fast,shift<dim,T>> geo_cell;
+	CellList<dim,T,Mem_fast<>,shift<dim,T>> geo_cell;
 
 	//! shift vectors
 	openfpm::vector<Point<dim,T>> shifts;
@@ -49,6 +51,8 @@ class ie_ghost
 	//! Temporal buffers to return temporal information
 	openfpm::vector<size_t> ids;
 
+	//! shift converter
+	shift_vect_converter<dim,T> sc_convert;
 
 	/*! \brief Given a local sub-domain i, it give the id of such sub-domain in the sent list
 	 *         for the processor p_id
@@ -153,35 +157,9 @@ protected:
 	 * \param domain box that describe the domain
 	 *
 	 */
-	void generateShiftVectors(const Box<dim,T> & domain)
+	void generateShiftVectors(const Box<dim,T> & domain, size_t (& bc)[dim])
 	{
-		shifts.resize(openfpm::math::pow(3,dim));
-
-		HyperCube<dim> hyp;
-
-		for (long int i = dim-1 ; i >= 0 ; i--)
-		{
-			std::vector<comb<dim>> cmbs = hyp.getCombinations_R(i);
-
-			for (size_t j = 0 ; j < cmbs.size() ; j++)
-			{
-				for (size_t k = 0 ; k < dim ; k++)
-				{
-					switch (cmbs[j][k])
-					{
-					case 1:
-						shifts.get(cmbs[j].lin()).template get<0>()[k] = -(domain.getHigh(k) - domain.getLow(k));
-						break;
-					case 0:
-						shifts.get(cmbs[j].lin()).template get<0>()[k] = 0;
-						break;
-					case -1:
-						shifts.get(cmbs[j].lin()).template get<0>()[k] = (domain.getHigh(k) - domain.getLow(k));
-						break;
-					}
-				}
-			}
-		}
+		sc_convert.generateShiftVectors(domain,bc,shifts);
 	}
 
 	/*! \brief Initialize the geo cell list structure
@@ -196,7 +174,7 @@ protected:
 	void Initialize_geo_cell(const Box<dim,T> & domain, const size_t (&div)[dim])
 	{
 		// Initialize the geo_cell structure
-		geo_cell.Initialize(domain,div);
+		geo_cell.Initialize(domain,div,0);
 	}
 
 	/*! \brief Create the box_nn_processor_int (bx part)  structure
@@ -371,7 +349,7 @@ protected:
 						b_int.lc_proc = lc_proc;
 
 						// fill the shift id
-						b_int.shift_id = nn_p_box_pos.get(k).lin();
+						b_int.shift_id = convertShift(nn_p_box_pos.get(k));
 
 						//
 						// Updating
@@ -531,6 +509,20 @@ public:
 		return shifts;
 	}
 
+	/*! \brief Convert a sector combination into the id of its shift vector
+	 *
+	 * In high dimensions the number of shift vectors explodes exponentially, so we
+	 * expect some of the boundaries to be non periodic to reduce the number of shift
+	 * vectors. The conversion is delegated to sc_convert.linId()
+	 *
+	 * \param cmb sector combination to convert
+	 * \return the linearized id produced by sc_convert.linId(cmb)
+	 */
+	size_t convertShift(const comb<dim> & cmb)
+	{
+		return sc_convert.linId(cmb);
+	}
+
 	/*! \brief Get the number of Internal ghost boxes for one processor
 	 *
 	 * \param id near processor list id (the id go from 0 to getNNProcessor())
diff --git a/src/Decomposition/ie_loc_ghost.hpp b/src/Decomposition/ie_loc_ghost.hpp
index 727b1dca1d063db2be6ff4b4b88d8e0405320493..64f50caee5cef345cd40717191f42c4be980a5c3 100755
--- a/src/Decomposition/ie_loc_ghost.hpp
+++ b/src/Decomposition/ie_loc_ghost.hpp
@@ -159,7 +159,7 @@ class ie_loc_ghost
 		// that must be adjusted, each of this boxes define a shift in case of periodic boundary condition
 		for (long int i = dim-1 ; i >= 0 ; i--)
 		{
-			std::vector<comb<dim>> cmbs = hyp.getCombinations_R(i);
+			std::vector<comb<dim>> cmbs = hyp.getCombinations_R_bc(i,bc);
 
 			for (size_t j = 0 ; j < cmbs.size() ; j++)
 			{
diff --git a/src/Decomposition/nn_processor.hpp b/src/Decomposition/nn_processor.hpp
index aac751e7f06f1e1d62fd1151e5a8fc6dc816c291..42e56002484ca5651f5987cec503e1e508ec35a7 100755
--- a/src/Decomposition/nn_processor.hpp
+++ b/src/Decomposition/nn_processor.hpp
@@ -178,7 +178,7 @@ class nn_prcs
 		// that must be adjusted, each of this boxes define a shift in case of periodic boundary condition
 		for (long int i = dim-1 ; i >= 0 ; i--)
 		{
-			std::vector<comb<dim>> cmbs = hyp.getCombinations_R(i);
+			std::vector<comb<dim>> cmbs = hyp.getCombinations_R_bc(i,bc);
 
 			for (size_t j = 0 ; j < cmbs.size() ; j++)
 			{
diff --git a/src/Decomposition/shift_vect_converter.hpp b/src/Decomposition/shift_vect_converter.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d77d35d9a521847720caeec70ecdfa45e48169c1
--- /dev/null
+++ b/src/Decomposition/shift_vect_converter.hpp
@@ -0,0 +1,196 @@
+/*
+ * shift_vect_converter.hpp
+ *
+ *  Created on: Feb 8, 2018
+ *      Author: i-bird
+ */
+
+#ifndef SRC_DECOMPOSITION_SHIFT_VECT_CONVERTER_HPP_
+#define SRC_DECOMPOSITION_SHIFT_VECT_CONVERTER_HPP_
+
+#include "Space/Shape/HyperCube.hpp"
+
+/*! \brief Shift vector generator and linearizer that also handles high dimensions
+ *
+ * The number of shift vectors grows as 3^dim. For dim < 10 the full comb::lin()
+ * indexing is used, for dim >= 10 only the combinations on the periodic
+ * dimensions are enumerated (see linId_hd)
+ */
+template<unsigned int dim, typename T>
+class shift_vect_converter
+{
+	//! Indexes of the periodic dimensions, only the first dim_r entries are valid
+	size_t red_shift_v[dim];
+
+	// indexes (not used by any method of this class)
+	size_t tmp[dim];
+
+	// Number of periodic dimensions stored in red_shift_v
+	int dim_r = 0;
+
+	/*! \brief Generate the shift vectors for the low dimension case, indexed by comb::lin()
+	 * \param domain box that describe the domain
+	 * \param bc boundary conditions (not used in the low dimension case)
+	 * \param shifts output shift vectors, resized to 3^dim
+	 */
+	void generateShiftVectors_ld(const Box<dim,T> & domain, size_t (& bc)[dim], openfpm::vector<Point<dim,T>> & shifts)
+	{
+		shifts.resize(openfpm::math::pow(3,dim));
+
+		HyperCube<dim> hyp;
+
+		for (long int i = dim-1 ; i >= 0 ; i--)
+		{
+			std::vector<comb<dim>> cmbs = hyp.getCombinations_R(i);
+
+			for (size_t j = 0 ; j < cmbs.size() ; j++)
+			{
+				for (size_t k = 0 ; k < dim ; k++)
+				{
+					switch (cmbs[j][k])
+					{
+					case 1:
+						shifts.get(cmbs[j].lin()).template get<0>()[k] = -(domain.getHigh(k) - domain.getLow(k));
+						break;
+					case 0:
+						shifts.get(cmbs[j].lin()).template get<0>()[k] = 0;
+						break;
+					case -1:
+						shifts.get(cmbs[j].lin()).template get<0>()[k] = (domain.getHigh(k) - domain.getLow(k));
+						break;
+					}
+				}
+			}
+		}
+	}
+
+	/*! \brief Generate the shift vectors for the high dimension case, indexed by linId_hd()
+	 * \param domain box that describe the domain
+	 * \param bc boundary conditions, only the PERIODIC dimensions produce shifts
+	 * \param shifts output shift vectors
+	 */
+	void generateShiftVectors_hd(const Box<dim,T> & domain, size_t (& bc)[dim], openfpm::vector<Point<dim,T>> & shifts)
+	{
+		// collect the periodic (free) dimensions; restart from zero on every call
+		dim_r = 0;
+		for (size_t i = 0 ; i < dim ; i++)
+		{
+			if (bc[i] == PERIODIC)
+			{
+				red_shift_v[dim_r++] = i;
+			}
+		}
+
+		HyperCube<dim> hyp;
+
+		// precalculate the number of shift vectors
+		size_t nsv = 0;
+		for (long int i = dim-1 ; i >= 0 ; i--)
+		{nsv += hyp.getCombinations_R_bc(i,bc).size();}
+		shifts.resize(nsv+1);
+
+		for (long int i = dim-1 ; i >= 0 ; i--)
+		{
+			std::vector<comb<dim>> cmbs = hyp.getCombinations_R_bc(i,bc);
+
+			for (size_t j = 0 ; j < cmbs.size() ; j++)
+			{
+				size_t lin_cmb = linId_hd(cmbs[j]);
+
+				for (size_t k = 0 ; k < dim ; k++)
+				{
+					switch (cmbs[j][k])
+					{
+					case 1:
+						shifts.get(lin_cmb).template get<0>()[k] = -(domain.getHigh(k) - domain.getLow(k));
+						break;
+					case 0:
+						shifts.get(lin_cmb).template get<0>()[k] = 0;
+						break;
+					case -1:
+						shifts.get(lin_cmb).template get<0>()[k] = (domain.getHigh(k) - domain.getLow(k));
+						break;
+					}
+				}
+			}
+		}
+	}
+
+public:
+
+	/*! \brief Generate the shift vectors, selecting the low (dim < 10) or high dimension algorithm
+	 * \param domain box that describe the domain
+	 * \param bc boundary conditions
+	 * \param shifts output shift vectors
+	 */
+	void generateShiftVectors(const Box<dim,T> & domain, size_t (& bc)[dim], openfpm::vector<Point<dim,T>> & shifts)
+	{
+		if (dim < 10)
+		{generateShiftVectors_ld(domain,bc,shifts);}
+		else
+		{generateShiftVectors_hd(domain,bc,shifts);}
+	}
+
+	/*! \brief Initialize the list of periodic dimensions used by linId_hd
+	 *
+	 * \param bc boundary conditions
+	 *
+	 */
+	void Initialize(size_t (& bc)[dim])
+	{
+		// collect the periodic (free) dimensions; restart from zero on every call
+		dim_r = 0;
+		for (size_t i = 0 ; i < dim ; i++)
+		{
+			if (bc[i] == PERIODIC)
+			{
+				red_shift_v[dim_r++] = i;
+			}
+		}
+	}
+
+	/*! \brief linearize the combination in case of high dimension
+	 *
+	 * \param cmb combination
+	 * \return base-3 id built from the periodic components of cmb (each shifted by +1)
+	 */
+	size_t linId_hd(const comb<dim> & cmb)
+	{
+		size_t cul = 1;
+		size_t lin = 0;
+		for (long int i = 0 ; i < dim_r ; i++)
+		{
+			lin += cul*(cmb.c[red_shift_v[i]] + 1);
+			cul *= 3;
+		}
+
+		return lin;
+	}
+
+	/*! \brief linearize the combination in case of low dimensions
+	 *
+	 * \param cmb combination
+	 * \return cmb.lin()
+	 */
+	inline size_t linId_ld(const comb<dim> & cmb)
+	{
+		return cmb.lin();
+	}
+
+	/*! \brief linearize the combination, low dimension form for dim < 10, high dimension otherwise
+	 *
+	 * \param cmb combination
+	 * \return the id of the shift vector associated to cmb
+	 */
+	inline size_t linId(const comb<dim> & cmb)
+	{
+		if (dim < 10)
+		{return linId_ld(cmb);}
+
+		return linId_hd(cmb);
+	}
+
+};
+
+
+#endif /* SRC_DECOMPOSITION_SHIFT_VECT_CONVERTER_HPP_ */
diff --git a/src/Decomposition/CartDecomposition_unit_test.hpp b/src/Decomposition/tests/CartDecomposition_unit_test.cpp
similarity index 80%
rename from src/Decomposition/CartDecomposition_unit_test.hpp
rename to src/Decomposition/tests/CartDecomposition_unit_test.cpp
index 7e7026bff80b1fb198729b70b995fc3fd009e842..e66bbbf31c7d3727faa8b154e052bb1bf2547852 100755
--- a/src/Decomposition/CartDecomposition_unit_test.hpp
+++ b/src/Decomposition/tests/CartDecomposition_unit_test.cpp
@@ -1,7 +1,7 @@
-#ifndef CARTDECOMPOSITION_UNIT_TEST_HPP
-#define CARTDECOMPOSITION_UNIT_TEST_HPP
+#define BOOST_TEST_DYN_LINK
+#include <boost/test/unit_test.hpp>
 
-#include "CartDecomposition.hpp"
+#include "Decomposition/CartDecomposition.hpp"
 #include "util/mathutil.hpp"
 
 BOOST_AUTO_TEST_SUITE (CartDecomposition_test)
@@ -414,6 +414,135 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_non_periodic_test_dist_grid)
 	BOOST_REQUIRE_EQUAL(val,true);
 }
 
+BOOST_AUTO_TEST_CASE( CartDecomposition_nsub_algo_functions_test)
+{
+	size_t n_sub = 64*2;
+	size_t div[3];
+	// nsub_to_div2 with n_sub = 128, last argument 3, 2 and 1
+	nsub_to_div2<3>(div,n_sub,3);
+
+	BOOST_REQUIRE_EQUAL(div[0],8ul);
+	BOOST_REQUIRE_EQUAL(div[1],8ul);
+	BOOST_REQUIRE_EQUAL(div[2],8ul);
+
+	nsub_to_div2<3>(div,n_sub,2);
+
+	BOOST_REQUIRE_EQUAL(div[0],16ul);
+	BOOST_REQUIRE_EQUAL(div[1],16ul);
+	BOOST_REQUIRE_EQUAL(div[2],1ul);
+
+	nsub_to_div2<3>(div,n_sub,1);
+
+	BOOST_REQUIRE_EQUAL(div[0],128ul);
+	BOOST_REQUIRE_EQUAL(div[1],1ul);
+	BOOST_REQUIRE_EQUAL(div[2],1ul);
+	// nsub_to_div with n_sub = 192, last argument 3, 2 and 1
+	n_sub = 64*3;
+	nsub_to_div<3>(div,n_sub,3);
+
+	BOOST_REQUIRE_EQUAL(div[0],5ul);
+	BOOST_REQUIRE_EQUAL(div[1],5ul);
+	BOOST_REQUIRE_EQUAL(div[2],5ul);
+
+	nsub_to_div<3>(div,n_sub,2);
+
+	BOOST_REQUIRE_EQUAL(div[0],13ul);
+	BOOST_REQUIRE_EQUAL(div[1],13ul);
+	BOOST_REQUIRE_EQUAL(div[2],1ul);
+
+	nsub_to_div<3>(div,n_sub,1);
+
+	BOOST_REQUIRE_EQUAL(div[0],192ul);
+	BOOST_REQUIRE_EQUAL(div[1],1ul);
+	BOOST_REQUIRE_EQUAL(div[2],1ul);
+
+	// Test high dimension cart decomposition subdivision
+
+	Box<50,double> domain;
+	size_t bc[50];
+	Ghost<50,double> ghost(0.01);
+
+	for(size_t i = 0 ; i < 50 ; i++)
+	{
+		domain.setLow(i,0.0);
+		domain.setHigh(i,1.0);
+		bc[i] = NON_PERIODIC;
+	}
+
+	CartDecomposition<50,double> dec(create_vcluster());
+
+	dec.setGoodParameters(domain,bc,ghost,64);
+
+	size_t div2[50];
+	dec.getParameters(div2);
+	// the expected divisions depend on the number of processors (1 to 6 are checked)
+	auto & v_cl = create_vcluster();
+	if (v_cl.size() == 1)
+	{
+		for (size_t i = 0 ; i < 50 ; i++)
+		{
+			if (i < 6)
+			{BOOST_REQUIRE_EQUAL(div2[i],2ul);}
+			else
+			{BOOST_REQUIRE_EQUAL(div2[i],1ul);}
+		}
+	}
+
+	if (v_cl.size() == 2)
+	{
+		for (size_t i = 0 ; i < 50 ; i++)
+		{
+			if (i < 7)
+			{BOOST_REQUIRE_EQUAL(div2[i],2ul);}
+			else
+			{BOOST_REQUIRE_EQUAL(div2[i],1ul);}
+		}
+	}
+
+	if (v_cl.size() == 3)
+	{
+		for (size_t i = 0 ; i < 50 ; i++)
+		{
+			if (i < 2)
+			{BOOST_REQUIRE_EQUAL(div2[i],13ul);}
+			else
+			{BOOST_REQUIRE_EQUAL(div2[i],1ul);}
+		}
+	}
+
+	if (v_cl.size() == 4)
+	{
+		for (size_t i = 0 ; i < 50 ; i++)
+		{
+			if (i < 8)
+			{BOOST_REQUIRE_EQUAL(div2[i],2ul);}
+			else
+			{BOOST_REQUIRE_EQUAL(div2[i],1ul);}
+		}
+	}
+
+	if (v_cl.size() == 5)
+	{
+		for (size_t i = 0 ; i < 50 ; i++)
+		{
+			if (i < 8)
+			{BOOST_REQUIRE_EQUAL(div2[i],2ul);}
+			else
+			{BOOST_REQUIRE_EQUAL(div2[i],1ul);}
+		}
+	}
+
+	if (v_cl.size() == 6)
+	{
+		for (size_t i = 0 ; i < 50 ; i++)
+		{
+			if (i < 3)
+			{BOOST_REQUIRE_EQUAL(div2[i],7ul);}
+			else
+			{BOOST_REQUIRE_EQUAL(div2[i],1ul);}
+		}
+	}
+}
+
 BOOST_AUTO_TEST_SUITE_END()
 
-#endif
diff --git a/src/Decomposition/tests/shift_vect_converter_tests.cpp b/src/Decomposition/tests/shift_vect_converter_tests.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..260fc7289e04b2e28136cecc350f694ad9b622da
--- /dev/null
+++ b/src/Decomposition/tests/shift_vect_converter_tests.cpp
@@ -0,0 +1,123 @@
+/*
+ * shift_vect_converter_tests.cpp
+ *
+ *  Created on: Feb 8, 2018
+ *      Author: i-bird
+ */
+
+#define BOOST_TEST_DYN_LINK
+#include <boost/test/unit_test.hpp>
+#include "Space/Shape/Box.hpp"
+
+#include "Vector/map_vector.hpp"
+#include "Decomposition/shift_vect_converter.hpp"
+
+BOOST_AUTO_TEST_SUITE( shift_vect_converter_tests_suite )
+
+BOOST_AUTO_TEST_CASE( shift_vect_converter_tests_use )
+{
+	{
+	Box<3,double> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
+	shift_vect_converter<3,double> svc;
+	size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};
+
+	openfpm::vector<Point<3,double>> sv;
+
+	svc.generateShiftVectors(domain,bc,sv);
+
+	BOOST_REQUIRE_EQUAL(sv.size(),27ul);
+
+	// We test that the combinations generate the correct shift vectors
+	comb<3> cmb1({-1,-1,1});
+	comb<3> cmb2({-1,0,1});
+	comb<3> cmb3({0,0,1});
+
+	size_t i = svc.linId(cmb1);
+
+	BOOST_REQUIRE_EQUAL(sv.get<0>(i)[0],-1.0);
+	BOOST_REQUIRE_EQUAL(sv.get<0>(i)[1],1.0);
+	BOOST_REQUIRE_EQUAL(sv.get<0>(i)[2],1.0);
+
+	i = svc.linId(cmb2);
+
+	BOOST_REQUIRE_EQUAL(sv.get<0>(i)[0],-1.0);
+	BOOST_REQUIRE_EQUAL(sv.get<0>(i)[1],0.0);
+	BOOST_REQUIRE_EQUAL(sv.get<0>(i)[2],1.0);
+
+	i = svc.linId(cmb3);
+
+	BOOST_REQUIRE_EQUAL(sv.get<0>(i)[0],-1.0);
+	BOOST_REQUIRE_EQUAL(sv.get<0>(i)[1],0.0);
+	BOOST_REQUIRE_EQUAL(sv.get<0>(i)[2],0.0);
+
+	}
+	// 50 dimensions with only 3 periodic ones: 27 shift vectors are expected
+	{
+	openfpm::vector<Point<50,double>> sv;
+	Box<50,double> domain;
+	size_t bc[50];
+
+	for (size_t i = 0 ; i < 50 ; i++)
+	{
+		domain.setLow(i,0.0);
+		domain.setHigh(i,1.0);
+		bc[i] = NON_PERIODIC;
+	}
+
+	bc[5] = PERIODIC;
+	bc[17] = PERIODIC;
+	bc[23] = PERIODIC;
+
+	shift_vect_converter<50,double> svc;
+
+	svc.generateShiftVectors(domain,bc,sv);
+
+	BOOST_REQUIRE_EQUAL(sv.size(),27ul);
+
+	// We test that the combinations generate the correct shift vectors
+	comb<50> cmb1;
+	comb<50> cmb2;
+	comb<50> cmb3;
+	// only the components on the periodic dimensions (5, 17 and 23) are set
+	cmb1.c[5] = 1;
+	cmb1.c[17] = -1;
+	cmb1.c[23] = -1;
+
+	cmb2.c[5] = 1;
+	cmb2.c[17] = 0;
+	cmb2.c[23] = -1;
+
+	cmb3.c[5] = 1;
+	cmb3.c[17] = 0;
+	cmb3.c[23] = 0;
+
+	size_t i = svc.linId(cmb1);
+
+	BOOST_REQUIRE_EQUAL(sv.get<0>(i)[5],-1.0);
+	BOOST_REQUIRE_EQUAL(sv.get<0>(i)[6],0.0);
+	BOOST_REQUIRE_EQUAL(sv.get<0>(i)[17],1.0);
+	BOOST_REQUIRE_EQUAL(sv.get<0>(i)[23],1.0);
+	BOOST_REQUIRE_EQUAL(sv.get<0>(i)[24],0.0);
+
+	i = svc.linId(cmb2);
+
+	BOOST_REQUIRE_EQUAL(sv.get<0>(i)[5],-1.0);
+	BOOST_REQUIRE_EQUAL(sv.get<0>(i)[6],0.0);
+	BOOST_REQUIRE_EQUAL(sv.get<0>(i)[17],0.0);
+	BOOST_REQUIRE_EQUAL(sv.get<0>(i)[23],1.0);
+	BOOST_REQUIRE_EQUAL(sv.get<0>(i)[24],0.0);
+
+	i = svc.linId(cmb3);
+
+	BOOST_REQUIRE_EQUAL(sv.get<0>(i)[5],-1.0);
+	BOOST_REQUIRE_EQUAL(sv.get<0>(i)[6],0.0);
+	BOOST_REQUIRE_EQUAL(sv.get<0>(i)[17],0.0);
+	BOOST_REQUIRE_EQUAL(sv.get<0>(i)[23],0.0);
+	BOOST_REQUIRE_EQUAL(sv.get<0>(i)[24],0.0);
+
+	}
+}
+
+
+BOOST_AUTO_TEST_SUITE_END()
+
diff --git a/src/Grid/Iterators/grid_dist_id_iterator.hpp b/src/Grid/Iterators/grid_dist_id_iterator.hpp
index 6cfc5f54a3b08bc9cf95d037c5a764920ec91ac9..60cb2e0c4d3573cc86a0ce95b8995a09225b93c8 100644
--- a/src/Grid/Iterators/grid_dist_id_iterator.hpp
+++ b/src/Grid/Iterators/grid_dist_id_iterator.hpp
@@ -48,7 +48,7 @@ class grid_dist_iterator<dim,device_grid,FREE,stencil>
 	size_t g_c;
 
 	//! List of the grids we are going to iterate
-	const openfpm::vector<device_grid> & gList;
+	openfpm::vector<device_grid> & gList;
 
 	//! Extension of each grid: domain and ghost + domain
 	const openfpm::vector<GBoxes<device_grid::dims>> & gdb_ext;
@@ -59,13 +59,17 @@ class grid_dist_iterator<dim,device_grid,FREE,stencil>
 	//! stop point (is the grid size)
 	grid_key_dx<dim> stop;
 
+	// device grid pointer
+	device_grid * dg;
+
 	/*! \brief from g_c increment g_c until you find a valid grid
 	 *
 	 */
 	void selectValidGrid()
 	{
 		// When the grid has size 0 potentially all the other informations are garbage
-		while (g_c < gList.size() && (gList.get(g_c).size() == 0 || gdb_ext.get(g_c).Dbox.isValid() == false ) ) g_c++;
+		while (g_c < gList.size() && (gList.get(g_c).size() == 0 || gdb_ext.get(g_c).Dbox.isValid() == false ) )
+		{g_c++;}
 
 		// get the next grid iterator
 		if (g_c < gList.size())
@@ -83,7 +87,7 @@ class grid_dist_iterator<dim,device_grid,FREE,stencil>
 	 * \param stop end point
 	 *
 	 */
-	grid_dist_iterator(const openfpm::vector<device_grid> & gk, const openfpm::vector<GBoxes<device_grid::dims>> & gdb_ext, const grid_key_dx<dim> & stop)
+	grid_dist_iterator(openfpm::vector<device_grid> & gk, const openfpm::vector<GBoxes<device_grid::dims>> & gdb_ext, const grid_key_dx<dim> & stop)
 	:g_c(0),gList(gk),gdb_ext(gdb_ext),stop(stop)
 	{
 		// Initialize the current iterator
@@ -100,7 +104,7 @@ class grid_dist_iterator<dim,device_grid,FREE,stencil>
 	 * \param stencil_pnt stencil points
 	 *
 	 */
-	grid_dist_iterator(const openfpm::vector<device_grid> & gk,
+	grid_dist_iterator(openfpm::vector<device_grid> & gk,
 			           const openfpm::vector<GBoxes<device_grid::dims>> & gdb_ext,
 					   const grid_key_dx<dim> & stop,
 					   const grid_key_dx<dim> (& stencil_pnt)[stencil::nsp])
@@ -151,7 +155,7 @@ class grid_dist_iterator<dim,device_grid,FREE,stencil>
 		// If there are no other grid stop
 
 		if (g_c >= gList.size())
-			return false;
+		{return false;}
 
 		return true;
 	}
@@ -240,6 +244,18 @@ class grid_dist_iterator<dim,device_grid,FREE,stencil>
 	{
 		return grid_dist_lin_dx(g_c,a_it.template getStencil<id>());
 	}
+
+	/*! \brief Return the stencil point offset together with the device grid pointer
+	 *
+	 * \tparam id index of the stencil point
+	 *
+	 * \return a grid_dist_g_dx built from dg and a_it.getStencil<id>()
+	 * NOTE(review): dg must be set before this call, its assignment is not visible here -- confirm
+	 */
+	template<unsigned int id> inline grid_dist_g_dx<device_grid> getStencilGrid()
+	{
+		return grid_dist_g_dx<device_grid>(dg,a_it.template getStencil<id>());
+	}
 };
 
 
diff --git a/src/Grid/Iterators/grid_dist_id_iterator_sub.hpp b/src/Grid/Iterators/grid_dist_id_iterator_sub.hpp
index a1730e5dcdcd263219752c93a7462af1f64547f7..1b2533880d8a588e57b39a534443dc1ee73718ca 100644
--- a/src/Grid/Iterators/grid_dist_id_iterator_sub.hpp
+++ b/src/Grid/Iterators/grid_dist_id_iterator_sub.hpp
@@ -249,6 +249,79 @@ class grid_dist_iterator_sub
 	{
 		return stop;
 	}
+
+	/*! \brief Return the number of local grids
+	 *
+	 * \return the size of gList
+	 */
+	inline size_t N_loc_grid()
+	{
+		return gList.size();
+	}
+
+	/*! \brief Return the component j of the starting point (P1) of the domain part
+	 *         for the local grid i
+	 *
+	 * \param i local grid
+	 * \param j dimension
+	 *
+	 * \return the low bound of the domain box (Dbox) of grid i along dimension j
+	 */
+	inline size_t loc_grid_info_start(size_t i,size_t j)
+	{
+		return gdb_ext.get(i).Dbox.getLow(j);
+	}
+
+	/*! \brief Return the component j of the stop point (P2) of the ghost + domain
+	 *         box (GDbox) for the local grid i
+	 *
+	 * \param i local grid
+	 * \param j dimension
+	 *
+	 * \return the high bound of GDbox of grid i along dimension j
+	 */
+	inline size_t loc_grid_info_size(size_t i,size_t j)
+	{
+		return gdb_ext.get(i).GDbox.getHigh(j);
+	}
 };
 
+
+//////// MACRO in 3D
+// WHILE_M: open a loop on the local grids (index s) and cache Dbox low/high and GDbox high of grid s
+#define WHILE_M(grid,stencil) auto & ginfo = grid.getLocalGridsInfo();\
+								 for (size_t s = 0 ; s < grid.getN_loc_grid() ; s++)\
+								 {\
+									 auto it = grid.get_loc_grid_iterator_stencil(s,stencil);\
+\
+										int lo[3] = {(int)ginfo.get(s).Dbox.getLow(0),(int)ginfo.get(s).Dbox.getLow(1),(int)ginfo.get(s).Dbox.getLow(2)};\
+										int hi[3] = {(int)ginfo.get(s).Dbox.getHigh(0),(int)ginfo.get(s).Dbox.getHigh(1),(int)ginfo.get(s).Dbox.getHigh(2)};\
+\
+										int uhi[3] = {(int)ginfo.get(s).GDbox.getHigh(0),(int)ginfo.get(s).GDbox.getHigh(1),(int)ginfo.get(s).GDbox.getHigh(2)};\
+\
+										int sx = uhi[0]+1;\
+										int sxsy = (uhi[0]+1)*(uhi[1]+1);
+// ITERATE_3D_M: open the triple loop from lo to hi, the innermost one (dimension 0) advances by n_pt
+#define ITERATE_3D_M(n_pt)			int i = lo[2];\
+									for ( ; i <= hi[2] ; i+=1)\
+									{\
+										int j = lo[1];\
+										for ( ; j <= hi[1] ; j+=1)\
+										{\
+											int k = lo[0];\
+											for ( ; k <= hi[0] ; k+=n_pt)\
+											{
+
+// GET_GRID_M: local grid s of grid (the expansion already ends with ';')
+#define GET_GRID_M(grid)	grid.get_loc_grid(s);
+
+// END_LOOP_M: advance the iterator and close the loops opened by ITERATE_3D_M and WHILE_M
+#define END_LOOP_M(n_pt) 					it.private_sum<n_pt>();\
+								}\
+								it.private_adjust( - k + sx + lo[0]);\
+							}\
+							it.private_adjust(- j*sx + sxsy + lo[1]*sx);\
+						}\
+					}
+
 #endif /* SRC_GRID_GRID_DIST_ID_ITERATOR_SUB_HPP_ */
diff --git a/src/Grid/Iterators/grid_dist_id_iterators_unit_tests.hpp b/src/Grid/Iterators/grid_dist_id_iterators_unit_tests.hpp
index bab53f28483e171d92533524b29b1e1d00c84501..0c62f40e589ca8ca8555a16be9d72ff83ffc4440 100644
--- a/src/Grid/Iterators/grid_dist_id_iterators_unit_tests.hpp
+++ b/src/Grid/Iterators/grid_dist_id_iterators_unit_tests.hpp
@@ -46,7 +46,7 @@ void Test2D_sub(const Box<2,float> & domain, long int k)
 		Ghost<2,float> g(0.01 / factor);
 
 		// Distributed grid with id decomposition
-		grid_dist_id<2, float, scalar<float>> g_dist(sz,domain,g);
+		grid_dist_id<2, float, aggregate<float>> g_dist(sz,domain,g);
 
 		// check the consistency of the decomposition
 		bool val = g_dist.getDecomposition().check_consistency();
@@ -320,7 +320,7 @@ void Test3D_stencil(const Box<3,float> & domain, long int k)
 			Ghost<3,long int> g(1);
 
 			// Distributed grid with id decomposition
-			grid_dist_id<3, float, aggregate<long int>, CartDecomposition<3,float>> g_dist(sz,domain,g);
+			grid_dist_id<3, float, aggregate<long int, long int, double>, CartDecomposition<3,float>> g_dist(sz,domain,g);
 
 			// fill the grid with values
 
@@ -382,6 +382,97 @@ void Test3D_stencil(const Box<3,float> & domain, long int k)
 	}
 }
 
+void Test3D_fast_vect(const Box<3,float> & domain, long int k)
+{
+	grid_key_dx<3> star_stencil_3D[7] = {{0,0,0},
+	                                         {0,0,-1},
+											 {0,0,1},
+											 {0,-1,0},
+											 {0,1,0},
+											 {-1,0,0},
+											 {1,0,0}};
+
+	{
+		Vcluster & v_cl = create_vcluster();
+
+		if ( v_cl.getProcessingUnits() > 32 )
+			return;
+
+		long int big_step = k / 30;
+		big_step = (big_step == 0)?1:big_step;
+		long int small_step = 21;
+
+		print_test( "Testing grid 3D fast stencil k<=",k);
+
+		// 3D test
+		for ( ; k >= 2 ; k-= (k > 2*big_step)?big_step:small_step )
+		{
+			BOOST_TEST_CHECKPOINT( "Testing grid 3D fast stencil k<=" << k );
+
+			// grid size
+			size_t sz[3];
+			sz[0] = k;
+			sz[1] = k;
+			sz[2] = k;
+
+			if (k <= 9)
+				continue;
+
+			Ghost<3,long int> g(1);
+
+			// Distributed grid with id decomposition
+			grid_dist_id<3, float, aggregate<long int>, CartDecomposition<3,float>> g_dist(sz,domain,g);
+
+			// fill the grid (domain + ghost) with x^2 + y^2 + z^2 of the global key
+
+			auto it = g_dist.getDomainGhostIterator();
+
+			while (it.isNext())
+			{
+				auto p = it.get();
+				auto gkey = it.getGKey(p);
+
+				g_dist.template get<0>(p) = gkey.get(0)*gkey.get(0) + gkey.get(1)*gkey.get(1) + gkey.get(2)*gkey.get(2);
+
+				++it;
+			}
+
+			g_dist.ghost_get<0>();
+
+			// accumulate the result of every stencil check
+			bool ret = true;
+
+			WHILE_M(g_dist,star_stencil_3D)
+				auto & gstl = GET_GRID_M(g_dist);
+			ITERATE_3D_M(1)
+				// center point
+				auto Cp = it.getStencil<0>();
+
+				// plus,minus X,Y,Z
+				auto mx = it.getStencil<1>();
+				auto px = it.getStencil<2>();
+				auto my = it.getStencil<3>();
+				auto py = it.getStencil<4>();
+				auto mz = it.getStencil<5>();
+				auto pz = it.getStencil<6>();
+
+				long int sum = -6*gstl.template get<0>(Cp) +
+						     gstl.template get<0>(mx) +
+							 gstl.template get<0>(px) +
+							 gstl.template get<0>(my) +
+							 gstl.template get<0>(py) +
+							 gstl.template get<0>(mz) +
+							 gstl.template get<0>(pz);
+
+				// the discrete laplacian of x^2 + y^2 + z^2 is 6 on every point
+				ret &= (sum == 6);
+
+			END_LOOP_M(1)
+
+			BOOST_REQUIRE_EQUAL(ret,true);
+		}
+
+	}
+}
+
 // Test decomposition grid iterator
 
 void Test3D_decskinit(const Box<3,float> & domain, long int k)
@@ -517,6 +608,16 @@ BOOST_AUTO_TEST_CASE( grid_dist_it_iterators_skin_test )
 	Test3D_decskinit(domain3,k);
 }
 
+BOOST_AUTO_TEST_CASE( grid_dist_it_iterators_3D_fast )
+{
+	// Unit cube domain
+	Box<3,float> domain3({0.0,0.0,0.0},{1.0,1.0,1.0});
+
+	size_t k = 128*128*128*create_vcluster().getProcessingUnits();	// 128^3 grid points for every processing unit
+	k = std::pow(k, 1/3.);	// cube root = points per side; NOTE(review): the conversion back to size_t truncates, so a pow() result just below an integer loses one point - confirm this is acceptable
+	Test3D_fast_vect(domain3,k);
+}
+
 BOOST_AUTO_TEST_SUITE_END()
 
 #endif /* SRC_GRID_ITERATORS_GRID_DIST_ID_ITERATORS_UNIT_TESTS_HPP_ */
diff --git a/src/Grid/grid_dist_id.hpp b/src/Grid/grid_dist_id.hpp
index 49cb6b5866a30a328df89163530d324dac12db2d..fab6c7e6d30c14bc0de2f34144a1137bb96c43d2 100644
--- a/src/Grid/grid_dist_id.hpp
+++ b/src/Grid/grid_dist_id.hpp
@@ -74,6 +74,9 @@ class grid_dist_id : public grid_dist_id_comm<dim,St,T,Decomposition,Memory,devi
 	//! Ghost expansion
 	Ghost<dim,St> ghost;
 
+	//! Ghost expansion in grid units (INVALID_GHOST when the ghost is given in continuum space)
+	Ghost<dim,long int> ghost_int;
+
 	//! Local grids
 	mutable openfpm::vector<device_grid> loc_grid;
 
@@ -192,6 +195,53 @@ class grid_dist_id : public grid_dist_id_comm<dim,St,T,Decomposition,Memory,devi
 		return flp;
 	}
 
+	/*! \brief Recompute an internal ghost box in grid units to remove round-off excess
+	 *
+	 * The decomposition works in continuum space and a discrete ghost is converted
+	 *  to continuum, so round-off can produce an internal ghost box bigger than
+	 *  the discrete ghost selected. Here ib is recomputed using integer boxes only.
+	 *
+	 * \param sub_domain the sub-domain (grid units)
+	 * \param sub_domain_other the other sub-domain (continuum space, converted here to grid units)
+	 * \param cmb sector; on every axis where it is 1 / -1 the other sub-domain is shifted by -/+ ginfo.size(i)
+	 * \param ib internal ghost box to adjust; its high is set to low - 1 on every axis when Intersect() returns false
+	 * \param g ghost in grid units; when it is the invalid ghost nothing is done
+	 */
+	void set_for_adjustment(const Box<dim,long int> & sub_domain,
+							const Box<dim,St> & sub_domain_other,
+							const comb<dim> & cmb,
+							Box<dim,long int> & ib,
+							Ghost<dim,long int> & g)
+	{
+		if (g.isInvalidGhost() == true)
+		{return;}
+
+		// Convert the other sub-domain from continuum space into grid units
+		Box<dim,long int> sub_domain_other_exp = cd_sm.convertDomainSpaceIntoGridUnits(sub_domain_other,dec.periodicity());
+
+		// shift the converted box by ginfo.size(i) on every axis where cmb is not zero
+		for (size_t i = 0 ; i < dim ; i++)
+		{
+			if (cmb.c[i] == 1)
+			{
+				sub_domain_other_exp.setLow(i,sub_domain_other_exp.getLow(i) - ginfo.size(i));
+				sub_domain_other_exp.setHigh(i,sub_domain_other_exp.getHigh(i) - ginfo.size(i));
+			}
+			else if (cmb.c[i] == -1)
+			{
+				sub_domain_other_exp.setLow(i,sub_domain_other_exp.getLow(i) + ginfo.size(i));
+				sub_domain_other_exp.setHigh(i,sub_domain_other_exp.getHigh(i) + ginfo.size(i));
+			}
+		}
+
+		sub_domain_other_exp.enlarge(g);	// grow the shifted box by the discrete ghost before intersecting
+		if (sub_domain_other_exp.Intersect(sub_domain,ib) == false)
+		{
+			for (size_t i = 0 ; i < dim ; i++)
+			{ib.setHigh(i,ib.getLow(i) - 1);}
+		}
+	}
+
 	/*! \brief Create per-processor internal ghost boxes list in grid units and g_id_to_external_ghost_box
 	 *
 	 */
@@ -221,6 +271,21 @@ class grid_dist_id : public grid_dist_id_comm<dim,St,T,Decomposition,Memory,devi
 				if (ib.isValid() == false)
 					continue;
 
+				size_t sub_id = dec.getProcessorIGhostSub(i,j);
+				size_t r_sub = dec.getProcessorIGhostSSub(i,j);
+
+				auto & n_box = dec.getNearSubdomains(dec.IDtoProc(i));
+
+				Box<dim,long int> sub = gdb_ext.get(sub_id).Dbox;
+				sub += gdb_ext.get(sub_id).origin;
+
+				set_for_adjustment(sub,
+						           n_box.get(r_sub),dec.getProcessorIGhostPos(i,j),
+								   ib,ghost_int);
+
+				if (ib.isValid() == false)
+					continue;
+
 				// save the box and the sub-domain id (it is calculated as the linearization of P1)
 				::Box<dim,size_t> cvt = ib;
 
@@ -337,6 +402,19 @@ class grid_dist_id : public grid_dist_id_comm<dim,St,T,Decomposition,Memory,devi
 				::Box<dim,St> ib_dom = dec.getLocalIGhostBox(i,j);
 				::Box<dim,long int> ib = cd_sm.convertDomainSpaceIntoGridUnits(ib_dom,dec.periodicity());
 
+				// Check if ib is valid if not it mean that the internal ghost does not contain information so skip it
+				if (ib.isValid() == false)
+					continue;
+
+				size_t sub_id = i;
+				size_t r_sub = dec.getLocalIGhostSub(i,j);
+
+				Box<dim,long int> sub = gdb_ext.get(sub_id).Dbox;
+				sub += gdb_ext.get(sub_id).origin;
+
+				set_for_adjustment(sub,dec.getSubDomain(r_sub),
+						           dec.getLocalIGhostPos(i,j),ib,ghost_int);
+
 				// Check if ib is valid if not it mean that the internal ghost does not contain information so skip it
 				if (ib.isValid() == false)
 					continue;
@@ -524,18 +602,6 @@ class grid_dist_id : public grid_dist_id_comm<dim,St,T,Decomposition,Memory,devi
 
 protected:
 
-	/*! \brief Get the point where it start the origin of the grid of the sub-domain i
-	 *
-	 * \param i sub-domain
-	 *
-	 * \return the point
-	 *
-	 */
-	Point<dim,St> getOffset(size_t i)
-	{
-		return pmul(Point<dim,St>(gdb_ext.get(i).origin), cd_sm.getCellBox().getP2());
-	}
-
 	/*! \brief Given a local sub-domain i with a local grid Domain + ghost return the part of the local grid that is domain
 	 *
 	 * \param i sub-domain
@@ -609,6 +675,18 @@ public:
 		return domain;
 	}
 
+	/*! \brief Get the point where the grid of the sub-domain i starts
+	 *
+	 * \param i sub-domain
+	 *
+	 * \return the pmul() product of gdb_ext.get(i).origin and cd_sm.getCellBox().getP2(),
+	 *         plus the lower corner getDomain().getP1() of the domain
+	 */
+	Point<dim,St> getOffset(size_t i)
+	{
+		return pmul(Point<dim,St>(gdb_ext.get(i).origin), cd_sm.getCellBox().getP2()) + getDomain().getP1();
+	}
+
     /*! \brief Get the spacing of the grid in direction i
      *
      * \param i dimension
@@ -653,8 +731,11 @@ public:
 	 * \param ext extension of the grid (must be positive on every direction)
 	 *
 	 */
-	template<typename H> grid_dist_id(const grid_dist_id<dim,St,H,typename Decomposition::base_type,Memory,grid_cpu<dim,H>> & g, const Ghost<dim,long int> & gh, Box<dim,size_t> ext)
-	:dec(create_vcluster()),v_cl(create_vcluster())
+	template<typename H>
+	grid_dist_id(const grid_dist_id<dim,St,H,typename Decomposition::base_type,Memory,grid_cpu<dim,H>> & g,
+			     const Ghost<dim,long int> & gh,
+				 Box<dim,size_t> ext)
+	:ghost_int(gh),dec(create_vcluster()),v_cl(create_vcluster())
 	{
 #ifdef SE_CLASS2
 		check_new(this,8,GRID_DIST_EVENT,4);
@@ -701,8 +782,11 @@ public:
      * \param ghost Ghost part
      *
      */
-    grid_dist_id(const Decomposition & dec, const size_t (& g_sz)[dim], const Ghost<dim,St> & ghost)
-    :domain(dec.getDomain()),ghost(ghost),dec(dec),v_cl(create_vcluster()),ginfo(g_sz),ginfo_v(g_sz)
+    grid_dist_id(const Decomposition & dec,
+    		     const size_t (& g_sz)[dim],
+				 const Ghost<dim,St> & ghost)
+    :domain(dec.getDomain()),ghost(ghost),ghost_int(INVALID_GHOST),dec(dec),v_cl(create_vcluster()),
+	 ginfo(g_sz),ginfo_v(g_sz)
 	{
 #ifdef SE_CLASS2
 		check_new(this,8,GRID_DIST_EVENT,4);
@@ -719,8 +803,10 @@ public:
      * \param ghost Ghost part
      *
      */
-    grid_dist_id(Decomposition && dec, const size_t (& g_sz)[dim], const Ghost<dim,St> & ghost)
-    :domain(dec.getDomain()),ghost(ghost),dec(dec),ginfo(g_sz),ginfo_v(g_sz),v_cl(create_vcluster())
+    grid_dist_id(Decomposition && dec, const size_t (& g_sz)[dim],
+    		     const Ghost<dim,St> & ghost)
+    :domain(dec.getDomain()),ghost(ghost),dec(dec),ginfo(g_sz),
+	 ginfo_v(g_sz),v_cl(create_vcluster()),ghost_int(INVALID_GHOST)
 	{
 #ifdef SE_CLASS2
 		check_new(this,8,GRID_DIST_EVENT,4);
@@ -739,8 +825,10 @@ public:
      * \warning In very rare case the ghost part can be one point bigger than the one specified
      *
      */
-	grid_dist_id(const Decomposition & dec, const size_t (& g_sz)[dim], const Ghost<dim,long int> & g)
-	:domain(dec.getDomain()),dec(create_vcluster()),v_cl(create_vcluster()),ginfo(g_sz),ginfo_v(g_sz)
+	grid_dist_id(const Decomposition & dec, const size_t (& g_sz)[dim],
+			     const Ghost<dim,long int> & g)
+	:domain(dec.getDomain()),ghost_int(g),dec(create_vcluster()),v_cl(create_vcluster()),
+	 ginfo(g_sz),ginfo_v(g_sz)
 	{
 #ifdef SE_CLASS2
 		check_new(this,8,GRID_DIST_EVENT,4);
@@ -764,8 +852,10 @@ public:
      * \warning In very rare case the ghost part can be one point bigger than the one specified
      *
      */
-	grid_dist_id(Decomposition && dec, const size_t (& g_sz)[dim], const Ghost<dim,long int> & g)
-	:domain(dec.getDomain()),dec(dec),v_cl(create_vcluster()),ginfo(g_sz),ginfo_v(g_sz)
+	grid_dist_id(Decomposition && dec, const size_t (& g_sz)[dim],
+			     const Ghost<dim,long int> & g)
+	:domain(dec.getDomain()),dec(dec),v_cl(create_vcluster()),ginfo(g_sz),
+	 ginfo_v(g_sz),ghost_int(g)
 	{
 #ifdef SE_CLASS2
 		check_new(this,8,GRID_DIST_EVENT,4);
@@ -787,7 +877,8 @@ public:
      * \warning In very rare case the ghost part can be one point bigger than the one specified
      *
      */
-	grid_dist_id(const size_t (& g_sz)[dim],const Box<dim,St> & domain, const Ghost<dim,St> & g)
+	grid_dist_id(const size_t (& g_sz)[dim],const Box<dim,St> & domain,
+			     const Ghost<dim,St> & g)
 	:grid_dist_id(g_sz,domain,g,create_non_periodic<dim>())
 	{
 	}
@@ -816,8 +907,10 @@ public:
      * \warning In very rare case the ghost part can be one point bigger than the one specified
      *
      */
-	grid_dist_id(const size_t (& g_sz)[dim],const Box<dim,St> & domain, const Ghost<dim,St> & g, const periodicity<dim> & p)
-	:domain(domain),ghost(g),dec(create_vcluster()),v_cl(create_vcluster()),ginfo(g_sz),ginfo_v(g_sz)
+	grid_dist_id(const size_t (& g_sz)[dim],const Box<dim,St> & domain,
+			     const Ghost<dim,St> & g, const periodicity<dim> & p)
+	:domain(domain),ghost(g),ghost_int(INVALID_GHOST),dec(create_vcluster()),v_cl(create_vcluster()),
+	 ginfo(g_sz),ginfo_v(g_sz)
 	{
 #ifdef SE_CLASS2
 		check_new(this,8,GRID_DIST_EVENT,4);
@@ -838,8 +931,10 @@ public:
      * \warning In very rare case the ghost part can be one point bigger than the one specified
      *
      */
-	grid_dist_id(const size_t (& g_sz)[dim],const Box<dim,St> & domain, const Ghost<dim,long int> & g, const periodicity<dim> & p)
-	:domain(domain),dec(create_vcluster()),v_cl(create_vcluster()),ginfo(g_sz),ginfo_v(g_sz)
+	grid_dist_id(const size_t (& g_sz)[dim],const Box<dim,St> & domain,
+			     const Ghost<dim,long int> & g, const periodicity<dim> & p)
+	:domain(domain),ghost_int(g),dec(create_vcluster()),v_cl(create_vcluster()),ginfo(g_sz),
+	 ginfo_v(g_sz)
 	{
 #ifdef SE_CLASS2
 		check_new(this,8,GRID_DIST_EVENT,4);
@@ -1199,7 +1294,7 @@ public:
 	 * \return the selected element
 	 *
 	 */
-	template <unsigned int p>inline auto get(const grid_dist_key_dx<dim> & v1) const -> typename std::add_lvalue_reference<decltype(loc_grid.get(v1.getSub()).template get<p>(v1.getKey()))>::type
+	template <unsigned int p = 0>inline auto get(const grid_dist_key_dx<dim> & v1) const -> typename std::add_lvalue_reference<decltype(loc_grid.get(v1.getSub()).template get<p>(v1.getKey()))>::type
 	{
 #ifdef SE_CLASS2
 		check_valid(this,8);
@@ -1215,7 +1310,7 @@ public:
 	 * \return the selected element
 	 *
 	 */
-	template <unsigned int p>inline auto get(const grid_dist_key_dx<dim> & v1) -> typename std::add_lvalue_reference<decltype(loc_grid.get(v1.getSub()).template get<p>(v1.getKey()))>::type
+	template <unsigned int p = 0>inline auto get(const grid_dist_key_dx<dim> & v1) -> typename std::add_lvalue_reference<decltype(loc_grid.get(v1.getSub()).template get<p>(v1.getKey()))>::type
 	{
 #ifdef SE_CLASS2
 		check_valid(this,8);
@@ -1231,7 +1326,39 @@ public:
 	 * \return the selected element
 	 *
 	 */
-	template <unsigned int p>inline auto get(const grid_dist_lin_dx & v1) const -> typename std::add_lvalue_reference<decltype(loc_grid.get(v1.getSub()).template get<p>(v1.getKey()))>::type
+	template <unsigned int p = 0>inline auto get(grid_dist_g_dx<device_grid> & v1) const -> typename std::add_lvalue_reference<decltype(v1.getSub()->template get<p>(v1.getKey()))>::type
+	{
+#ifdef SE_CLASS2
+		check_valid(this,8);
+#endif
+		return v1.getSub()->template get<p>(v1.getKey());	// v1.getSub() is dereferenced directly, loc_grid is not indexed
+	}
+
+	/*! \brief Get the reference of the selected element
+	 *
+	 * \tparam p property to get (is an integer)
+	 * \param v1 key whose getSub() is dereferenced to reach the grid and whose getKey() selects the element in it
+	 *
+	 * \return the selected element
+	 *
+	 */
+	template <unsigned int p = 0>inline auto get(grid_dist_g_dx<device_grid> & v1) -> typename std::add_lvalue_reference<decltype(v1.getSub()->template get<p>(v1.getKey()))>::type
+	{
+#ifdef SE_CLASS2
+		check_valid(this,8);
+#endif
+		return v1.getSub()->template get<p>(v1.getKey());
+	}
+
+	/*! \brief Get the reference of the selected element
+	 *
+	 * \tparam p property to get (is an integer)
+	 * \param v1 grid_key that identify the element in the grid
+	 *
+	 * \return the selected element
+	 *
+	 */
+	template <unsigned int p = 0>inline auto get(const grid_dist_lin_dx & v1) const -> typename std::add_lvalue_reference<decltype(loc_grid.get(v1.getSub()).template get<p>(v1.getKey()))>::type
 	{
 #ifdef SE_CLASS2
 		check_valid(this,8);
@@ -1247,7 +1374,7 @@ public:
 	 * \return the selected element
 	 *
 	 */
-	template <unsigned int p>inline auto get(const grid_dist_lin_dx & v1) -> typename std::add_lvalue_reference<decltype(loc_grid.get(v1.getSub()).template get<p>(v1.getKey()))>::type
+	template <unsigned int p = 0>inline auto get(const grid_dist_lin_dx & v1) -> typename std::add_lvalue_reference<decltype(loc_grid.get(v1.getSub()).template get<p>(v1.getKey()))>::type
 	{
 #ifdef SE_CLASS2
 		check_valid(this,8);
@@ -1263,7 +1390,7 @@ public:
 	 * \return the selected element
 	 *
 	 */
-	template <unsigned int p>inline auto getProp(const grid_dist_key_dx<dim> & v1) const -> decltype(this->template get<p>(v1))
+	template <unsigned int p = 0>inline auto getProp(const grid_dist_key_dx<dim> & v1) const -> decltype(this->template get<p>(v1))
 	{
 		return this->template get<p>(v1);
 	}
@@ -1276,7 +1403,7 @@ public:
 	 * \return the selected element
 	 *
 	 */
-	template <unsigned int p>inline auto getProp(const grid_dist_key_dx<dim> & v1) -> decltype(this->template get<p>(v1))
+	template <unsigned int p = 0>inline auto getProp(const grid_dist_key_dx<dim> & v1) -> decltype(this->template get<p>(v1))
 	{
 		return this->template get<p>(v1);
 	}
@@ -1366,6 +1493,7 @@ public:
 																						  	  	  	 g_id_to_internal_ghost_box);
 	}
 
+
 	/*! \brief Copy the give grid into this grid
 	 *
 	 * It copy the first grid into the given grid (No ghost)
@@ -1373,21 +1501,52 @@ public:
 	 * \warning the Decomposition must be ensured to be the same, otherwise crashes can happen, if you want to copy the grid independently from the decomposition please use the operator equal
 	 *
 	 * \param g Grid to copy
+	 * \param use_memcpy if true and T::noPointers() is true each local grid is copied with one memcpy, otherwise element by element
 	 *
 	 * \return itself
 	 *
 	 */
-	grid_dist_id<dim,St,T,Decomposition,Memory,device_grid> & copy(grid_dist_id<dim,St,T,Decomposition,Memory,device_grid> & g)
+	grid_dist_id<dim,St,T,Decomposition,Memory,device_grid> & copy(grid_dist_id<dim,St,T,Decomposition,Memory,device_grid> & g, bool use_memcpy = true)
 	{
-		auto it = this->getDomainIterator();
+		if (T::noPointers() == true && use_memcpy)
+		{
+			for (size_t i = 0 ; i < this->getN_loc_grid() ; i++)
+			{
+				auto & gs_src = this->get_loc_grid(i).getGrid();
+
+				long int start = gs_src.LinId(gdb_ext.get(i).Dbox.getKP1());
+				long int stop = gs_src.LinId(gdb_ext.get(i).Dbox.getKP2());
+
+				if (stop < start) {continue;}
+
+				void * dst = static_cast<void *>(static_cast<char *>(this->get_loc_grid(i).getPointer()) + start*sizeof(T));
+				void * src = static_cast<void *>(static_cast<char *>(g.get_loc_grid(i).getPointer()) + start*sizeof(T));
 
-		while (it.isNext())
+				memcpy(dst,src,sizeof(T) * (stop + 1 - start));
+			}
+		}
+		else
 		{
-			auto key = it.get();
+			grid_key_dx<dim> cnt[1];
+			cnt[0].zero();
+
+			for (size_t i = 0 ; i < this->getN_loc_grid() ; i++)
+			{
+				auto & dst = this->get_loc_grid(i);
+				auto & src = g.get_loc_grid(i);
+
+				auto it = this->get_loc_grid_iterator_stencil(i,cnt);
 
-			this->loc_grid.get(key.getSub()).get_o(key.getKey()) = g.loc_grid.get(key.getSub()).get_o(key.getKey());
+				while (it.isNext())
+				{
+					// center point
+					auto Cp = it.template getStencil<0>();
 
-			++it;
+					dst.get_o(Cp) = src.get_o(Cp);
+
+					++it;
+				}
+			}
 		}
 
 		return *this;
@@ -1510,6 +1669,36 @@ public:
 		return loc_grid.get(i);
 	}
 
+	/*! \brief Get an iterator over the domain box (Dbox) of the local grid i
+	 *
+	 * \param i sub-domain
+	 *
+	 * \return iterator on the local grid i running from Dbox.getKP1() to Dbox.getKP2()
+	 *
+	 */
+	grid_key_dx_iterator_sub<dim,no_stencil> get_loc_grid_iterator(size_t i)
+	{
+		return grid_key_dx_iterator_sub<dim,no_stencil>(loc_grid.get(i).getGrid(),
+				 gdb_ext.get(i).Dbox.getKP1(),
+				 gdb_ext.get(i).Dbox.getKP2());
+	}
+
+	/*! \brief Get a stencil iterator over the domain box (Dbox) of the local grid i
+	 *
+	 * \tparam Np number of stencil points
+	 * \param i sub-domain
+	 * \param stencil_pnt the Np stencil points handed to the iterator
+	 * \return iterator on the local grid i running from Dbox.getKP1() to Dbox.getKP2()
+	 */
+	template<unsigned int Np>
+	grid_key_dx_iterator_sub<dim,stencil_offset_compute<dim,Np>> get_loc_grid_iterator_stencil(size_t i,const grid_key_dx<dim> (& stencil_pnt)[Np])
+	{
+		return grid_key_dx_iterator_sub<dim,stencil_offset_compute<dim,Np>>(loc_grid.get(i).getGrid(),
+													 gdb_ext.get(i).Dbox.getKP1(),
+													 gdb_ext.get(i).Dbox.getKP2(),
+													 stencil_pnt);
+	}
+
 	/*! \brief Return the number of local grid
 	 *
 	 * \return the number of local grid
@@ -1614,6 +1803,26 @@ public:
 		map();
 	}
 
+	/*! \brief Get the local internal ghost boxes
+	 *
+	 * \return a constant reference to the member loc_ig_box
+	 *
+	 */
+	const openfpm::vector<i_lbox_grid<dim>> & get_loc_ig_box()
+	{
+		return this->loc_ig_box;
+	}
+
+	/*! \brief Get the internal ghost boxes
+	 *
+	 * \return a constant reference to the member ig_box
+	 * NOTE(review): ig_box is passed around as openfpm::vector<ip_box_grid<dim>> in the ghost communication code - confirm the i_lbox_grid return type
+	 */
+	const openfpm::vector<i_lbox_grid<dim>> & get_ig_box()
+	{
+		return this->ig_box;
+	}
+
 	//! Define friend classes
 	//\cond
 	friend grid_dist_id<dim,St,T,typename Decomposition::extended_type,Memory,device_grid>;
diff --git a/src/Grid/grid_dist_id_HDF5_chckpnt_restart_test.hpp b/src/Grid/grid_dist_id_HDF5_chckpnt_restart_test.hpp
index fd63918ace57d8f4a9f4f65bda40651e83fec643..75a1878eae3cff13d368cedbc7b4797236c7e62a 100644
--- a/src/Grid/grid_dist_id_HDF5_chckpnt_restart_test.hpp
+++ b/src/Grid/grid_dist_id_HDF5_chckpnt_restart_test.hpp
@@ -38,7 +38,7 @@ BOOST_AUTO_TEST_CASE( grid_dist_id_hdf5_save_test )
 	Ghost<2,float> g(ghost_part);
 
 	// Distributed grid with id decomposition
-	grid_dist_id<2, float, scalar<float>, CartDecomposition<2,float>> g_dist(sz,domain,g);
+	grid_dist_id<2, float, aggregate<float>, CartDecomposition<2,float>> g_dist(sz,domain,g);
 
 	// get the decomposition
 	auto & dec = g_dist.getDecomposition();
@@ -71,12 +71,12 @@ BOOST_AUTO_TEST_CASE( grid_dist_id_hdf5_save_test )
 	size_t sum = 0;
 
 	for (size_t i = 0; i < count_total.size(); i++)
-		sum += count_total.get(i);
+	{sum += count_total.get(i);}
 
 	timer t;
 	t.start();
 	// Save the grid
-    g_dist.save("grid_dist_id.h5");
+    g_dist.save("grid_dist_id.h5" + std::to_string(v_cl.getProcessingUnits()));
 	t.stop();
 }
 
@@ -106,24 +106,15 @@ BOOST_AUTO_TEST_CASE( grid_dist_id_hdf5_load_test )
 	Ghost<2,float> g(ghost_part);
 
 	// Distributed grid with id decomposition
-	grid_dist_id<2, float, scalar<float>, CartDecomposition<2,float>> g_dist(sz,domain,g);
-
-	g_dist.getDecomposition().write("Before_load_grid_decomposition");
-	g_dist.write("Before_Loaded_grid");
-
-	timer t;
-	t.start();
-	// Save the grid
-	g_dist.load("grid_dist_id.h5");
-	t.stop();
+	grid_dist_id<2, float, aggregate<float>, CartDecomposition<2,float>> g_dist(sz,domain,g);
 
-	g_dist.write("Loaded_grid");
-	g_dist.getDecomposition().write("Loaded_grid_decomposition");
+	g_dist.load("grid_dist_id.h5" + std::to_string(v_cl.getProcessingUnits()));
 
 	auto it = g_dist.getDomainIterator();
 
 	size_t count = 0;
 
+	bool match = true;
 	while (it.isNext())
 	{
 		//key
@@ -134,7 +125,7 @@ BOOST_AUTO_TEST_CASE( grid_dist_id_hdf5_load_test )
 
 		auto keyg = g_dist.getGKey(key);
 
-		BOOST_REQUIRE_EQUAL(g_dist.template get<0>(key), keyg.get(0));
+		match &= g_dist.template get<0>(key) == keyg.get(0);
 
 		++it;
 		count++;
@@ -150,6 +141,7 @@ BOOST_AUTO_TEST_CASE( grid_dist_id_hdf5_load_test )
 		sum += count_total.get(i);
 
 	BOOST_REQUIRE_EQUAL(sum, (size_t)k*k);
+	BOOST_REQUIRE_EQUAL(match,true);
 }
 
 
diff --git a/src/Grid/grid_dist_id_comm.hpp b/src/Grid/grid_dist_id_comm.hpp
index b41d03de4c39d92d105726aab9c122c203b3199b..2b894424811880276482f5aefdd99f3fd4643185 100644
--- a/src/Grid/grid_dist_id_comm.hpp
+++ b/src/Grid/grid_dist_id_comm.hpp
@@ -9,7 +9,7 @@
 #define SRC_GRID_GRID_DIST_ID_COMM_HPP_
 
 #include "Vector/vector_dist_ofb.hpp"
-#include "data_type/scalar.hpp"
+#include "Grid/copy_grid_fast.hpp"
 
 /*! \brief Unpack selector
  *
@@ -88,7 +88,12 @@ struct grid_unpack_selector_with_prp<true,T,device_grid,Memory>
 
 		while (sub2.isNext())
 		{
-			object_s_di_op<op,decltype(gs.get_o(it_src.get())),decltype(gd.get_o(sub2.get())),OBJ_ENCAP,prp...>(gs.get_o(it_src.get()),gd.get_o(sub2.get()));
+			object_s_di_op<op,
+			            decltype(gs.get_o(it_src.get())),
+						decltype(gd.get_o(sub2.get())),
+						OBJ_ENCAP,prp...>
+			(gs.get_o(it_src.get()),
+			 gd.get_o(sub2.get()));
 
 			++sub2;
 			++it_src;
@@ -213,6 +218,9 @@ class grid_dist_id_comm
 											  openfpm::vector<device_grid> & loc_grid,
 											  std::unordered_map<size_t,size_t> & g_id_to_external_ghost_box)
 	{
+		grid_key_dx<dim> cnt[1];
+		cnt[0].zero();
+
 		//! For all the sub-domains
 		for (size_t i = 0 ; i < loc_ig_box.size() ; i++)
 		{
@@ -239,30 +247,39 @@ class grid_dist_id_comm
 				if (bx_dst.isValid() == false)
 					continue;
 
-				grid_key_dx_iterator_sub<dim> sub_src(loc_grid.get(i).getGrid(),bx_src.getKP1(),bx_src.getKP2());
-				grid_key_dx_iterator_sub<dim> sub_dst(loc_grid.get(sub_id_dst).getGrid(),bx_dst.getKP1(),bx_dst.getKP2());
+				const auto & gs = loc_grid.get(i);
+				auto & gd = loc_grid.get(sub_id_dst);
 
 #ifdef SE_CLASS1
 
 				if (loc_eg_box.get(sub_id_dst).bid.get(k).sub != i)
-					std::cerr << "Error " << __FILE__ << ":" << __LINE__ << " source and destination are not correctly linked" << "\n";
+				{std::cerr << "Error " << __FILE__ << ":" << __LINE__ << " source and destination are not correctly linked" << "\n";}
 
-				if (sub_src.getVolume() != sub_dst.getVolume())
-					std::cerr << "Error " << __FILE__ << ":" << __LINE__ << " source and destination does not match in size" << "\n";
+				if (bx_src.getVolumeKey() != bx_dst.getVolumeKey())
+				{std::cerr << "Error " << __FILE__ << ":" << __LINE__ << " source and destination does not match in size" << "\n";}
 
-#endif
+				auto bxs = gs.getGrid().getBoxKey();
+				auto bxd = gd.getGrid().getBoxKey();
 
-				const auto & gs = loc_grid.get(i);
-				auto & gd = loc_grid.get(sub_id_dst);
+				if (bxs.isContained(bx_src) == false)
+				{std::cerr << "Error " << __FILE__ << ":" << __LINE__ << " the source box is out of bound of the local grid" << "\n";}
 
-				while (sub_src.isNext())
-				{
-					// Option 1
-					gd.set(sub_dst.get(),gs,sub_src.get());
+				if (bxd.isContained(bx_dst) == false)
+				{std::cerr << "Error " << __FILE__ << ":" << __LINE__ << " the destination box is out of bound of the local grid" << "\n";}
 
-					++sub_src;
-					++sub_dst;
-				}
+#endif
+
+				typedef typename std::remove_reference<decltype(gd)>::type grid_cp;
+				typedef typename std::remove_reference<decltype(loc_grid.get(i).getGrid())>::type grid_info_cp;
+
+				copy_grid_fast<!is_contiguos<prp...>::type::value || has_pack_gen<typename device_grid::value_type>::value,
+							   dim,
+							   grid_cp,
+							   grid_info_cp>::copy(loc_grid.get(i).getGrid(),
+						       loc_grid.get(sub_id_dst).getGrid(),
+							   bx_src,
+							   bx_dst,
+							   gs,gd,cnt);
 			}
 		}
 	}
@@ -341,6 +358,187 @@ class grid_dist_id_comm
 		}
 	}
 
+	/*! \brief Pack and send the internal ghost boxes and queue the receives for the external ones
+	 * \param prAlloc_prp,prRecv_prp set to the newly allocated send / receive buffer objects (incRef() already called)
+	 * \param ig_box,eg_box internal (sent) / external (received) ghost boxes, one entry per processor
+	 * \param gdb_ext,loc_grid local grid boxes (their origin is subtracted from the ghost boxes) and the local grids to pack
+	 * \param req incremented by every packing request; the send buffer is resized to this value
+	 */
+	template<int... prp>
+	void send_and_receive_ghost(ExtPreAlloc<Memory> ** prAlloc_prp,
+								ExtPreAlloc<Memory> ** prRecv_prp,
+								const openfpm::vector<ip_box_grid<dim>> & ig_box,
+								const openfpm::vector<ep_box_grid<dim>> & eg_box,
+								const openfpm::vector<GBoxes<device_grid::dims>> & gdb_ext,
+								openfpm::vector<device_grid> & loc_grid,
+								size_t & req)
+	{
+		// Object made of the selected properties prp, its size is used to size the receive buffer
+		typedef object<typename object_creator<typename T::type,prp...>::type> prp_object;
+
+		// First pass: accumulate in req the size needed to pack every valid internal ghost box
+		for ( size_t i = 0 ; i < ig_box.size() ; i++ )
+		{
+			// for each ghost box
+			for (size_t j = 0 ; j < ig_box.get(i).bid.size() ; j++)
+			{
+				// And linked sub-domain
+				size_t sub_id = ig_box.get(i).bid.get(j).sub;
+				// Internal ghost box
+				Box<dim,long int> g_ig_box = ig_box.get(i).bid.get(j).box;
+
+				if (g_ig_box.isValid() == false)
+					continue;
+
+				g_ig_box -= gdb_ext.get(sub_id).origin.template convertPoint<size_t>();
+
+				// Pack a size_t for the internal ghost id
+				Packer<size_t,HeapMemory>::packRequest(req);
+
+				// Create a sub grid iterator spanning the internal ghost layer
+
+				grid_key_dx_iterator_sub<dim> sub_it(loc_grid.get(sub_id).getGrid(),g_ig_box.getKP1(),g_ig_box.getKP2());
+				// and pack the internal ghost grid
+				Packer<device_grid,HeapMemory>::template packRequest<prp...>(loc_grid.get(sub_id),sub_it,req);
+			}
+		}
+
+		// resize the property buffer memory
+		g_send_prp_mem.resize(req);
+
+		// Create an object of preallocated memory for properties
+		(*prAlloc_prp) = new ExtPreAlloc<Memory>(req,g_send_prp_mem);
+		(*prAlloc_prp)->incRef();
+
+		// Pack information
+		Pack_stat sts;
+
+		// Second pass: pack the boxes of each processor and send them
+		for ( size_t i = 0 ; i < ig_box.size() ; i++ )
+		{
+
+			sts.mark();
+			void * pointer = (*prAlloc_prp)->getPointerEnd();
+
+			// for each ghost box
+			for (size_t j = 0 ; j < ig_box.get(i).bid.size() ; j++)
+			{
+				// we pack only if it is valid
+				if (ig_box.get(i).bid.get(j).box.isValid() == false)
+					continue;
+
+				// And linked sub-domain
+				size_t sub_id = ig_box.get(i).bid.get(j).sub;
+				// Internal ghost box
+				Box<dim,size_t> g_ig_box = ig_box.get(i).bid.get(j).box;
+				g_ig_box -= gdb_ext.get(sub_id).origin.template convertPoint<size_t>();
+				// Ghost box global id
+				size_t g_id = ig_box.get(i).bid.get(j).g_id;
+
+				// Pack a size_t for the internal ghost id
+				Packer<size_t,HeapMemory>::pack(**prAlloc_prp,g_id,sts);
+
+				// Create a sub grid iterator spanning the internal ghost layer
+				grid_key_dx_iterator_sub<dim> sub_it(loc_grid.get(sub_id).getGrid(),g_ig_box.getKP1(),g_ig_box.getKP2());
+				// and pack the internal ghost grid
+				Packer<device_grid,HeapMemory>::template pack<prp...>(**prAlloc_prp,loc_grid.get(sub_id),sub_it,sts);
+			}
+			// send the bytes packed for this processor, from pointer up to the new end of the buffer
+
+			void * pointer2 = (*prAlloc_prp)->getPointerEnd();
+
+			v_cl.send(ig_box.get(i).prc,0,pointer,(char *)pointer2 - (char *)pointer);
+		}
+
+		// Number of bytes to receive from each processor
+		std::vector<size_t> prp_recv;
+
+		//! Compute, for each processor, the size of the data to receive
+		for ( size_t i = 0 ; i < eg_box.size() ; i++ )
+		{
+			prp_recv.push_back(0);
+
+			// for each external ghost box
+			for (size_t j = 0 ; j < eg_box.get(i).bid.size() ; j++)
+			{
+				// External ghost box
+				Box<dim,size_t> g_eg_box = eg_box.get(i).bid.get(j).g_e_box;
+				prp_recv[prp_recv.size()-1] += g_eg_box.getVolumeKey() * sizeof(prp_object) + sizeof(size_t);
+			}
+		}
+
+		size_t tot_recv = ExtPreAlloc<Memory>::calculateMem(prp_recv);
+
+		//! Resize the receiving buffer
+		g_recv_prp_mem.resize(tot_recv);
+
+		// Create an object of preallocated memory for properties
+		(*prRecv_prp) = new ExtPreAlloc<Memory>(tot_recv,g_recv_prp_mem);
+		(*prRecv_prp)->incRef();
+
+		// queue the receives
+		for ( size_t i = 0 ; i < eg_box.size() ; i++ )
+		{
+			(*prRecv_prp)->allocate(prp_recv[i]);
+			v_cl.recv(eg_box.get(i).prc,0,(*prRecv_prp)->getPointer(),prp_recv[i]);
+		}
+	}
+
+	/*! \brief Unpack the received ghost boxes into the local grids
+	 * \param prRecv_prp buffer holding the received data
+	 * \param eg_box external ghost boxes; one box is unpacked for every entry of eg_box.get(i).bid
+	 * \param loc_grid,g_id_to_external_ghost_box local grids to fill; map from the unpacked global id to the index in bid
+	 */
+	template<int... prp>
+	void process_received(ExtPreAlloc<Memory> * prRecv_prp,
+			   	   	   	  const openfpm::vector<ep_box_grid<dim>> & eg_box,
+						  openfpm::vector<device_grid> & loc_grid,
+						  std::unordered_map<size_t,size_t> & g_id_to_external_ghost_box)
+	{
+		Unpack_stat ps;
+
+		// Unpack the object
+		for ( size_t i = 0 ; i < eg_box.size() ; i++ )
+		{
+			// for each external ghost box
+			for (size_t j = 0 ; j < eg_box.get(i).bid.size() ; j++)
+			{
+				// Unpack the ghost box global-id
+
+				size_t g_id;
+				Unpacker<size_t,HeapMemory>::unpack(*prRecv_prp,g_id,ps);
+
+				size_t l_id = 0;
+				// convert the global id into local id
+				auto key = g_id_to_external_ghost_box.find(g_id);
+				if (key != g_id_to_external_ghost_box.end()) // FOUND
+					l_id = key->second;
+				else
+				{
+					// NOT FOUND
+
+					// It must always be found; if not, this processor has no idea of
+					// what is stored and consequently does not know how to unpack: print a critical error
+					// and return, leaving the remaining boxes unprocessed
+
+					std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " Critical, cannot unpack object, because received data cannot be interpreted\n";
+
+					return;
+				}
+
+				// Get the external ghost box associated with the packed information
+				Box<dim,size_t> box = eg_box.get(i).bid.get(l_id).l_e_box;
+				size_t sub_id = eg_box.get(i).bid.get(l_id).sub;
+
+				// sub-grid where to unpack
+				grid_key_dx_iterator_sub<dim> sub2(loc_grid.get(sub_id).getGrid(),box.getKP1(),box.getKP2());
+
+				// Unpack
+				Unpacker<device_grid,HeapMemory>::template unpack<prp...>(*prRecv_prp,sub2,loc_grid.get(sub_id),ps);
+			}
+		}
+	}
+
 public:
 
 	/*! \brief Reconstruct the local grids
@@ -370,34 +568,19 @@ public:
 
 				while (it.isNext())
 				{
-					//auto key = it.get();
-
-					//if (g.template get<0>(key) != 1)
-						//std::cout << "WRONG???????" << std::endl;
-
 					++it;
 					count++;
 				}
 
 				SpaceBox<dim,long int> b = m_oGrid_recv.get(a).template get<1>(k);
 
-				//device_grid gr_send(sz);
-				//gr_send.setMemory();
-
-				//std::cout << "B: (" << b.getLow(0) << "; " << b.getLow(1) << "); (" << b.getHigh(0) << "; " << b.getHigh(1) << "); " << "G: (" << g.getGrid().getBox().getHigh(0) << "; " << g.getGrid().getBox().getHigh(1) << ")" << std::endl;
-
-				// Set the dimensions of the local grid
-				//g.resize(l_res);
-
 				Point<dim,St> p;
 				for (size_t n = 0; n < dim; n++)
-					p.get(n) = g.getGrid().getBox().getHigh(n);
-
-				//std::cout << "G after resize: (" << g.getGrid().getBox().getLow(0) << "; " << g.getGrid().getBox().getLow(1) << "); (" << g.getGrid().getBox().getHigh(0) << "; " << g.getGrid().getBox().getHigh(1) << ")" << std::endl;
+				{p.get(n) = g.getGrid().getBox().getHigh(n);}
 
 				Point<dim,St> point;
 				for (size_t n = 0; n < dim; n++)
-					point.get(n) = (b.getHigh(n) + b.getLow(n))/2;
+				{point.get(n) = (b.getHigh(n) + b.getLow(n))/2;}
 
 				for (size_t j = 0; j < gdb_ext.size(); j++)
 				{
@@ -422,7 +605,6 @@ public:
 							std::string str = key.to_string();
 							grid_key_dx<dim> key2 = key - start;
 
-							//std::cout << "Key: " << str << std::endl;
 							loc_grid.get(j).get_o(key) = g.get_o(key2);
 							count2++;
 
@@ -432,7 +614,6 @@ public:
 				}
 			}
 		}
-		//std::cout << "Count after: " << count2 << std::endl;
 	}
 
 	/*! \brief Label intersection grids for mappings
@@ -641,115 +822,13 @@ public:
 										 openfpm::vector<device_grid> & loc_grid,
 										 std::unordered_map<size_t,size_t> & g_id_to_external_ghost_box)
 	{
-		// Sending property object
-		typedef object<typename object_creator<typename T::type,prp...>::type> prp_object;
-
 		size_t req = 0;
 
-		// Create a packing request vector
-		for ( size_t i = 0 ; i < ig_box.size() ; i++ )
-		{
-			// for each ghost box
-			for (size_t j = 0 ; j < ig_box.get(i).bid.size() ; j++)
-			{
-				// And linked sub-domain
-				size_t sub_id = ig_box.get(i).bid.get(j).sub;
-				// Internal ghost box
-				Box<dim,long int> g_ig_box = ig_box.get(i).bid.get(j).box;
-
-				if (g_ig_box.isValid() == false)
-					continue;
-
-				g_ig_box -= gdb_ext.get(sub_id).origin.template convertPoint<size_t>();
-
-				// Pack a size_t for the internal ghost id
-				Packer<size_t,HeapMemory>::packRequest(req);
-				// Create a sub grid iterator spanning the internal ghost layer
-				grid_key_dx_iterator_sub<dim> sub_it(loc_grid.get(sub_id).getGrid(),g_ig_box.getKP1(),g_ig_box.getKP2());
-				// and pack the internal ghost grid
-				Packer<device_grid,HeapMemory>::template packRequest<prp...>(loc_grid.get(sub_id),sub_it,req);
-			}
-		}
-
-		// resize the property buffer memory
-		g_send_prp_mem.resize(req);
+		ExtPreAlloc<Memory> * prRecv_prp = NULL;
+		ExtPreAlloc<Memory> * prAlloc_prp = NULL;
 
-		// Create an object of preallocated memory for properties
-		ExtPreAlloc<Memory> & prAlloc_prp = *(new ExtPreAlloc<Memory>(req,g_send_prp_mem));
-
-		prAlloc_prp.incRef();
-
-		// Pack information
-		Pack_stat sts;
-
-		// Pack the information for each processor and send it
-		for ( size_t i = 0 ; i < ig_box.size() ; i++ )
-		{
-
-			sts.mark();
-			void * pointer = prAlloc_prp.getPointerEnd();
-
-			// for each ghost box
-			for (size_t j = 0 ; j < ig_box.get(i).bid.size() ; j++)
-			{
-				// we pack only if it is valid
-				if (ig_box.get(i).bid.get(j).box.isValid() == false)
-					continue;
-
-				// And linked sub-domain
-				size_t sub_id = ig_box.get(i).bid.get(j).sub;
-				// Internal ghost box
-				Box<dim,size_t> g_ig_box = ig_box.get(i).bid.get(j).box;
-				g_ig_box -= gdb_ext.get(sub_id).origin.template convertPoint<size_t>();
-				// Ghost box global id
-				size_t g_id = ig_box.get(i).bid.get(j).g_id;
-
-				// Pack a size_t for the internal ghost id
-				Packer<size_t,HeapMemory>::pack(prAlloc_prp,g_id,sts);
-				// Create a sub grid iterator spanning the internal ghost layer
-				grid_key_dx_iterator_sub<dim> sub_it(loc_grid.get(sub_id).getGrid(),g_ig_box.getKP1(),g_ig_box.getKP2());
-				// and pack the internal ghost grid
-				Packer<device_grid,HeapMemory>::template pack<prp...>(prAlloc_prp,loc_grid.get(sub_id),sub_it,sts);
-			}
-			// send the request
-
-			void * pointer2 = prAlloc_prp.getPointerEnd();
-
-			v_cl.send(ig_box.get(i).prc,0,pointer,(char *)pointer2 - (char *)pointer);
-		}
-
-		// Calculate the total information to receive from each processors
-		std::vector<size_t> prp_recv;
-
-		//! Receive the information from each processors
-		for ( size_t i = 0 ; i < eg_box.size() ; i++ )
-		{
-			prp_recv.push_back(0);
-
-			// for each external ghost box
-			for (size_t j = 0 ; j < eg_box.get(i).bid.size() ; j++)
-			{
-				// External ghost box
-				Box<dim,size_t> g_eg_box = eg_box.get(i).bid.get(j).g_e_box;
-				prp_recv[prp_recv.size()-1] += g_eg_box.getVolumeKey() * sizeof(prp_object) + sizeof(size_t);
-			}
-		}
-
-		size_t tot_recv = ExtPreAlloc<Memory>::calculateMem(prp_recv);
-
-		//! Resize the receiving buffer
-		g_recv_prp_mem.resize(tot_recv);
-
-		// Create an object of preallocated memory for properties
-		ExtPreAlloc<Memory> & prRecv_prp = *(new ExtPreAlloc<Memory>(tot_recv,g_recv_prp_mem));
-		prRecv_prp.incRef();
-
-		// queue the receives
-		for ( size_t i = 0 ; i < eg_box.size() ; i++ )
-		{
-			prRecv_prp.allocate(prp_recv[i]);
-			v_cl.recv(eg_box.get(i).prc,0,prRecv_prp.getPointer(),prp_recv[i]);
-		}
+		if (v_cl.getProcessingUnits() != 1)
+		{send_and_receive_ghost<prp...>(&prAlloc_prp,&prRecv_prp, ig_box,eg_box,gdb_ext,loc_grid,req);}
 
 		// Before wait for the communication to complete we sync the local ghost
 		// in order to overlap with communication
@@ -759,48 +838,8 @@ public:
 		// wait to receive communication
 		v_cl.execute();
 
-		Unpack_stat ps;
-
-		// Unpack the object
-		for ( size_t i = 0 ; i < eg_box.size() ; i++ )
-		{
-			// for each external ghost box
-			for (size_t j = 0 ; j < eg_box.get(i).bid.size() ; j++)
-			{
-				// Unpack the ghost box global-id
-
-				size_t g_id;
-				Unpacker<size_t,HeapMemory>::unpack(prRecv_prp,g_id,ps);
-
-				size_t l_id = 0;
-				// convert the global id into local id
-				auto key = g_id_to_external_ghost_box.find(g_id);
-				if (key != g_id_to_external_ghost_box.end()) // FOUND
-					l_id = key->second;
-				else
-				{
-					// NOT FOUND
-
-					// It must be always found, if not it mean that the processor has no-idea of
-					// what is stored and conseguently do not know how to unpack, print a critical error
-					// and return
-
-					std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " Critical, cannot unpack object, because received data cannot be interpreted\n";
-
-					return;
-				}
-
-				// Get the external ghost box associated with the packed information
-				Box<dim,size_t> box = eg_box.get(i).bid.get(l_id).l_e_box;
-				size_t sub_id = eg_box.get(i).bid.get(l_id).sub;
-
-				// sub-grid where to unpack
-				grid_key_dx_iterator_sub<dim> sub2(loc_grid.get(sub_id).getGrid(),box.getKP1(),box.getKP2());
-
-				// Unpack
-				Unpacker<device_grid,HeapMemory>::template unpack<prp...>(prRecv_prp,sub2,loc_grid.get(sub_id),ps);
-			}
-		}
+		if (v_cl.getProcessingUnits() != 1)
+		{process_received<prp...>(prRecv_prp,eg_box,loc_grid,g_id_to_external_ghost_box);}
 	}
 
 	/*! \brief It merge the information in the ghost with the
@@ -849,6 +888,7 @@ public:
 
 				// Pack a size_t for the internal ghost id
 				Packer<size_t,HeapMemory>::packRequest(req);
+
 				// Create a sub grid iterator spanning the internal ghost layer
 				grid_key_dx_iterator_sub<dim> sub_it(loc_grid.get(sub_id).getGrid(),g_eg_box.getKP1(),g_eg_box.getKP2());
 				// and pack the internal ghost grid
@@ -891,6 +931,7 @@ public:
 
 				// Pack a size_t for the internal ghost id
 				Packer<size_t,HeapMemory>::pack(prAlloc_prp,g_id,sts);
+
 				// Create a sub grid iterator spanning the internal ghost layer
 				grid_key_dx_iterator_sub<dim> sub_it(loc_grid.get(sub_id).getGrid(),g_eg_box.getKP1(),g_eg_box.getKP2());
 				// and pack the internal ghost grid
diff --git a/src/Grid/grid_dist_id_unit_test.cpp b/src/Grid/grid_dist_id_unit_test.cpp
index 3bc6ff21359e76240c6e6be94b0b59789c37b2dd..187552054cca9eac1465124d0b5df61cb31f5d4e 100644
--- a/src/Grid/grid_dist_id_unit_test.cpp
+++ b/src/Grid/grid_dist_id_unit_test.cpp
@@ -1,22 +1,14 @@
-#ifndef GRID_DIST_UNIT_TEST_HPP
-#define GRID_DIST_UNIT_TEST_HPP
-
 #define BOOST_TEST_DYN_LINK
 #include <boost/test/unit_test.hpp>
 
 #include "Point_test.hpp"
 #include "grid_dist_id.hpp"
-#include "data_type/scalar.hpp"
 #include "data_type/aggregate.hpp"
 
+extern void print_test_v(std::string test, size_t sz);
 
 BOOST_AUTO_TEST_SUITE( grid_dist_id_test )
 
-void print_test(std::string test, size_t sz)
-{
-	if (create_vcluster().getProcessUnitID() == 0)
-		std::cout << test << " " << sz << "\n";
-}
 
 BOOST_AUTO_TEST_CASE( grid_dist_id_domain_grid_unit_converter3D_test)
 {
@@ -39,7 +31,7 @@ BOOST_AUTO_TEST_CASE( grid_dist_id_domain_grid_unit_converter3D_test)
 	big_step = (big_step == 0)?1:big_step;
 	long int small_step = 21;
 
-	print_test( "Testing 3D grid converter k<=",k);
+	print_test_v( "Testing 3D grid converter k<=",k);
 
 	// 3D test
 	for ( ; k >= 2 ; k-= (k > 2*big_step)?big_step:small_step )
@@ -56,7 +48,7 @@ BOOST_AUTO_TEST_CASE( grid_dist_id_domain_grid_unit_converter3D_test)
 		Ghost<3,float> g(0.01);
 
 		// Distributed grid with id decomposition
-		grid_dist_id<3, float, scalar<float>, CartDecomposition<3,float>> g_dist(sz,domain,g);
+		grid_dist_id<3, float, aggregate<float>, CartDecomposition<3,float>> g_dist(sz,domain,g);
 
 		// get the decomposition
 		auto & dec = g_dist.getDecomposition();
@@ -127,7 +119,7 @@ BOOST_AUTO_TEST_CASE( grid_dist_id_domain_grid_unit_converter_test)
 		Ghost<2,float> g(0.01);
 
 		// Distributed grid with id decomposition
-		grid_dist_id<2, float, scalar<float>, CartDecomposition<2,float>> g_dist(sz,domain,g);
+		grid_dist_id<2, float, aggregate<float>, CartDecomposition<2,float>> g_dist(sz,domain,g);
 
 		// get the decomposition
 		auto & dec = g_dist.getDecomposition();
@@ -167,7 +159,7 @@ void Test2D(const Box<2,float> & domain, long int k)
 	big_step = (big_step == 0)?1:big_step;
 	long int small_step = 21;
 
-	print_test( "Testing 2D grid k<=",k);
+	print_test_v( "Testing 2D grid k<=",k);
 
 	// 2D test
 	for ( ; k >= 2 ; k-= (k > 2*big_step)?big_step:small_step )
@@ -187,7 +179,7 @@ void Test2D(const Box<2,float> & domain, long int k)
 		Ghost<2,float> g(0.01 / factor);
 
 		// Distributed grid with id decomposition
-		grid_dist_id<2, float, scalar<float>> g_dist(sz,domain,g);
+		grid_dist_id<2, float, aggregate<float>> g_dist(sz,domain,g);
 
 		// check the consistency of the decomposition
 		bool val = g_dist.getDecomposition().check_consistency();
@@ -288,7 +280,7 @@ void Test1D(const Box<1,float> & domain, long int k)
 	if (v_cl.getProcessingUnits() > 48)
 		return;
 
-	print_test( "Testing 1D grid k<=",k);
+	print_test_v( "Testing 1D grid k<=",k);
 
 	// 1D test
 	for ( ; k >= 2 ; k-= (k > 2*big_step)?big_step:small_step )
@@ -307,7 +299,7 @@ void Test1D(const Box<1,float> & domain, long int k)
 		Ghost<1,float> g(0.01 / factor);
 
 		// Distributed grid with id decomposition
-		grid_dist_id<1, float, scalar<float>> g_dist(sz,domain,g);
+		grid_dist_id<1, float, aggregate<float>> g_dist(sz,domain,g);
 
 		// check the consistency of the decomposition
 		bool val = g_dist.getDecomposition().check_consistency();
@@ -405,7 +397,7 @@ void Test3D_sub(const Box<3,float> & domain, long int k)
 	if (create_vcluster().getProcessingUnits() > 32)
 		return;
 
-	print_test( "Testing 3D grid sub k<=",k);
+	print_test_v( "Testing 3D grid sub k<=",k);
 
 	// 3D test
 	for ( ; k >= 2 ; k-= (k > 2*big_step)?big_step:small_step )
@@ -425,7 +417,7 @@ void Test3D_sub(const Box<3,float> & domain, long int k)
 		Ghost<3,float> g(0.01 / factor);
 
 		// Distributed grid with id decomposition
-		grid_dist_id<3, float, scalar<float>, CartDecomposition<3,float>> g_dist(sz,domain,g);
+		grid_dist_id<3, float, aggregate<float>, CartDecomposition<3,float>> g_dist(sz,domain,g);
 
 		// check the consistency of the decomposition
 		bool val = g_dist.getDecomposition().check_consistency();
@@ -514,7 +506,7 @@ void Test3D(const Box<3,float> & domain, long int k)
 	big_step = (big_step == 0)?1:big_step;
 	long int small_step = 21;
 
-	print_test( "Testing 3D grid k<=",k);
+	print_test_v( "Testing 3D grid k<=",k);
 
 	// 3D test
 	for ( ; k >= 2 ; k-= (k > 2*big_step)?big_step:small_step )
@@ -534,7 +526,7 @@ void Test3D(const Box<3,float> & domain, long int k)
 		Ghost<3,float> g(0.01 / factor);
 
 		// Distributed grid with id decomposition
-		grid_dist_id<3, float, scalar<float>, CartDecomposition<3,float>> g_dist(sz,domain,g);
+		grid_dist_id<3, float, aggregate<float>, CartDecomposition<3,float>> g_dist(sz,domain,g);
 
 		// check the consistency of the decomposition
 		bool val = g_dist.getDecomposition().check_consistency();
@@ -627,7 +619,7 @@ void Test3D_gg(const Box<3,float> & domain, long int k, long int gk)
 	if (create_vcluster().getProcessingUnits() > 32)
 		return;
 
-	print_test( "Testing 3D grid k<=",k);
+	print_test_v( "Testing 3D grid k<=",k);
 
 	// 3D test
 	for ( ; k > 64 ; k /= 2 )
@@ -644,7 +636,7 @@ void Test3D_gg(const Box<3,float> & domain, long int k, long int gk)
 		Ghost<3,long int> g(gk);
 
 		// Distributed grid with id decomposition
-		grid_dist_id<3, float, scalar<float>, CartDecomposition<3,float>> g_dist(sz,domain,g);
+		grid_dist_id<3, float, aggregate<float>, CartDecomposition<3,float>> g_dist(sz,domain,g);
 
 		// check the consistency of the decomposition
 		bool val = g_dist.getDecomposition().check_consistency();
@@ -678,7 +670,7 @@ void Test3D_domain(const Box<3,float> & domain, long int k, const periodicity<3>
 	big_step = (big_step == 0)?1:big_step;
 	long int small_step = 21;
 
-	print_test( "Testing 3D grid shift domain k<=",k);
+	print_test_v( "Testing 3D grid shift domain k<=",k);
 
 	// 3D test
 	for ( ; k >= 2 ; k-= (k > 2*big_step)?big_step:small_step )
@@ -783,7 +775,7 @@ void Test2D_complex(const Box<2,float> & domain, long int k)
 	big_step = (big_step == 0)?1:big_step;
 	long int small_step = 21;
 
-	print_test( "Testing 2D complex grid k<=",k);
+	print_test_v( "Testing 2D complex grid k<=",k);
 
 	// 2D test
 	for ( ; k >= 2 ; k-= (k > 2*big_step)?big_step:small_step )
@@ -947,7 +939,7 @@ void Test3D_complex(const Box<3,float> & domain, long int k)
 	big_step = (big_step == 0)?1:big_step;
 	long int small_step = 21;
 
-	print_test( "Testing 3D grid complex k<=",k);
+	print_test_v( "Testing 3D grid complex k<=",k);
 
 	// 2D test
 	for ( ; k >= 2 ; k-= (k > 2*big_step)?big_step:small_step )
@@ -1113,7 +1105,7 @@ void Test3D_dup(const Box<3,float> & domain, long int k)
 	if ( v_cl.getProcessingUnits() > 32 )
 		return;
 
-	print_test( "Testing 3D duplicate topology complex k<=",k);
+	print_test_v( "Testing 3D duplicate topology complex k<=",k);
 
 	// 3D test
 	for ( ; k >= 2 ; k-= (k > 2*big_step)?big_step:small_step )
@@ -1216,7 +1208,7 @@ void Test3D_periodic(const Box<3,float> & domain, long int k)
 	big_step = (big_step == 0)?1:big_step;
 	long int small_step = 21;
 
-	print_test( "Testing grid periodic k<=",k);
+	print_test_v( "Testing grid periodic k<=",k);
 
 	// 3D test
 	for ( ; k >= 2 ; k-= (k > 2*big_step)?big_step:small_step )
@@ -1373,7 +1365,7 @@ void Test3D_periodic_put(const Box<3,float> & domain, long int k)
 	big_step = (big_step == 0)?1:big_step;
 	long int small_step = 21;
 
-	print_test( "Testing grid periodic put k<=",k);
+	print_test_v( "Testing grid periodic put k<=",k);
 
 	// 3D test
 	for ( ; k >= 2 ; k-= (k > 2*big_step)?big_step:small_step )
@@ -1493,7 +1485,7 @@ void Test_grid_copy(const Box<3,float> & domain, long int k)
 	big_step = (big_step == 0)?1:big_step;
 	long int small_step = 21;
 
-	print_test( "Testing grid copy k<=",k);
+	print_test_v( "Testing grid copy k<=",k);
 
 	// 3D test
 	for ( ; k >= 2 ; k-= (k > 2*big_step)?big_step:small_step )
@@ -1590,6 +1582,89 @@ void Test_grid_copy(const Box<3,float> & domain, long int k)
 	}
 }
 
+void Test_ghost_correction(Box<3,double> & domain, long int k, long int g_)
+{
+	size_t sz[3] = {(size_t)k,(size_t)k,(size_t)k};
+	periodicity<3> bc = {PERIODIC,PERIODIC,PERIODIC};
+
+	Ghost<3,long int> g(g_);
+	// Fully periodic 3D grid of k x k x k points, one double per point, ghost built from g_
+	grid_dist_id<3, double, aggregate<double>> grid(sz,domain,g,bc);
+	// Set property 0 to zero on every point returned by the domain + ghost iterator
+    auto itg = grid.getDomainGhostIterator();
+
+    while (itg.isNext())
+    {
+    	auto key = itg.get();
+
+    	grid.template get<0>(key) = 0.0;
+
+    	++itg;
+    }
+
+	// Fill the domain points with 5 (20 on the point whose global key is (-4,20,-4))
+
+    auto it = grid.getDomainIterator();
+
+    while (it.isNext())
+    {
+    	auto key = it.get();
+    	auto gkey = it.getGKey(key);
+
+    	if (gkey.get(0) == -4 && gkey.get(1) == 20 && gkey.get(2) == -4)
+    	{
+    		grid.template get<0>(key) = 20.0;
+    	}
+    	else
+    	{
+    		grid.template get<0>(key) = 5.0;
+    	}
+
+    	++it;
+    }
+    // Synchronize the ghost of property 0, then scan domain + ghost points
+    grid.ghost_get<0>();
+    auto it2 = grid.getDomainGhostIterator();
+
+    bool is_inside = true;
+
+    while (it2.isNext())
+    {
+    	auto key = it2.get();
+    	auto gkey = it2.getGKey(key);
+
+    	if (grid.template get<0>(key) == 5.0)
+    	{
+    		// Here we check that the point is within the box of at least one
+    		// local grid (Dbox shifted by its origin) enlarged by the ghost g
+
+    		bool is_inside_point = false;
+    		for (size_t i = 0 ; i < grid.getN_loc_grid() ; i++)
+    		{
+    			Box<3,long int> bx = grid.getLocalGridsInfo().get(i).Dbox;
+    			bx += grid.getLocalGridsInfo().get(i).origin;
+
+    			bx.enlarge(g);
+
+    			if (bx.isInside(gkey.toPoint()) == true)
+    			{
+    				is_inside_point |= true;
+    			}
+    		}
+
+    		is_inside &= is_inside_point;
+    	}
+
+        ++it2;
+    }
+
+    // Write the decomposition and the grid (output names given below)
+    grid.getDecomposition().write("dec_set_for_adj");
+    grid.write("dec_for_adj");
+
+    BOOST_REQUIRE_EQUAL(is_inside,true);
+}
+
 #include "grid_dist_id_unit_test_ext_dom.hpp"
 #include "grid_dist_id_unit_test_unb_ghost.hpp"
 
@@ -1756,6 +1831,146 @@ BOOST_AUTO_TEST_CASE( grid_dist_id_periodic_put_test )
 	Test3D_periodic_put(domain3,k);
 }
 
+BOOST_AUTO_TEST_CASE ( grid_ghost_correction )
+{
+	Box<3,double> domain({0.0,0.0,0.0},{2.5,2.5,2.5});
+	// Run Test_ghost_correction for k = 128, 64, 32, 16 and ghost argument 1, 2, 3, 4
+	long int k = 128;
+
+	Test_ghost_correction(domain,k,1);
+	Test_ghost_correction(domain,k,2);
+	Test_ghost_correction(domain,k,3);
+	Test_ghost_correction(domain,k,4);
+
+	k = 64;
+
+	Test_ghost_correction(domain,k,1);
+	Test_ghost_correction(domain,k,2);
+	Test_ghost_correction(domain,k,3);
+	Test_ghost_correction(domain,k,4);
+
+	k = 32;
+
+	Test_ghost_correction(domain,k,1);
+	Test_ghost_correction(domain,k,2);
+	Test_ghost_correction(domain,k,3);
+	Test_ghost_correction(domain,k,4);
+
+	k = 16;
+
+	Test_ghost_correction(domain,k,1);
+	Test_ghost_correction(domain,k,2);
+	Test_ghost_correction(domain,k,3);
+	Test_ghost_correction(domain,k,4);
+}
+
+BOOST_AUTO_TEST_CASE ( grid_basic_functions )
+{
+	auto & v_cl = create_vcluster();
+	// The check below is executed only when running on a single processing unit
+	if (v_cl.getProcessingUnits() != 1)
+	{return;}
+
+	size_t sz[2] = {(size_t)8,(size_t)8};
+	periodicity<2> bc = {PERIODIC,PERIODIC};
+
+	Ghost<2,long int> g(1);
+	Box<2,double> domain({-1.0,-1.0},{1.0,1.0});
+
+	grid_dist_id<2, double, aggregate<double>> grid(sz,domain,g,bc);
+	// Offset of local grid 0 (presumably domain low -1.0 minus one ghost spacing 0.25 - TODO confirm)
+	BOOST_REQUIRE_EQUAL(grid.getOffset(0)[0],-1.25);
+	BOOST_REQUIRE_EQUAL(grid.getOffset(0)[1],-1.25);
+}
+
+BOOST_AUTO_TEST_CASE ( grid_overflow_round_off_error )
+{
+    size_t numGridPoint     =   100;
+    const double domainSize =   20851.7;
+    double domainLength = sqrt(domainSize);
+    // Square periodic domain whose side (sqrt of domainSize) is not a round number
+    Box<2,double> domain({0.0,0.0},{domainLength,domainLength});
+
+    size_t sz[2] = {numGridPoint,numGridPoint};
+
+    periodicity<2> bc = {PERIODIC,PERIODIC};
+    // Ghost of 3 * (domain side / numGridPoint) plus a small margin of 0.001
+    Ghost<2,double> g(3.0*(domain.getHigh(0) - domain.getLow(0))/numGridPoint + 0.001);
+
+    grid_dist_id<2, double, aggregate<double, double, double, double, double>> grid(sz,domain,g,bc);
+
+    auto & gs = grid.getGridInfo();
+
+    auto it = grid.getDomainIterator();
+    // Store in property 0 of every domain point its linearized global key
+    while (it.isNext())
+    {
+    	auto p = it.get();
+    	auto gp = it.getGKey(p);
+
+    	grid.get<0>(p) = gs.LinId(gp);
+
+    	++it;
+    }
+    // Fill the ghost part of property 0
+    grid.ghost_get<0>();
+
+    // Now we check every domain point and its 4 first neighbours
+
+    auto it2 = grid.getDomainIterator();
+
+    bool match = true;
+
+    while (it2.isNext())
+    {
+    	auto p = it2.get();
+    	auto gp = it2.getGKey(p);
+
+    	if (gs.LinId(gp) != grid.get<0>(p))
+    	{match = false;}
+
+    	// look around: +x, -x, +y, -y neighbours and their global keys
+
+    	auto px = p.move(0,1);
+    	auto gpx = it2.getGKey(px);
+    	auto mx = p.move(0,-1);
+    	auto gmx = it2.getGKey(mx);
+
+    	auto py = p.move(1,1);
+    	auto gpy = it2.getGKey(py);
+    	auto my = p.move(1,-1);
+    	auto gmy = it2.getGKey(my);
+    	// wrap each neighbour global key periodically before linearizing it
+    	gpx.set_d(0,gpx.get(0) % gs.size(0));
+    	gpx.set_d(1,gpx.get(1) % gs.size(1));
+
+    	if (grid.template get<0>(px) != gs.LinId(gpx))
+    	{match = false;}
+
+    	gmx.set_d(0,(gmx.get(0) + gs.size(0)) % gs.size(0));
+    	gmx.set_d(1,(gmx.get(1) + gs.size(1)) % gs.size(1));
+
+    	if (grid.template get<0>(mx) != gs.LinId(gmx))
+    	{match = false;}
+
+    	gpy.set_d(0,gpy.get(0) % gs.size(0));
+    	gpy.set_d(1,gpy.get(1) % gs.size(1));
+
+    	if (grid.template get<0>(py) != gs.LinId(gpy))
+    	{match = false;}
+
+    	gmy.set_d(0,(gmy.get(0) + gs.size(0)) % gs.size(0));
+    	gmy.set_d(1,(gmy.get(1) + gs.size(1)) % gs.size(1));
+
+    	if (grid.template get<0>(my) != gs.LinId(gmy))
+    	{match = false;}
+
+    	++it2;
+    }
+
+    BOOST_REQUIRE_EQUAL(match,true);
+}
+
+
 BOOST_AUTO_TEST_SUITE_END()
 
-#endif
diff --git a/src/Grid/grid_dist_id_unit_test_ext_dom.hpp b/src/Grid/grid_dist_id_unit_test_ext_dom.hpp
index 3a8baa9e22fe0e79c4baafc5cd01c6e41ff79f01..179e9304d6533b677eb28996f5c6504fa871d960 100644
--- a/src/Grid/grid_dist_id_unit_test_ext_dom.hpp
+++ b/src/Grid/grid_dist_id_unit_test_ext_dom.hpp
@@ -23,7 +23,7 @@ void Test3D_extended_grid(const Box<3,float> & domain, long int k)
 	if ( v_cl.getProcessingUnits() > 32 )
 		return;
 
-	print_test( "Testing 3D extended grid k<=",k);
+	print_test_v( "Testing 3D extended grid k<=",k);
 
 	// factor
 	float factor = pow(create_vcluster().getProcessingUnits()/2.0f,1.0f/3.0f);
diff --git a/src/Grid/grid_dist_id_unit_test_unb_ghost.hpp b/src/Grid/grid_dist_id_unit_test_unb_ghost.hpp
index 51fdbb517a09dbcb08e82b84f23e95e2a2677ae5..d51d25f27b30a5bc511d6c18429e827f3d5b6a33 100644
--- a/src/Grid/grid_dist_id_unit_test_unb_ghost.hpp
+++ b/src/Grid/grid_dist_id_unit_test_unb_ghost.hpp
@@ -17,7 +17,7 @@ void Test3D_unb_ghost(const Box<3,float> & domain, long int k)
 	if (create_vcluster().getProcessingUnits() > 48)
 		return;
 
-	print_test( "Testing 3D grid unbound ghost k<=",k);
+	print_test_v( "Testing 3D grid unbound ghost k<=",k);
 
 	// 3D test
 	for ( ; k >= 2 ; k-= (k > 2*big_step)?big_step:small_step )
@@ -34,7 +34,7 @@ void Test3D_unb_ghost(const Box<3,float> & domain, long int k)
 		Ghost<3,float> g(0.49);
 
 		// Distributed grid with id decomposition
-		grid_dist_id<3, float, scalar<float>, CartDecomposition<3,float>> g_dist(sz,domain,g);
+		grid_dist_id<3, float, aggregate<float>, CartDecomposition<3,float>> g_dist(sz,domain,g);
 
 		g_dist.getDecomposition().write("no_bound_decomposition");
 
@@ -132,7 +132,7 @@ void Test3D_unb_ghost_periodic(const Box<3,float> & domain, long int k)
 	big_step = (big_step == 0)?1:big_step;
 	long int small_step = 21;
 
-	print_test( "Testing grid periodic unbound ghost k<=",k);
+	print_test_v( "Testing grid periodic unbound ghost k<=",k);
 
 	// 3D test
 	for ( ; k >= 2 ; k-= (k > 2*big_step)?big_step:small_step )
diff --git a/src/Grid/grid_dist_key.hpp b/src/Grid/grid_dist_key.hpp
index 420a281af57106a3529e0a8973b2dc4862c495cc..08b254a78aa9e926a2f3465490d789f449b02b72 100644
--- a/src/Grid/grid_dist_key.hpp
+++ b/src/Grid/grid_dist_key.hpp
@@ -248,4 +248,102 @@ public:
 	}
 };
 
+/*! \brief Distributed linearized key
+ *
+ * Instead of storing the sub-domain index, it stores directly a pointer to the local grid
+ *
+ */
+template<typename device_grid>
+class grid_dist_g_dx
+{
+	//! pointer to the local grid the key refers to
+	device_grid * dg;
+
+	//! linearized position inside the local grid
+	size_t key;
+
+public:
+
+	/*! \brief return the sub-domain grid
+	 *
+	 * \return the pointer to the local grid stored in the key
+	 */
+	inline device_grid * getSub() const
+	{
+		return dg;
+	}
+
+
+	/*! \brief Get the key
+	 *
+	 * \return the local key
+	 *
+	 */
+	inline size_t getKey() const
+	{
+		return key;
+	}
+
+
+	/*! \brief Get a reference to the key
+	 *
+	 * \return a reference to the local key (it can be used to modify it)
+	 *
+	 */
+	inline size_t & getKeyRef()
+	{
+		return key;
+	}
+
+	/*! \brief Check if two keys are the same
+	 *
+	 * \param key_t key to check
+	 *
+	 * \return true if the two keys point to the same grid and the same position
+	 *
+	 */
+
+	inline bool operator==(const grid_dist_g_dx & key_t) const
+	{
+		if (dg != key_t.dg)
+			return false;
+
+		// Same grid: compare the linearized positions
+
+		return getKey() == key_t.getKey();
+	}
+
+
+	/*! \brief Constructor
+	 *
+	 * \param dg pointer to the local grid
+	 * \param key actual position linearized
+	 *
+	 */
+	inline grid_dist_g_dx(device_grid * dg, size_t key)
+	:dg(dg),key(key)
+	{
+	}
+
+	//! Default constructor (null grid pointer, key 0) so that the members are never indeterminate
+	inline grid_dist_g_dx():dg(nullptr),key(0){}
+
+	/*! \brief convert the key to string
+	 *
+	 * \return a string with the grid pointer and the linearized position
+	 *
+	 */
+	std::string to_string() const
+	{
+		std::stringstream str;
+
+		str << "sub_domain=" << dg << " ";
+		str << "lin_id=" << key << " ";
+
+		str << "\n";
+
+		return str.str();
+	}
+};
+
 #endif
diff --git a/src/Grid/grid_dist_util.hpp b/src/Grid/grid_dist_util.hpp
index 77e2383cbdef22ef28ffee634e15d109f3731329..0d38b73a7eceb01b58777da73179323cfd1b8ab9 100644
--- a/src/Grid/grid_dist_util.hpp
+++ b/src/Grid/grid_dist_util.hpp
@@ -80,6 +80,15 @@ template<int dim, typename Decomposition> inline void create_gdb_ext(openfpm::ve
 		SpaceBox<Decomposition::dims, typename Decomposition::stype> sp = dec.getSubDomain(i);
 		SpaceBox<Decomposition::dims, typename Decomposition::stype> sp_g = dec.getSubDomainWithGhost(i);
 
+		// Because of round-off we expand, for safety, the ghost area by one representable
+		// value per side. The std::nextafter direction is cast to the decomposition scalar
+		// type: with a double direction a float value is promoted and rounds back unchanged
+		for (size_t d = 0 ; d < Decomposition::dims ; d++)
+		{
+			sp_g.setLow(d,std::nextafter(sp_g.getLow(d),sp_g.getLow(d) - static_cast<typename Decomposition::stype>(1.0)));
+			sp_g.setHigh(d,std::nextafter(sp_g.getHigh(d),sp_g.getHigh(d) + static_cast<typename Decomposition::stype>(1.0)));
+		}
+
 		// Convert from SpaceBox<dim,St> to SpaceBox<dim,long int>
 		SpaceBox<Decomposition::dims,long int> sp_t = cd_sm.convertDomainSpaceIntoGridUnits(sp,dec.periodicity());
 		SpaceBox<Decomposition::dims,long int> sp_tg = cd_sm.convertDomainSpaceIntoGridUnits(sp_g,dec.periodicity());
@@ -124,7 +133,7 @@ template<int dim, typename Decomposition> inline void create_gdb_ext(openfpm::ve
 
 	// fill the spacing
 	for (size_t i = 0 ; i < dim ; i++)
-		spacing[i] = cd_sm.getCellBox().getP2()[i];
+	{spacing[i] = cd_sm.getCellBox().getP2()[i];}
 }
 
 /*! \brief it store a box, its unique id and the sub-domain from where it come from
diff --git a/src/Grid/staggered_dist_grid_util.hpp b/src/Grid/staggered_dist_grid_util.hpp
index 7a7a8d9643aed988a064bb6ee46a5f8138974339..3b2261c2f8df0eaaeff4625ada743c3dd492dfe3 100644
--- a/src/Grid/staggered_dist_grid_util.hpp
+++ b/src/Grid/staggered_dist_grid_util.hpp
@@ -147,7 +147,11 @@ struct extends<T[N1][N2][N3]>
 		return N1 * N2 * N3;
 	}
 
-	//! number of indexes
+	/*! number of indexes
+	 *
+	 * \return 3
+	 *
+	 */
 	static inline size_t dim()
 	{
 		return 3;
@@ -164,7 +168,11 @@ struct extends<T[N1][N2][N3][N4]>
 		return N1 * N2 * N3 * N4;
 	}
 
-	//! number of indexes
+	/*! number of indexes
+	 *
+	 * \return 4
+	 *
+	 */
 	static inline size_t dim()
 	{
 		return 4;
@@ -181,7 +189,11 @@ struct extends<T[N1][N2][N3][N4][N5]>
 		return N1 * N2 * N3 * N4 * N5;
 	}
 
-	//! number of indexes
+	/*! number of indexes
+	 *
+	 * \return 5
+	 *
+	 */
 	static inline size_t dim()
 	{
 		return 5;
@@ -198,7 +210,11 @@ struct extends<T[N1][N2][N3][N4][N5][N6]>
 		return N1 * N2 * N3 * N4 * N5 * N6;
 	}
 
-	//! number of indexes
+	/*! number of indexes
+	 *
+	 * \return 6
+	 *
+	 */
 	static inline size_t dim()
 	{
 		return 6;
@@ -215,7 +231,11 @@ struct extends<T[N1][N2][N3][N4][N5][N6][N7]>
 		return N1 * N2 * N3 * N4 * N5 * N6 * N7;
 	}
 
-	//! number of indexes
+	/*! number of indexes
+	 *
+	 * \return 7
+	 *
+	 */
 	static inline size_t dim()
 	{
 		return 7;
@@ -226,13 +246,21 @@ struct extends<T[N1][N2][N3][N4][N5][N6][N7]>
 template<typename T,size_t N1,size_t N2,size_t N3,size_t N4,size_t N5, size_t N6, size_t N7, size_t N8>
 struct extends<T[N1][N2][N3][N4][N5][N6][N7][N8]>
 {
-	//! number of elements
+	/*! number of elements
+	 *
+	 * \return the number of elements as N1*N2*N3*.........
+	 *
+	 */
 	static inline size_t mul()
 	{
 		return N1 * N2 * N3 * N4 * N5 * N6 * N7 * N8;
 	}
 
-	//! number of indexes
+	/*! number of indexes
+	 *
+	 * \return 8
+	 *
+	 */
 	static inline size_t dim()
 	{
 		return 8;
@@ -243,13 +271,21 @@ struct extends<T[N1][N2][N3][N4][N5][N6][N7][N8]>
 template<typename T,size_t N1,size_t N2,size_t N3,size_t N4,size_t N5, size_t N6, size_t N7, size_t N8, size_t N9>
 struct extends<T[N1][N2][N3][N4][N5][N6][N7][N8][N9]>
 {
-	//! number of elements
+	/*! number of elements
+	 *
+	 * \return the number of elements as N1*N2*N3*.........
+	 *
+	 */
 	static inline size_t mul()
 	{
 		return N1 * N2 * N3 * N4 * N5 * N6 * N7 * N8 * N9;
 	}
 
-	//! number of indexes
+	/*! number of indexes
+	 *
+	 * \return 9
+	 *
+	 */
 	static inline size_t dim()
 	{
 		return 9;
@@ -260,13 +296,21 @@ struct extends<T[N1][N2][N3][N4][N5][N6][N7][N8][N9]>
 template<typename T,size_t N1,size_t N2,size_t N3,size_t N4,size_t N5, size_t N6, size_t N7, size_t N8, size_t N9, size_t N10>
 struct extends<T[N1][N2][N3][N4][N5][N6][N7][N8][N9][N10]>
 {
-	//! number of elements
+	/*! number of elements
+	 *
+	 * \return the number of elements as N1*N2*N3*.........
+	 *
+	 */
 	static inline size_t mul()
 	{
 		return N1 * N2 * N3 * N4 * N5 * N6 * N7 * N8 * N9 * N10;
 	}
 
-	//! number of indexes
+	/*! number of indexes
+	 *
+	 * \return 10
+	 *
+	 */
 	static inline size_t dim()
 	{
 		return 10;
@@ -286,7 +330,7 @@ struct extends<T[N1][N2][N3][N4][N5][N6][N7][N8][N9][N10]>
 template<typename T>
 struct write_stag
 {
-	/*! \brieg write the staggered grid
+	/*! \brief write the staggered grid
 	 *
 	 * \tparam p_val property we are going to write
 	 * \tparam sg staggered grid type
@@ -332,7 +376,7 @@ struct write_stag
 template<typename T,size_t N1>
 struct write_stag<T[N1]>
 {
-	/*! \brieg write the staggered grid
+	/*! \brief write the staggered grid
 	 *
 	 * \tparam p_val property we are going to write
 	 * \tparam sg staggered grid type
@@ -374,7 +418,7 @@ struct write_stag<T[N1]>
 template<typename T,size_t N1,size_t N2>
 struct write_stag<T[N1][N2]>
 {
-	/*! \brieg write the staggered grid
+	/*! \brief write the staggered grid
 	 *
 	 * \tparam p_val property we are going to write
 	 * \tparam sg staggered grid type
@@ -431,15 +475,26 @@ struct write_stag<T[N1][N2]>
 template<unsigned int dim, typename v, bool has_pM = has_posMask<v>::value>
 class stag_set_position
 {
+	//! vector containing, for each property, its position within the cell
+	//! (staggered properties are placed in staggered positions)
 	openfpm::vector<comb<dim>> (& pos_prp)[boost::fusion::result_of::size<v>::type::value];
 
 public:
 
+	/*! \brief Constructor
+	 *
+	 * \param pos_prp vector of the staggered positions (it is going to be filled by this class)
+	 *
+	 */
 	stag_set_position( openfpm::vector<comb<dim>> (& pos_prp)[boost::fusion::result_of::size<v>::type::value])
 	:pos_prp(pos_prp)
 	{}
 
-	//! It call the copy function for each property
+	/*! \brief It calculates the staggered position for every property
+	 *
+	 * \param t property
+	 *
+	 */
 	template<typename T>
 	void operator()(T& t) const
 	{
@@ -519,15 +574,28 @@ template<unsigned int dim, typename v>
 class stag_set_position<dim,v,false>
 {
 private:
+
+	//! vector containing, for each property, its position within the cell
+	//! (staggered properties are placed in staggered positions)
 	openfpm::vector<comb<dim>> (& pos_prp)[boost::fusion::result_of::size<v>::type::value];
 
 
 public:
+
+	/*! \brief Constructor
+	 *
+	 * \param pos_prp vector of the staggered positions (it is going to be filled by this class)
+	 *
+	 */
 	stag_set_position( openfpm::vector<comb<dim>> (& pos_prp)[boost::fusion::result_of::size<v>::type::value])
 	:pos_prp(pos_prp)
 	{}
 
-	//! It call the copy function for each property
+	/*! \brief It calculates the staggered position for every property
+	 *
+	 * \param t property
+	 *
+	 */
 	template<typename T>
 	void operator()(T& t) const
 	{
diff --git a/src/Makefile.am b/src/Makefile.am
index db1aa5ff1ce329b58d4e2da454afd188d22b5328..67fab895782135f5a3c7be00862b33322a84f95c 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,18 +1,18 @@
 LINKLIBS = $(HDF5_LDFLAGS)  $(HDF5_LIBS) $(OPENMP_LDFLAGS) $(LIBHILBERT_LIB)  $(METIS_LIB) $(PTHREAD_LIBS) $(OPT_LIBS) $(BOOST_LDFLAGS) $(BOOST_IOSTREAMS_LIB) $(CUDA_LIBS) $(PETSC_LIB) $(PARMETIS_LIB) $(BOOST_UNIT_TEST_FRAMEWORK_LIB) $(BOOST_CHRONO_LIB) $(BOOST_TIMER_LIB) $(BOOST_SYSTEM_LIB) $(LIBIFCORE)
 
 noinst_PROGRAMS = pdata
-pdata_SOURCES = main.cpp pdata_performance.cpp Grid/grid_dist_id_unit_test.cpp  lib/pdata.cpp test_multiple_o.cpp ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/Memleak_check.cpp
+pdata_SOURCES = main.cpp Vector/tests/vector_dist_cell_list_tests.cpp Vector/tests/vector_dist_complex_prp_unit_test.cpp Vector/tests/vector_dist_HDF5_chckpnt_restart_test.cpp Vector/tests/vector_dist_MP_unit_tests.cpp Vector/tests/vector_dist_NN_tests.cpp Vector/tests/vector_dist_unit_test.cpp  pdata_performance.cpp Decomposition/tests/CartDecomposition_unit_test.cpp Decomposition/tests/shift_vect_converter_tests.cpp Grid/grid_dist_id_unit_test.cpp  lib/pdata.cpp test_multiple_o.cpp ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/Memleak_check.cpp
 pdata_CXXFLAGS = $(HDF5_CPPFLAGS) $(OPENMP_CFLAGS) $(AM_CXXFLAGS) $(LIBHILBERT_INCLUDE) $(PETSC_INCLUDE) $(CUDA_CFLAGS) $(INCLUDES_PATH) $(PARMETIS_INCLUDE) $(METIS_INCLUDE) $(BOOST_CPPFLAGS) $(H5PART_INCLUDE) -DPARALLEL_IO  -Wno-unused-local-typedefs
 pdata_CFLAGS = $(CUDA_CFLAGS)
 pdata_LDADD = $(LINKLIBS) -lparmetis -lmetis
-nobase_include_HEADERS = Decomposition/CartDecomposition.hpp Decomposition/CartDecomposition_ext.hpp  Decomposition/common.hpp Decomposition/Decomposition.hpp  Decomposition/ie_ghost.hpp \
+nobase_include_HEADERS = Decomposition/CartDecomposition.hpp Decomposition/shift_vect_converter.hpp Decomposition/CartDecomposition_ext.hpp  Decomposition/common.hpp Decomposition/Decomposition.hpp  Decomposition/ie_ghost.hpp \
          Decomposition/Domain_NN_calculator_cart.hpp Decomposition/nn_processor.hpp Decomposition/ie_loc_ghost.hpp Decomposition/ORB.hpp \
          Graph/CartesianGraphFactory.hpp \
          Grid/grid_dist_id.hpp Grid/grid_dist_id_comm.hpp Grid/Iterators/grid_dist_id_iterator_util.hpp Grid/Iterators/grid_dist_id_iterator_dec.hpp Grid/Iterators/grid_dist_id_iterator_dec_skin.hpp Grid/grid_dist_util.hpp  Grid/Iterators/grid_dist_id_iterator_sub.hpp Grid/Iterators/grid_dist_id_iterator.hpp Grid/grid_dist_key.hpp Grid/staggered_dist_grid.hpp Grid/staggered_dist_grid_util.hpp Grid/staggered_dist_grid_copy.hpp \
          Vector/se_class3_vector.hpp  Vector/vector_dist_multiphase_functions.hpp Vector/vector_dist_comm.hpp Vector/vector_dist.hpp Vector/vector_dist_ofb.hpp Vector/Iterators/vector_dist_iterator.hpp Vector/vector_dist_key.hpp \
          config/config.h \
          example.mk \
-         Decomposition/Distribution/metis_util.hpp Decomposition/Distribution/SpaceDistribution.hpp Decomposition/Distribution/parmetis_dist_util.hpp  Decomposition/Distribution/parmetis_util.hpp Decomposition/Distribution/MetisDistribution.hpp Decomposition/Distribution/ParMetisDistribution.hpp Decomposition/Distribution/DistParMetisDistribution.hpp  Decomposition/dec_optimizer.hpp SubdomainGraphNodes.hpp \
+          Decomposition/Distribution/metis_util.hpp Decomposition/Distribution/SpaceDistribution.hpp Decomposition/Distribution/parmetis_dist_util.hpp  Decomposition/Distribution/parmetis_util.hpp Decomposition/Distribution/MetisDistribution.hpp Decomposition/Distribution/ParMetisDistribution.hpp Decomposition/Distribution/DistParMetisDistribution.hpp  Decomposition/dec_optimizer.hpp SubdomainGraphNodes.hpp \
          Graph/ids.hpp Graph/dist_map_graph.hpp Graph/DistGraphFactory.hpp \
          DLB/DLB.hpp DLB/LB_Model.hpp
 
diff --git a/src/SubdomainGraphNodes.hpp b/src/SubdomainGraphNodes.hpp
index 63be7b721bcebe37f18e3512ce5e51cf6f5d1b0d..39b19fa4f1058bd9c136e8194b5ceb3319715a5e 100755
--- a/src/SubdomainGraphNodes.hpp
+++ b/src/SubdomainGraphNodes.hpp
@@ -214,6 +214,11 @@ struct nm_part_v
 		boost::fusion::at_c<1>(data) = p.template get<1>();
 	}
 
+	static inline bool noPointers()
+	{
+		return true;
+	}
+
 };
 
 /*! \brief Reduced edge graph node
@@ -239,6 +244,11 @@ struct nm_part_e
 	{
 		static const std::string name[];
 	};
+
+	static inline bool noPointers()
+	{
+		return true;
+	}
 };
 
 
diff --git a/src/Vector/performance/vector_dist_performance_common.hpp b/src/Vector/performance/vector_dist_performance_common.hpp
index d7988c2c9bdd037f226f0fa2111170ce35c527f7..e6e0d261e4562271b72c57fd160787f0df0a305c 100644
--- a/src/Vector/performance/vector_dist_performance_common.hpp
+++ b/src/Vector/performance/vector_dist_performance_common.hpp
@@ -8,6 +8,7 @@
 #ifndef SRC_VECTOR_PERFORMANCE_VECTOR_DIST_PERFORMANCE_COMMON_HPP_
 #define SRC_VECTOR_PERFORMANCE_VECTOR_DIST_PERFORMANCE_COMMON_HPP_
 
+#include "Vector/vector_dist.hpp"
 
 /*! \brief Calculate and put particles' forces
  *
diff --git a/src/Vector/performance/vector_dist_performance_util.hpp b/src/Vector/performance/vector_dist_performance_util.hpp
index 86a22b82b4c4b59334242fd5ea8500de53658dd9..8f0a07ec9212c7d8f4efb0f4f4dd055564e4d2a5 100644
--- a/src/Vector/performance/vector_dist_performance_util.hpp
+++ b/src/Vector/performance/vector_dist_performance_util.hpp
@@ -70,17 +70,7 @@ static inline void addchartarea(std::string & chart_area, int lvl)
 
 }
 
-void addUpdtateTime(GoogleChart & cg)
-{
-    time_t t = time(0);   // get time now
-    struct tm * now = localtime( & t );
-
-    std::stringstream str;
-
-    str << "<h3>Updated: " << now->tm_mday << "/" << now->tm_mon + 1 << "/" << now->tm_year+1900 << "     " << now->tm_hour << ":" << now->tm_min << ":" << now->tm_sec << std::endl;
-
-	cg.addHTML(str.str());
-}
+void addUpdtateTime(GoogleChart & cg);
 
 /*! \brief Standard deviation
  *
@@ -103,15 +93,6 @@ static inline void standard_deviation(openfpm::vector<double> measures, double &
 	dev = sqrt(dev / (measures.size() - 1));
 }
 
-/*! \brief Print out only ones (no matter how many processors involved)
- *
- * \param test, sz Data to print out
- */
-void print_test_v(std::string test)
-{
-	if (create_vcluster().getProcessUnitID() == 0)
-		std::cout << test  << "\n";
-}
 
 
 /*! \brief Benchmark particles' forces time
@@ -294,7 +275,7 @@ template<unsigned int dim, typename v_dist> void move_particles(v_dist & vd, dou
  *
  *
  */
-void StandardPerformanceGraph(std::string file_mean,
+extern void StandardPerformanceGraph(std::string file_mean,
 		                      std::string file_var,
 							  std::string file_mean_save,
 							  std::string file_var_save,
@@ -306,101 +287,7 @@ void StandardPerformanceGraph(std::string file_mean,
 							  openfpm::vector<std::string> & gnames,
 							  std::string x_string,
 							  std::string y_string,
-							  bool use_log)
-{
-	openfpm::vector<openfpm::vector<openfpm::vector<double>>> y_ref_mean;
-	openfpm::vector<openfpm::vector<openfpm::vector<double>>> y_ref_dev;
-	y_ref_mean.load(file_mean);
-	y_ref_dev.load(file_var);
-
-	// warning level
-	openfpm::vector<int> warning_vlevel;
-
-	// Calculation time graphs data
-
-	openfpm::vector<size_t> x;
-	openfpm::vector<openfpm::vector<openfpm::vector<double>>> y2;
-	openfpm::vector<openfpm::vector<openfpm::vector<double>>> y2_dev;
-	openfpm::vector<std::string> yn2;
-
-	if (names.size() == 0)
-		return;
-
-	for (size_t i = 0 ; i < names.size() ; i++)
-		yn2.add(names.get(i));
-
-	for (size_t i = 0; i < xp.size() ; i++)
-		x.add(xp.get(i));
-
-	yp_mean.save(file_mean_save);
-	yp_dev.save(file_var_save);
-
-	if (y_ref_mean.size() != 0 && yp_mean.size() != 0 && yp_mean.get(0).size() != 0)
-	{
-		// We reconstruct y and yn
-
-		y2.clear();
-		yn2.clear();
-
-		for (size_t i = 0 ; i < yp_mean.get(0).get(0).size() ; i++)
-		{
-			yn2.add(names.get(i));
-			yn2.add("interval");
-			yn2.add("interval");
-		}
-
-		y2.resize(yp_mean.size());
-		for (size_t r = 0; r < yp_mean.size(); r++)
-		{
-			int warning_level = -1;
-
-			y2.get(r).resize(yp_mean.get(r).size());
-			for (size_t k = 0; k < yp_mean.get(r).size(); k++)
-			{
-
-				// Number of graph points
-				for (size_t g = 0 ; g < yp_mean.get(r).get(k).size() ; g++)
-				{
-					// Time for construction hilbert and random
-					y2.get(r).get(k).add(yp_mean.get(r).get(k).get(g));
-					y2.get(r).get(k).add(y_ref_mean.get(r).get(k).get(g) - 3.0*y_ref_dev.get(r).get(k).get(g));
-					y2.get(r).get(k).add(y_ref_mean.get(r).get(k).get(g) + 3.0*y_ref_dev.get(r).get(k).get(g));
+							  bool use_log);
 
-					warning_set(warning_level,yp_mean.get(r).get(k).get(g),y_ref_mean.get(r).get(k).get(g),y_ref_dev.get(r).get(k).get(g));
-				}
-			}
-
-			warning_vlevel.add(warning_level);
-		}
-	}
-	else
-	{
-		return;
-	}
-
-	// Calculation time graphs report
-
-	// Google charts options
-	GCoptions options2;
-
-	options2.yAxis = std::string(y_string);
-	options2.xAxis = std::string(x_string);
-	options2.lineWidth = 4;
-
-	for (size_t i = 0; i < y2.size() ; i++)
-	{
-		std::string chart_area;
-		if (warning_vlevel.size() != 0)
-			addchartarea(chart_area,warning_vlevel.get(i));
-
-		if (use_log == true)
-		{options2.more = GC_Y_LOG + "," + GC_ZOOM + chart_area;}
-		else
-		{options2.more = GC_ZOOM + chart_area;}
-
-		options2.title = gnames.get(i);
-		cg.AddLinesGraph(x,y2.get(i),yn2,options2);
-	}
-}
 
 #endif /* SRC_VECTOR_VECTOR_DIST_PERFORMANCE_UTIL_HPP_ */
diff --git a/src/Vector/vector_dist_HDF5_chckpnt_restart_test.hpp b/src/Vector/tests/vector_dist_HDF5_chckpnt_restart_test.cpp
similarity index 85%
rename from src/Vector/vector_dist_HDF5_chckpnt_restart_test.hpp
rename to src/Vector/tests/vector_dist_HDF5_chckpnt_restart_test.cpp
index 66ece4bd4d9292da3fbf4f1a34da868dbb08597b..a843c42ef5d4e9c32b5b212f911ad18aae8da084 100644
--- a/src/Vector/vector_dist_HDF5_chckpnt_restart_test.hpp
+++ b/src/Vector/tests/vector_dist_HDF5_chckpnt_restart_test.cpp
@@ -4,16 +4,15 @@
  *  Created on: Jun 12, 2016
  *      Author: Yaroslav Zaluzhnyi
  */
+#define BOOST_TEST_DYN_LINK
+#include <boost/test/unit_test.hpp>
 
-#ifndef SRC_VECTOR_VECTOR_DIST_HDF5_CHCKPNT_RESTART_TEST_HPP_
-#define SRC_VECTOR_VECTOR_DIST_HDF5_CHCKPNT_RESTART_TEST_HPP_
-
-#include "vector_dist.hpp"
+#include "Vector/vector_dist.hpp"
 #include "Packer_Unpacker/Pack_selector.hpp"
 #include "Packer_Unpacker/Packer.hpp"
 #include "Packer_Unpacker/Unpacker.hpp"
 #include "Vector/performance/vector_dist_performance_util.hpp"
-
+#include "NN/CellList/CellList_util.hpp"
 
 #include "hdf5.h"
 
@@ -46,7 +45,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_hdf5_save_test )
 	// ghost
 	Ghost<dim,float> ghost(1.0/(Ng-2));
 
-	vector_dist<dim,float, aggregate<float[dim]>, CartDecomposition<dim,float> > vd(0,box,bc,ghost);
+	vector_dist<dim,float, aggregate<float[dim]> > vd(0,box,bc,ghost);
 
 	// Put particles
 
@@ -85,7 +84,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_hdf5_save_test )
 	// Save the vector
     vd.save("vector_dist.h5");
 
-    vector_dist<dim,float, aggregate<float[dim]>, CartDecomposition<dim,float> > vd2(0,box,bc,ghost);
+    vector_dist<dim,float, aggregate<float[dim]> > vd2(0,box,bc,ghost);
 
     vd2.load("vector_dist.h5");
 
@@ -143,7 +142,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_hdf5_load_test )
 	// ghost
 	Ghost<dim,float> ghost(1.0/(Ng-2));
 
-	vector_dist<dim,float, aggregate<float[dim]>, CartDecomposition<dim,float> > vd(0,box,bc,ghost);
+	vector_dist<dim,float, aggregate<float[dim]> > vd(0,box,bc,ghost);
 
 	// Load the vector
     vd.load("test_data/vector_dist_24.h5");
@@ -177,5 +176,3 @@ BOOST_AUTO_TEST_CASE( vector_dist_hdf5_load_test )
 
 BOOST_AUTO_TEST_SUITE_END()
 
-
-#endif /* SRC_VECTOR_VECTOR_DIST_HDF5_CHCKPNT_RESTART_TEST_HPP_ */
diff --git a/src/Vector/vector_dist_MP_unit_tests.hpp b/src/Vector/tests/vector_dist_MP_unit_tests.cpp
similarity index 98%
rename from src/Vector/vector_dist_MP_unit_tests.hpp
rename to src/Vector/tests/vector_dist_MP_unit_tests.cpp
index 807b7a259f435db2839f345097e2e343d38542f8..381da62217d93ea2605ed84157085f203d4de964 100644
--- a/src/Vector/vector_dist_MP_unit_tests.hpp
+++ b/src/Vector/tests/vector_dist_MP_unit_tests.cpp
@@ -5,10 +5,12 @@
  *      Author: i-bird
  */
 
-#ifndef SRC_VECTOR_VECTOR_DIST_MP_UNIT_TESTS_HPP_
-#define SRC_VECTOR_VECTOR_DIST_MP_UNIT_TESTS_HPP_
+#define BOOST_TEST_DYN_LINK
+#include <boost/test/unit_test.hpp>
 
 #include "Vector/vector_dist_multiphase_functions.hpp"
+#include "VCluster/VCluster.hpp"
+#include "Vector/vector_dist.hpp"
 
 BOOST_AUTO_TEST_SUITE( vector_dist_multiphase_test )
 
@@ -426,4 +428,3 @@ BOOST_AUTO_TEST_CASE( vector_dist_multiphase_cell_list_sym_test )
 
 BOOST_AUTO_TEST_SUITE_END()
 
-#endif /* SRC_VECTOR_VECTOR_DIST_MP_UNIT_TESTS_HPP_ */
diff --git a/src/Vector/vector_dist_NN_tests.hpp b/src/Vector/tests/vector_dist_NN_tests.cpp
similarity index 88%
rename from src/Vector/vector_dist_NN_tests.hpp
rename to src/Vector/tests/vector_dist_NN_tests.cpp
index 0b38a0e7a24c816185213c6d389bb74ebf06efc1..9c6c79f1a849a3ef7ed4a642b53ba9fa878fe535 100644
--- a/src/Vector/vector_dist_NN_tests.hpp
+++ b/src/Vector/tests/vector_dist_NN_tests.cpp
@@ -5,11 +5,16 @@
  *      Author: i-bird
  */
 
-#ifndef SRC_VECTOR_VECTOR_DIST_NN_TESTS_HPP_
-#define SRC_VECTOR_VECTOR_DIST_NN_TESTS_HPP_
+#define BOOST_TEST_DYN_LINK
+#include <boost/test/unit_test.hpp>
 
+#include "VCluster/VCluster.hpp"
+#include "Vector/vector_dist.hpp"
 
-BOOST_AUTO_TEST_CASE( vector_dist_full_NN )
+extern void print_test_v(std::string test, size_t sz);
+
+template<typename VerletList>
+void test_full_nn(long int k)
 {
 	Vcluster & v_cl = create_vcluster();
 
@@ -22,16 +27,10 @@ BOOST_AUTO_TEST_CASE( vector_dist_full_NN )
     std::default_random_engine eg;
     std::uniform_real_distribution<float> ud(0.0f, 1.0f);
 
-#ifdef TEST_COVERAGE_MODE
-    long int k = 50 * v_cl.getProcessingUnits();
-#else
-    long int k = 750 * v_cl.getProcessingUnits();
-#endif
-
 	long int big_step = k / 4;
 	big_step = (big_step == 0)?1:big_step;
 
-	print_test("Testing 3D full NN search k=",k);
+	print_test_v("Testing 3D full NN search k=",k);
 	BOOST_TEST_CHECKPOINT( "Testing 3D full NN search k=" << k );
 
 	Box<3,float> box({0.0,0.0,0.0},{1.0,1.0,1.0});
@@ -134,14 +133,14 @@ BOOST_AUTO_TEST_CASE( vector_dist_full_NN )
 
 		///////////////////////////////////
 
-		auto NNv = vd.getVerlet(r_cut*1.0001);
+		auto NNv = vd.template getVerlet<VerletList>(r_cut*1.0001);
 
 		it = vd.getDomainIterator();
 
 		while (it.isNext())
 		{
 			Point<3,float> xp = vd.getPos(it.get());
-			auto Np = NNv.getNNIterator<NO_CHECK>(it.get().getKey());
+			auto Np = NNv.template getNNIterator<NO_CHECK>(it.get().getKey());
 
 			list_idx2.get(it.get().getKey()).clear();
 
@@ -185,7 +184,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_full_NN )
 		while (it.isNext())
 		{
 			Point<3,float> xp = vd.getPos(it.get());
-			auto Np = NNv.getNNIterator<NO_CHECK>(it.get().getKey());
+			auto Np = NNv.template getNNIterator<NO_CHECK>(it.get().getKey());
 
 			list_idx2.get(it.get().getKey()).clear();
 
@@ -221,6 +220,24 @@ BOOST_AUTO_TEST_CASE( vector_dist_full_NN )
 	}
 }
 
+BOOST_AUTO_TEST_CASE( vector_dist_full_NN )
+{
+	auto & v_cl = create_vcluster();
+
+#ifdef TEST_COVERAGE_MODE
+    long int k = 50 * v_cl.getProcessingUnits();
+#else
+    long int k = 750 * v_cl.getProcessingUnits();
+#endif
+
+	test_full_nn<VERLET_MEMFAST(3,float)>(k);
+
+	k /= 2;
+	test_full_nn<VERLET_MEMBAL(3,float)>(k);
+	k /= 2;
+	test_full_nn<VERLET_MEMMW(3,float)>(k);
+}
+
 BOOST_AUTO_TEST_CASE( vector_dist_particle_iteration )
 {
 	Vcluster & v_cl = create_vcluster();
@@ -236,7 +253,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_particle_iteration )
 
     long int k = 750 * v_cl.getProcessingUnits();
 
-	print_test("Testing 3D particle cell iterator=",k);
+	print_test_v("Testing 3D particle cell iterator=",k);
 	BOOST_TEST_CHECKPOINT( "Testing 3D full NN search k=" << k );
 
 	Box<3,float> box({0.0,0.0,0.0},{1.0,1.0,1.0});
@@ -302,4 +319,3 @@ BOOST_AUTO_TEST_CASE( vector_dist_particle_iteration )
 	BOOST_REQUIRE_EQUAL((long int)count,k);
 }
 
-#endif /* SRC_VECTOR_VECTOR_DIST_NN_TESTS_HPP_ */
diff --git a/src/Vector/vector_dist_cell_list_tests.hpp b/src/Vector/tests/vector_dist_cell_list_tests.cpp
similarity index 83%
rename from src/Vector/vector_dist_cell_list_tests.hpp
rename to src/Vector/tests/vector_dist_cell_list_tests.cpp
index 0962a3cd5f2439fb252c01aceefe21eaeeb195a8..e8875868c2af684cb2ac093fd1c2bd91b7e790ef 100644
--- a/src/Vector/vector_dist_cell_list_tests.hpp
+++ b/src/Vector/tests/vector_dist_cell_list_tests.cpp
@@ -6,14 +6,18 @@
  */
 
 #include "config.h"
+#define BOOST_TEST_DYN_LINK
+#include <boost/test/unit_test.hpp>
+#include "Point_test.hpp"
+#include "Vector/performance/vector_dist_performance_common.hpp"
+#include "Vector/vector_dist.hpp"
 
-#ifndef SRC_VECTOR_VECTOR_DIST_CELL_LIST_TESTS_HPP_
-#define SRC_VECTOR_VECTOR_DIST_CELL_LIST_TESTS_HPP_
-
+extern void print_test_v(std::string test, size_t sz);
+extern long int decrement(long int k, long int step);
 
 ///////////////////////// test hilb ///////////////////////////////
 
-BOOST_AUTO_TEST_CASE( vector_dist_reorder_2d_test )
+void test_reorder_sfc(reorder_opt opt)
 {
 	Vcluster & v_cl = create_vcluster();
 
@@ -35,19 +39,19 @@ BOOST_AUTO_TEST_CASE( vector_dist_reorder_2d_test )
 	long int big_step = k / 4;
 	big_step = (big_step == 0)?1:big_step;
 
-	print_test_v( "Testing 2D vector with hilbert curve reordering k<=",k);
+	print_test_v( "Testing 2D vector with sfc curve reordering k<=",k);
 
 	// 2D test
 	for ( ; k >= 2 ; k-= decrement(k,big_step) )
 	{
-		BOOST_TEST_CHECKPOINT( "Testing 2D vector with hilbert curve reordering k=" << k );
+		BOOST_TEST_CHECKPOINT( "Testing 2D vector with sfc curve reordering k=" << k );
 
 		Box<2,float> box({0.0,0.0},{1.0,1.0});
 
 		// Boundary conditions
 		size_t bc[2]={NON_PERIODIC,NON_PERIODIC};
 
-		vector_dist<2,float, Point_test<float>, CartDecomposition<2,float> > vd(k,box,bc,Ghost<2,float>(0.01));
+		vector_dist<2,float, Point_test<float> > vd(k,box,bc,Ghost<2,float>(0.01));
 
 		auto it = vd.getIterator();
 
@@ -73,7 +77,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_reorder_2d_test )
 		int32_t m = 6;
 
 		//Reorder a vector
-		vd.reorder(m);
+		vd.reorder(m,opt);
 
 		// Create second cell list
 		auto NN2 = vd.getCellList(0.01,true);
@@ -89,6 +93,12 @@ BOOST_AUTO_TEST_CASE( vector_dist_reorder_2d_test )
 	}
 }
 
+BOOST_AUTO_TEST_CASE( vector_dist_reorder_2d_test )
+{
+	test_reorder_sfc(reorder_opt::HILBERT);
+	test_reorder_sfc(reorder_opt::LINEAR);
+}
+
 BOOST_AUTO_TEST_CASE( vector_dist_cl_random_vs_hilb_forces_test )
 {
 	Vcluster & v_cl = create_vcluster();
@@ -122,7 +132,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_cl_random_vs_hilb_forces_test )
 
 		std::string str("Testing " + std::to_string(dim) + "D vector's forces (random vs hilb celllist) k<=");
 
-		vector_dist_test::print_test_v(str,k);
+		print_test_v(str,k);
 
 		//For different number of particles
 		for (size_t k_int = k ; k_int >= cl_k_min ; k_int/=2 )
@@ -143,9 +153,9 @@ BOOST_AUTO_TEST_CASE( vector_dist_cl_random_vs_hilb_forces_test )
 			for (size_t i = 0; i < dim; i++)
 				bc[i] = PERIODIC;
 
-			vector_dist<dim,float, aggregate<float[dim]>, CartDecomposition<dim,float> > vd(k_int,box,bc,Ghost<dim,float>(ghost_part));
+			vector_dist<dim,float, aggregate<float[dim]> > vd(k_int,box,bc,Ghost<dim,float>(ghost_part));
 
-			vector_dist<dim,float, aggregate<float[dim]>, CartDecomposition<dim,float> > vd2(k_int,box,bc,Ghost<dim,float>(ghost_part));
+			vector_dist<dim,float, aggregate<float[dim]> > vd2(k_int,box,bc,Ghost<dim,float>(ghost_part));
 
 			// Initialize dist vectors
 			vd_initialize_double<dim>(vd, vd2, v_cl, k_int);
@@ -180,7 +190,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_cl_random_vs_hilb_forces_test )
 				vect_dist_key_dx key = it_v2.get();
 
 				for (size_t i = 0; i < dim; i++)
-					avg.get(i) += fabs(vd.getProp<0>(key)[i]);
+				{avg.get(i) += fabs(vd.getProp<0>(key)[i]);}
 
 				++count;
 				++it_v2;
@@ -246,7 +256,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_cl_random_vs_reorder_forces_test )
 
 		std::string str("Testing " + std::to_string(dim) + "D vector's forces (random vs reorder) k<=");
 
-		vector_dist_test::print_test_v(str,k);
+		print_test_v(str,k);
 
 		//For different number of particles
 		for (size_t k_int = k ; k_int >= cl_k_min ; k_int/=2 )
@@ -267,7 +277,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_cl_random_vs_reorder_forces_test )
 			for (size_t i = 0; i < dim; i++)
 				bc[i] = PERIODIC;
 
-			vector_dist<dim,float, aggregate<float[dim], float[dim]>, CartDecomposition<dim,float> > vd(k_int,box,bc,Ghost<dim,float>(ghost_part));
+			vector_dist<dim,float, aggregate<float[dim], float[dim]> > vd(k_int,box,bc,Ghost<dim,float>(ghost_part));
 
 			// Initialize vd
 			vd_initialize<dim,decltype(vd)>(vd, v_cl, k_int);
@@ -360,7 +370,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_cell_list )
 	long int big_step = k / 4;
 	big_step = (big_step == 0)?1:big_step;
 
-	print_test("Testing 3D periodic vector symmetric cell-list k=",k);
+	print_test_v("Testing 3D periodic vector symmetric cell-list k=",k);
 	BOOST_TEST_CHECKPOINT( "Testing 3D periodic vector symmetric cell-list k=" << k );
 
 	Box<3,float> box({-L,-L,-L},{L,L,L});
@@ -569,7 +579,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_crs_cell_list )
 	long int big_step = k / 4;
 	big_step = (big_step == 0)?1:big_step;
 
-	print_test("Testing 3D periodic vector symmetric crs cell-list k=",k);
+	print_test_v("Testing 3D periodic vector symmetric crs cell-list k=",k);
 	BOOST_TEST_CHECKPOINT( "Testing 3D periodic vector symmetric crs cell-list k=" << k );
 
 	Box<3,float> box({-L,-L,-L},{L,L,L});
@@ -776,7 +786,8 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_crs_cell_list )
 	BOOST_REQUIRE_EQUAL(ret,true);
 }
 
-BOOST_AUTO_TEST_CASE( vector_dist_symmetric_verlet_list )
+template<typename VerletList>
+void test_vd_symmetric_verlet_list()
 {
 	Vcluster & v_cl = create_vcluster();
 
@@ -796,7 +807,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_verlet_list )
 	long int big_step = k / 4;
 	big_step = (big_step == 0)?1:big_step;
 
-	print_test("Testing 3D periodic vector symmetric cell-list k=",k);
+	print_test_v("Testing 3D periodic vector symmetric cell-list k=",k);
 	BOOST_TEST_CHECKPOINT( "Testing 3D periodic vector symmetric verlet-list k=" << k );
 
 	Box<3,float> box({-L,-L,-L},{L,L,L});
@@ -839,9 +850,9 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_verlet_list )
 
 		// Fill some properties randomly
 
-		vd.getPropWrite<0>(key) = 0;
-		vd.getPropWrite<1>(key) = 0;
-		vd.getPropWrite<2>(key) = key.getKey() + start;
+		vd.template getPropWrite<0>(key) = 0;
+		vd.template getPropWrite<1>(key) = 0;
+		vd.template getPropWrite<2>(key) = key.getKey() + start;
 
 		++it;
 	}
@@ -849,9 +860,9 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_verlet_list )
 	vd.map();
 
 	// sync the ghost
-	vd.ghost_get<0,2>();
+	vd.template ghost_get<0,2>();
 
-	auto NN = vd.getVerlet(r_cut);
+	auto NN = vd.template getVerlet<VerletList>(r_cut);
 	auto p_it = vd.getDomainIterator();
 
 	while (p_it.isNext())
@@ -883,10 +894,10 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_verlet_list )
 
 			if (distance < r_cut )
 			{
-				vd.getPropWrite<0>(p)++;
-				vd.getPropWrite<3>(p).add();
-				vd.getPropWrite<3>(p).last().xq = xq;
-				vd.getPropWrite<3>(p).last().id = vd.getPropRead<2>(q);
+				vd.template getPropWrite<0>(p)++;
+				vd.template getPropWrite<3>(p).add();
+				vd.template getPropWrite<3>(p).last().xq = xq;
+				vd.template getPropWrite<3>(p).last().id = vd.template getPropRead<2>(q);
 			}
 
 			++Np;
@@ -897,7 +908,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_verlet_list )
 
 	// We now try symmetric  Cell-list
 
-	auto NN2 = vd.getVerletSym(r_cut);
+	auto NN2 = vd.template getVerletSym<VerletList>(r_cut);
 
 	auto p_it2 = vd.getDomainIterator();
 
@@ -907,7 +918,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_verlet_list )
 
 		Point<3,float> xp = vd.getPosRead(p);
 
-		auto Np = NN2.getNNIterator<NO_CHECK>(p.getKey());
+		auto Np = NN2.template getNNIterator<NO_CHECK>(p.getKey());
 
 		while (Np.isNext())
 		{
@@ -930,16 +941,16 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_verlet_list )
 
 			if (distance < r_cut )
 			{
-				vd.getPropWrite<1>(p)++;
-				vd.getPropWrite<1>(q)++;
+				vd.template getPropWrite<1>(p)++;
+				vd.template getPropWrite<1>(q)++;
 
-				vd.getPropWrite<4>(p).add();
-				vd.getPropWrite<4>(q).add();
+				vd.template getPropWrite<4>(p).add();
+				vd.template getPropWrite<4>(q).add();
 
-				vd.getPropWrite<4>(p).last().xq = xq;
-				vd.getPropWrite<4>(q).last().xq = xp;
-				vd.getPropWrite<4>(p).last().id = vd.getPropRead<2>(q);
-				vd.getPropWrite<4>(q).last().id = vd.getPropRead<2>(p);
+				vd.template getPropWrite<4>(p).last().xq = xq;
+				vd.template getPropWrite<4>(q).last().xq = xp;
+				vd.template getPropWrite<4>(p).last().id = vd.template getPropRead<2>(q);
+				vd.template getPropWrite<4>(q).last().id = vd.template getPropRead<2>(p);
 			}
 
 			++Np;
@@ -948,8 +959,8 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_verlet_list )
 		++p_it2;
 	}
 
-	vd.ghost_put<add_,1>();
-	vd.ghost_put<merge_,4>();
+	vd.template ghost_put<add_,1>();
+	vd.template ghost_put<merge_,4>();
 
 	auto p_it3 = vd.getDomainIterator();
 
@@ -958,15 +969,15 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_verlet_list )
 	{
 		auto p = p_it3.get();
 
-		ret &= vd.getPropRead<1>(p) == vd.getPropRead<0>(p);
+		ret &= vd.template getPropRead<1>(p) == vd.template getPropRead<0>(p);
 
-		vd.getPropWrite<3>(p).sort();
-		vd.getPropWrite<4>(p).sort();
+		vd.template getPropWrite<3>(p).sort();
+		vd.template getPropWrite<4>(p).sort();
 
-		ret &= vd.getPropRead<3>(p).size() == vd.getPropRead<4>(p).size();
+		ret &= vd.template getPropRead<3>(p).size() == vd.template getPropRead<4>(p).size();
 
-		for (size_t i = 0 ; i < vd.getPropRead<3>(p).size() ; i++)
-			ret &= vd.getPropRead<3>(p).get(i).id == vd.getPropRead<4>(p).get(i).id;
+		for (size_t i = 0 ; i < vd.template getPropRead<3>(p).size() ; i++)
+			ret &= vd.template getPropRead<3>(p).get(i).id == vd.template getPropRead<4>(p).get(i).id;
 
 		if (ret == false)
 			break;
@@ -977,7 +988,15 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_verlet_list )
 	BOOST_REQUIRE_EQUAL(ret,true);
 }
 
-BOOST_AUTO_TEST_CASE( vector_dist_symmetric_verlet_list_no_bottom )
+BOOST_AUTO_TEST_CASE( vector_dist_symmetric_verlet_list )
+{
+	test_vd_symmetric_verlet_list<VERLET_MEMFAST(3,float)>();
+	test_vd_symmetric_verlet_list<VERLET_MEMBAL(3,float)>();
+	test_vd_symmetric_verlet_list<VERLET_MEMMW(3,float)>();
+}
+
+template<typename VerletList>
+void vector_sym_verlet_list_nb()
 {
 	Vcluster & v_cl = create_vcluster();
 
@@ -997,7 +1016,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_verlet_list_no_bottom )
 	long int big_step = k / 4;
 	big_step = (big_step == 0)?1:big_step;
 
-	print_test("Testing 3D periodic vector symmetric cell-list no bottom k=",k);
+	print_test_v("Testing 3D periodic vector symmetric cell-list no bottom k=",k);
 	BOOST_TEST_CHECKPOINT( "Testing 3D periodic vector symmetric cell-list no bottom k=" << k );
 
 	Box<3,float> box({-L,-L,-L},{L,L,L});
@@ -1051,13 +1070,13 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_verlet_list_no_bottom )
 
 			// Fill some properties randomly
 
-			vd.getPropWrite<0>(key) = 0;
-			vd.getPropWrite<1>(key) = 0;
-			vd.getPropWrite<2>(key) = key.getKey() + start;
+			vd.template getPropWrite<0>(key) = 0;
+			vd.template getPropWrite<1>(key) = 0;
+			vd.template getPropWrite<2>(key) = key.getKey() + start;
 
-			vd2.getPropWrite<0>(key) = 0;
-			vd2.getPropWrite<1>(key) = 0;
-			vd2.getPropWrite<2>(key) = key.getKey() + start;
+			vd2.template getPropWrite<0>(key) = 0;
+			vd2.template getPropWrite<1>(key) = 0;
+			vd2.template getPropWrite<2>(key) = key.getKey() + start;
 
 			++it;
 		}
@@ -1066,10 +1085,10 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_verlet_list_no_bottom )
 		vd2.map();
 
 		// sync the ghost
-		vd.ghost_get<0,2>();
-		vd2.ghost_get<0,2>();
+		vd.template ghost_get<0,2>();
+		vd2.template ghost_get<0,2>();
 
-		auto NN = vd.getVerlet(r_cut);
+		auto NN = vd.template getVerlet<VerletList>(r_cut);
 		auto p_it = vd.getDomainIterator();
 
 		while (p_it.isNext())
@@ -1101,10 +1120,10 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_verlet_list_no_bottom )
 
 				if (distance < r_cut )
 				{
-					vd.getPropWrite<0>(p)++;
-					vd.getPropWrite<3>(p).add();
-					vd.getPropWrite<3>(p).last().xq = xq;
-					vd.getPropWrite<3>(p).last().id = vd.getPropRead<2>(q);
+					vd.template getPropWrite<0>(p)++;
+					vd.template getPropWrite<3>(p).add();
+					vd.template getPropWrite<3>(p).last().xq = xq;
+					vd.template getPropWrite<3>(p).last().id = vd.template getPropRead<2>(q);
 				}
 
 				++Np;
@@ -1115,7 +1134,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_verlet_list_no_bottom )
 
 		// We now try symmetric  Cell-list
 
-		auto NN2 = vd2.getVerletSym(r_cut);
+		auto NN2 = vd2.template getVerletSym<VerletList>(r_cut);
 
 		auto p_it2 = vd2.getDomainIterator();
 
@@ -1125,7 +1144,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_verlet_list_no_bottom )
 
 			Point<3,float> xp = vd2.getPosRead(p);
 
-			auto Np = NN2.getNNIterator<NO_CHECK>(p.getKey());
+			auto Np = NN2.template getNNIterator<NO_CHECK>(p.getKey());
 
 			while (Np.isNext())
 			{
@@ -1148,16 +1167,16 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_verlet_list_no_bottom )
 
 				if (distance < r_cut )
 				{
-					vd2.getPropWrite<1>(p)++;
-					vd2.getPropWrite<1>(q)++;
+					vd2.template getPropWrite<1>(p)++;
+					vd2.template getPropWrite<1>(q)++;
 
-					vd2.getPropWrite<4>(p).add();
-					vd2.getPropWrite<4>(q).add();
+					vd2.template getPropWrite<4>(p).add();
+					vd2.template getPropWrite<4>(q).add();
 
-					vd2.getPropWrite<4>(p).last().xq = xq;
-					vd2.getPropWrite<4>(q).last().xq = xp;
-					vd2.getPropWrite<4>(p).last().id = vd2.getPropRead<2>(q);
-					vd2.getPropWrite<4>(q).last().id = vd2.getPropRead<2>(p);
+					vd2.template getPropWrite<4>(p).last().xq = xq;
+					vd2.template getPropWrite<4>(q).last().xq = xp;
+					vd2.template getPropWrite<4>(p).last().id = vd2.template getPropRead<2>(q);
+					vd2.template getPropWrite<4>(q).last().id = vd2.template getPropRead<2>(p);
 				}
 
 				++Np;
@@ -1167,8 +1186,8 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_verlet_list_no_bottom )
 			++p_it2;
 		}
 
-		vd2.ghost_put<add_,1>();
-		vd2.ghost_put<merge_,4>();
+		vd2.template ghost_put<add_,1>();
+		vd2.template ghost_put<merge_,4>();
 
 #ifdef SE_CLASS3
 		vd2.getDomainIterator();
@@ -1181,16 +1200,15 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_verlet_list_no_bottom )
 		{
 			auto p = p_it3.get();
 
-			ret &= vd2.getPropRead<1>(p) == vd.getPropRead<0>(p);
-
+			ret &= vd2.template getPropRead<1>(p) == vd.template getPropRead<0>(p);
 
-			vd.getPropWrite<3>(p).sort();
-			vd2.getPropWrite<4>(p).sort();
+			vd.template getPropWrite<3>(p).sort();
+			vd2.template getPropWrite<4>(p).sort();
 
-			ret &= vd.getPropRead<3>(p).size() == vd2.getPropRead<4>(p).size();
+			ret &= vd.template getPropRead<3>(p).size() == vd2.template getPropRead<4>(p).size();
 
-			for (size_t i = 0 ; i < vd.getPropRead<3>(p).size() ; i++)
-				ret &= vd.getPropRead<3>(p).get(i).id == vd2.getPropRead<4>(p).get(i).id;
+			for (size_t i = 0 ; i < vd.template getPropRead<3>(p).size() ; i++)
+				ret &= vd.template getPropRead<3>(p).get(i).id == vd2.template getPropRead<4>(p).get(i).id;
 
 			if (ret == false)
 				break;
@@ -1202,7 +1220,18 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_verlet_list_no_bottom )
 	}
 }
 
-template<typename part_prop> void test_crs_full(vector_dist<3,float, part_prop > & vd,
+BOOST_AUTO_TEST_CASE( vector_dist_symmetric_verlet_list_no_bottom )
+{
+	vector_sym_verlet_list_nb<VERLET_MEMFAST(3,float)>();
+	vector_sym_verlet_list_nb<VERLET_MEMBAL(3,float)>();
+	vector_sym_verlet_list_nb<VERLET_MEMMW(3,float)>();
+
+	vector_sym_verlet_list_nb<VERLET_MEMFAST_INT(3,float)>();
+	vector_sym_verlet_list_nb<VERLET_MEMBAL_INT(3,float)>();
+	vector_sym_verlet_list_nb<VERLET_MEMMW_INT(3,float)>();
+}
+
+template<typename VerletList, typename part_prop> void test_crs_full(vector_dist<3,float, part_prop > & vd,
 		                                        vector_dist<3,float, part_prop > & vd2,
 												std::default_random_engine & eg,
 												std::uniform_real_distribution<float> & ud,
@@ -1243,7 +1272,7 @@ template<typename part_prop> void test_crs_full(vector_dist<3,float, part_prop >
 	vd.template ghost_get<0,2>();
 	vd2.template ghost_get<0,2>();
 
-	auto NN = vd.getVerlet(r_cut);
+	auto NN = vd.template getVerlet<VerletList>(r_cut);
 	auto p_it = vd.getDomainIterator();
 
 	while (p_it.isNext())
@@ -1289,7 +1318,7 @@ template<typename part_prop> void test_crs_full(vector_dist<3,float, part_prop >
 
 	// We now try symmetric Verlet-list Crs scheme
 
-	auto NN2 = vd2.getVerletCrs(r_cut);
+	auto NN2 = vd2.template getVerletCrs<VerletList>(r_cut);
 
 	// Because iterating across particles in the CSR scheme require a Cell-list
 	auto p_it2 = vd2.getParticleIteratorCRS_Cell(NN2.getInternalCellList());
@@ -1378,7 +1407,7 @@ template<typename part_prop> void test_crs_full(vector_dist<3,float, part_prop >
 	BOOST_REQUIRE_EQUAL(ret,true);
 }
 
-
+template<typename VerletList>
 void test_csr_verlet_list()
 {
 	Vcluster & v_cl = create_vcluster();
@@ -1399,7 +1428,7 @@ void test_csr_verlet_list()
 	long int big_step = k / 4;
 	big_step = (big_step == 0)?1:big_step;
 
-	print_test("Testing 3D periodic vector symmetric cell-list k=",k);
+	print_test_v("Testing 3D periodic vector symmetric cell-list k=",k);
 	BOOST_TEST_CHECKPOINT( "Testing 3D periodic vector symmetric cell-list k=" << k );
 
 	Box<3,float> box({-L,-L,-L},{L,L,L});
@@ -1435,9 +1464,10 @@ void test_csr_verlet_list()
 	vector_dist<3,float, part_prop > vd2(k,box,bc,ghost2,BIND_DEC_TO_GHOST);
 	size_t start = vd.init_size_accum(k);
 
-	test_crs_full(vd,vd2,eg,ud,start,r_cut);
+	test_crs_full<VerletList>(vd,vd2,eg,ud,start,r_cut);
 }
 
+template<typename VerletList>
 void test_csr_verlet_list_override()
 {
 	Vcluster & v_cl = create_vcluster();
@@ -1458,7 +1488,7 @@ void test_csr_verlet_list_override()
 	long int big_step = k / 4;
 	big_step = (big_step == 0)?1:big_step;
 
-	print_test("Testing 3D periodic vector symmetric cell-list k=",k);
+	print_test_v("Testing 3D periodic vector symmetric cell-list k=",k);
 	BOOST_TEST_CHECKPOINT( "Testing 3D periodic vector symmetric cell-list k=" << k );
 
 	Box<3,float> box({-L,-L,-L},{L,L,L});
@@ -1508,20 +1538,25 @@ void test_csr_verlet_list_override()
 	vector_dist<3,float, part_prop > vd2(k,box,bc,ghost2,BIND_DEC_TO_GHOST,gdist2_d);
 	size_t start = vd.init_size_accum(k);
 
-	test_crs_full(vd,vd2,eg,ud,start,r_cut);
+	test_crs_full<VerletList>(vd,vd2,eg,ud,start,r_cut);
 }
 
 BOOST_AUTO_TEST_CASE( vector_dist_symmetric_crs_verlet_list )
 {
-	test_csr_verlet_list();
+	test_csr_verlet_list<VERLET_MEMFAST(3,float)>();
+	test_csr_verlet_list<VERLET_MEMBAL(3,float)>();
+	test_csr_verlet_list<VERLET_MEMMW(3,float)>();
 }
 
 BOOST_AUTO_TEST_CASE( vector_dist_symmetric_crs_verlet_list_dec_override )
 {
-	test_csr_verlet_list_override();
+	test_csr_verlet_list_override<VERLET_MEMFAST(3,float)>();
+	test_csr_verlet_list_override<VERLET_MEMBAL(3,float)>();
+	test_csr_verlet_list_override<VERLET_MEMMW(3,float)>();
 }
 
-BOOST_AUTO_TEST_CASE( vector_dist_symmetric_crs_verlet_list_partit )
+template <typename VerletList>
+void test_vd_symmetric_crs_verlet()
 {
 	Vcluster & v_cl = create_vcluster();
 
@@ -1543,7 +1578,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_crs_verlet_list_partit )
 	long int big_step = k / 4;
 	big_step = (big_step == 0)?1:big_step;
 
-	print_test("Testing 3D periodic vector symmetric cell-list k=",k);
+	print_test_v("Testing 3D periodic vector symmetric cell-list k=",k);
 	BOOST_TEST_CHECKPOINT( "Testing 3D periodic vector symmetric cell-list k=" << k );
 
 	Box<3,float> box({-L,-L,-L},{L,L,L});
@@ -1590,7 +1625,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_crs_verlet_list_partit )
 
 	// We now try symmetric Verlet-list Crs scheme
 
-	auto NN2 = vd.getVerletCrs(r_cut);
+	auto NN2 = vd.template getVerletCrs<VerletList>(r_cut);
 
 	// Because iterating across particles in the CSR scheme require a Cell-list
 	auto p_it2 = vd.getParticleIteratorCRS_Cell(NN2.getInternalCellList());
@@ -1613,6 +1648,13 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_crs_verlet_list_partit )
 	BOOST_REQUIRE_EQUAL(ret,true);
 }
 
+BOOST_AUTO_TEST_CASE( vector_dist_symmetric_crs_verlet_list_partit )
+{
+	test_vd_symmetric_crs_verlet<VERLET_MEMFAST(3,float)>();
+	test_vd_symmetric_crs_verlet<VERLET_MEMBAL(3,float)>();
+	test_vd_symmetric_crs_verlet<VERLET_MEMMW(3,float)>();
+}
+
 BOOST_AUTO_TEST_CASE( vector_dist_checking_unloaded_processors )
 {
 	Vcluster & v_cl = create_vcluster();
@@ -1633,7 +1675,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_checking_unloaded_processors )
 	long int big_step = k / 4;
 	big_step = (big_step == 0)?1:big_step;
 
-	print_test("Testing 3D periodic vector symmetric cell-list (unload processors) k=",k);
+	print_test_v("Testing 3D periodic vector symmetric cell-list (unload processors) k=",k);
 	BOOST_TEST_CHECKPOINT( "Testing 3D periodic vector symmetric cell-list (unload processors) k=" << k );
 
 	Box<3,float> box({0,0,0},{L,L,L});
@@ -1722,7 +1764,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_cell_list_multi_type )
 	long int big_step = k / 4;
 	big_step = (big_step == 0)?1:big_step;
 
-	print_test("Testing 3D periodic vector symmetric cell-list k=",k);
+	print_test_v("Testing 3D periodic vector symmetric cell-list k=",k);
 	BOOST_TEST_CHECKPOINT( "Testing 3D periodic vector symmetric cell-list k=" << k );
 
 	Box<3,float> box({-L,-L,-L},{L,L,L});
@@ -1812,4 +1854,3 @@ BOOST_AUTO_TEST_CASE( vector_dist_cell_list_multi_type )
 	BOOST_REQUIRE_EQUAL(ret,true);
 }
 
-#endif /* SRC_VECTOR_VECTOR_DIST_CELL_LIST_TESTS_HPP_ */
diff --git a/src/Vector/vector_dist_complex_prp_unit_test.hpp b/src/Vector/tests/vector_dist_complex_prp_unit_test.cpp
similarity index 96%
rename from src/Vector/vector_dist_complex_prp_unit_test.hpp
rename to src/Vector/tests/vector_dist_complex_prp_unit_test.cpp
index bc90cf2f4f55cdedcc169212feeb360b439be0fb..8d2956ba1f77fa9e2c2731480b892264094dedad 100644
--- a/src/Vector/vector_dist_complex_prp_unit_test.hpp
+++ b/src/Vector/tests/vector_dist_complex_prp_unit_test.cpp
@@ -4,10 +4,15 @@
  *  Created on: Sep 18, 2016
  *      Author: i-bird
  */
+#define BOOST_TEST_DYN_LINK
+#include <boost/test/unit_test.hpp>
 
-#ifndef SRC_VECTOR_VECTOR_DIST_COMPLEX_PRP_UNIT_TEST_HPP_
-#define SRC_VECTOR_VECTOR_DIST_COMPLEX_PRP_UNIT_TEST_HPP_
+#include "Vector/vector_dist.hpp"
+#include "Vector/performance/vector_dist_performance_util.hpp"
+#include "vector_dist_util_unit_tests.hpp"
 
+extern void print_test_v(std::string test, size_t sz);
+extern long int decrement(long int k, long int step);
 
 BOOST_AUTO_TEST_CASE( vector_dist_periodic_complex_prp_test_use_3d )
 {
@@ -232,4 +237,3 @@ BOOST_AUTO_TEST_CASE( vector_dist_periodic_complex_prp_test_use_3d )
 }
 
 
-#endif /* SRC_VECTOR_VECTOR_DIST_COMPLEX_PRP_UNIT_TEST_HPP_ */
diff --git a/src/Vector/vector_dist_unit_test.hpp b/src/Vector/tests/vector_dist_unit_test.cpp
similarity index 93%
rename from src/Vector/vector_dist_unit_test.hpp
rename to src/Vector/tests/vector_dist_unit_test.cpp
index 6c154e19025930bcc4ec187766115a0e6fdd7622..d5fc12cf011333cc439ce3cfd88e7b863a5ef8b7 100644
--- a/src/Vector/vector_dist_unit_test.hpp
+++ b/src/Vector/tests/vector_dist_unit_test.cpp
@@ -5,23 +5,59 @@
  *      Author: Pietro Incardona
  */
 
-#ifndef VECTOR_DIST_UNIT_TEST_HPP_
-#define VECTOR_DIST_UNIT_TEST_HPP_
+#define BOOST_TEST_DYN_LINK
+#include <boost/test/unit_test.hpp>
 
 #include "config.h"
 
 #include <random>
 #include "Vector/vector_dist.hpp"
 #include "data_type/aggregate.hpp"
+#include "vector_dist_util_unit_tests.hpp"
+#include "Point_test.hpp"
 #include "Vector/performance/vector_dist_performance_common.hpp"
 
+/*! \brief Print a string about the test
+ *
+ * \param test string to print
+ * \param sz size
+ *
+ */
+void print_test_v(std::string test, size_t sz)
+{
+	if (create_vcluster().getProcessUnitID() == 0)
+		std::cout << test << " " << sz << "\n";
+}
+
+/*! \brief Get next testing step decrementing the size
+ *
+ * \param k current size
+ * \param step step currently in use
+ *
+ * \return the next step
+ *
+ */
+long int decrement(long int k, long int step)
+{
+	if (k <= 32)
+	{
+		return 1;
+	}
+	else if (k - 2*step+1 <= 0)
+	{
+		return k - 32;
+	}
+	else
+		return step;
+}
+
 /*! \brief Count the total number of particles
  *
  * \param vd distributed vector
  * \param bc boundary conditions
  *
  */
-template<unsigned int dim> size_t total_n_part_lc(vector_dist<dim,float, Point_test<float>, CartDecomposition<dim,float> > & vd, size_t (& bc)[dim])
+template<unsigned int dim, template <typename> class layout> size_t total_n_part_lc(vector_dist<dim,float, Point_test<float>,typename layout<Point_test<float>>::type, layout, CartDecomposition<dim,float> > & vd, size_t (& bc)[dim])
 {
 	Vcluster & v_cl = vd.getVC();
 	auto it2 = vd.getDomainIterator();
@@ -50,48 +86,6 @@ template<unsigned int dim> size_t total_n_part_lc(vector_dist<dim,float, Point_t
 	return cnt;
 }
 
-/*! \brief Count local and non local
- *
- * \param vd distributed vector
- * \param it iterator
- * \param bc boundary conditions
- * \param box domain box
- * \param dom_ext domain + ghost box
- * \param l_cnt local particles counter
- * \param nl_cnt non local particles counter
- * \param n_out out of domain + ghost particles counter
- *
- */
-template<unsigned int dim,typename vector_dist> inline void count_local_n_local(vector_dist & vd, vector_dist_iterator & it, size_t (& bc)[dim] , Box<dim,float> & box, Box<dim,float> & dom_ext, size_t & l_cnt, size_t & nl_cnt, size_t & n_out)
-{
-	const CartDecomposition<dim,float> & ct = vd.getDecomposition();
-
-	while (it.isNext())
-	{
-		auto key = it.get();
-		// Check if it is in the domain
-		if (box.isInsideNP(vd.getPos(key)) == true)
-		{
-			// Check if local
-			if (ct.isLocalBC(vd.getPos(key),bc) == true)
-				l_cnt++;
-			else
-				nl_cnt++;
-		}
-		else
-		{
-			nl_cnt++;
-		}
-
-		Point<dim,float> xp = vd.getPos(key);
-
-		// Check that all particles are inside the Domain + Ghost part
-		if (dom_ext.isInside(xp) == false)
-				n_out++;
-
-		++it;
-	}
-}
 
 BOOST_AUTO_TEST_SUITE( vector_dist_test )
 
@@ -101,6 +95,7 @@ void print_test(std::string test, size_t sz)
 		std::cout << test << " " << sz << "\n";
 }
 
+template<typename vector>
 void Test2D_ghost(Box<2,float> & box)
 {
 	// Communication object
@@ -150,7 +145,7 @@ void Test2D_ghost(Box<2,float> & box)
 	size_t bc[2]={NON_PERIODIC,NON_PERIODIC};
 
 	// Vector of particles
-	vector_dist<2,float, Point_test<float> > vd(g_info.size(),box,bc,g);
+	vector vd(g_info.size(),box,bc,g);
 
 	// size_t
 	size_t cobj = 0;
@@ -194,16 +189,16 @@ void Test2D_ghost(Box<2,float> & box)
 		auto key = v_it2.get();
 
 		// fill with the processor ID where these particle live
-		vd.getProp<p::s>(key) = vd.getPos(key)[0] + vd.getPos(key)[1] * 16.0f;
-		vd.getProp<p::v>(key)[0] = v_cl.getProcessUnitID();
-		vd.getProp<p::v>(key)[1] = v_cl.getProcessUnitID();
-		vd.getProp<p::v>(key)[2] = v_cl.getProcessUnitID();
+		vd.template getProp<p::s>(key) = vd.getPos(key)[0] + vd.getPos(key)[1] * 16.0f;
+		vd.template getProp<p::v>(key)[0] = v_cl.getProcessUnitID();
+		vd.template getProp<p::v>(key)[1] = v_cl.getProcessUnitID();
+		vd.template getProp<p::v>(key)[2] = v_cl.getProcessUnitID();
 
 		++v_it2;
 	}
 
 	// do a ghost get
-	vd.ghost_get<p::s,p::v>();
+	vd.template ghost_get<p::s,p::v>();
 
 	//! [Redistribute the particles and sync the ghost properties]
 
@@ -224,7 +219,7 @@ void Test2D_ghost(Box<2,float> & box)
 		auto key = g_it.get();
 
 		// Check the received data
-		BOOST_REQUIRE_EQUAL(vd.getPos(key)[0] + vd.getPos(key)[1] * 16.0f,vd.getProp<p::s>(key));
+		BOOST_REQUIRE_EQUAL(vd.getPos(key)[0] + vd.getPos(key)[1] * 16.0f,vd.template getProp<p::s>(key));
 
 		bool is_in = false;
 		size_t b = 0;
@@ -247,7 +242,7 @@ void Test2D_ghost(Box<2,float> & box)
 		BOOST_REQUIRE_EQUAL(is_in,true);
 
 		// Check that the particle come from the correct processor
-		BOOST_REQUIRE_EQUAL(vd.getProp<p::v>(key)[0],dec.getEGhostBoxProcessor(lb));
+		BOOST_REQUIRE_EQUAL(vd.template getProp<p::v>(key)[0],dec.getEGhostBoxProcessor(lb));
 
 		n_part++;
 		++g_it;
@@ -271,33 +266,28 @@ void Test2D_ghost(Box<2,float> & box)
 
 BOOST_AUTO_TEST_CASE( vector_dist_ghost )
 {
+	typedef vector_dist<2,float, Point_test<float>> vector;
+
 	Box<2,float> box({0.0,0.0},{1.0,1.0});
-	Test2D_ghost(box);
+	Test2D_ghost<vector>(box);
 
 	Box<2,float> box2({-1.0,-1.0},{2.5,2.5});
-	Test2D_ghost(box2);
+	Test2D_ghost<vector>(box2);
 }
 
-void print_test_v(std::string test, size_t sz)
+BOOST_AUTO_TEST_CASE( vector_dist_ghost_inte )
 {
-	if (create_vcluster().getProcessUnitID() == 0)
-		std::cout << test << " " << sz << "\n";
-}
+	typedef vector_dist<2,float, Point_test<float>,memory_traits_inte<Point_test<float>>::type,memory_traits_inte> vector;
 
-long int decrement(long int k, long int step)
-{
-	if (k <= 32)
-	{
-		return 1;
-	}
-	else if (k - 2*step+1 <= 0)
-	{
-		return k - 32;
-	}
-	else
-		return step;
+	Box<2,float> box({0.0,0.0},{1.0,1.0});
+	Test2D_ghost<vector>(box);
+
+	Box<2,float> box2({-1.0,-1.0},{2.5,2.5});
+	Test2D_ghost<vector>(box2);
 }
 
+
+
 BOOST_AUTO_TEST_CASE( vector_dist_iterator_test_use_2d )
 {
 	Vcluster & v_cl = create_vcluster();
@@ -741,7 +731,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_periodic_test_use_3d )
 	}
 }
 
-BOOST_AUTO_TEST_CASE( vector_dist_periodic_test_random_walk )
+void test_random_walk(size_t opt)
 {
 	Vcluster & v_cl = create_vcluster();
 
@@ -775,7 +765,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_periodic_test_random_walk )
 		Ghost<3,float> ghost(0.01 / factor);
 
 		// Distributed vector
-		vector_dist<3,float, Point_test<float>, CartDecomposition<3,float> > vd(k,box,bc,ghost);
+		vector_dist<3,float, Point_test<float> > vd(k,box,bc,ghost);
 
 		auto it = vd.getIterator();
 
@@ -809,7 +799,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_periodic_test_random_walk )
 				++it;
 			}
 
-			vd.map();
+			vd.map(opt);
 
 			vd.ghost_get<0>();
 
@@ -821,6 +811,16 @@ BOOST_AUTO_TEST_CASE( vector_dist_periodic_test_random_walk )
 	}
 }
 
+BOOST_AUTO_TEST_CASE( vector_dist_periodic_test_random_walk )
+{
+	test_random_walk(NONE);
+}
+
+BOOST_AUTO_TEST_CASE( vector_dist_periodic_test_random_walk_local_map )
+{
+	test_random_walk(MAP_LOCAL);
+}
+
 BOOST_AUTO_TEST_CASE( vector_dist_periodic_map )
 {
 	Box<3,float> box({0.0,0.0,0.0},{1.0,1.0,1.0});
@@ -835,7 +835,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_periodic_map )
 	Ghost<3,float> ghost(0.05 / factor);
 
 	// Distributed vector
-	vector_dist<3,float, Point_test<float>, CartDecomposition<3,float> > vd(1,box,bc,ghost);
+	vector_dist<3,float, Point_test<float> > vd(1,box,bc,ghost);
 
 	// put particles al 1.0, check that they go to 0.0
 
@@ -886,7 +886,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_not_periodic_map )
 	Ghost<3,float> ghost(0.05 / factor);
 
 	// Distributed vector
-	vector_dist<3,float, Point_test<float>, CartDecomposition<3,float> > vd(1,box,bc,ghost);
+	vector_dist<3,float, Point_test<float> > vd(1,box,bc,ghost);
 
 	// put particles al 1.0, check that they go to 0.0
 
@@ -941,7 +941,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_out_of_bound_policy )
 	Ghost<3,float> ghost(0.05 / factor);
 
 	// Distributed vector
-	vector_dist<3,float, Point_test<float>, CartDecomposition<3,float> > vd(100,box,bc,ghost);
+	vector_dist<3,float, Point_test<float> > vd(100,box,bc,ghost);
 
 	// put particles at out of the boundary, they must be detected and and killed
 
@@ -1025,7 +1025,7 @@ void Test_interacting(Box<3,float> & box)
 		Ghost<3,float> ghost(r_cut);
 
 		// Distributed vector
-		vector_dist<3,float, Point_test<float>, CartDecomposition<3,float> > vd(k,box,bc,ghost);
+		vector_dist<3,float, Point_test<float> > vd(k,box,bc,ghost);
 
 		auto it = vd.getIterator();
 
@@ -1165,7 +1165,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_grid_iterator )
 		Ghost<3,float> ghost(1.0/(Ng-2));
 
 		// Distributed vector
-		vector_dist<3,float, Point_test<float>, CartDecomposition<3,float> > vd(0,box,bc,ghost);
+		vector_dist<3,float, Point_test<float> > vd(0,box,bc,ghost);
 
 		// Put particles on a grid creating a Grid iterator
 		auto it = vd.getGridIterator(sz);
@@ -1239,7 +1239,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_cell_verlet_test )
 		Ghost<3,float> ghost(third_dist*1.1);
 
 		// Distributed vector
-		vector_dist<3,float, Point_test<float>, CartDecomposition<3,float> > vd(0,box,bc,ghost);
+		vector_dist<3,float, Point_test<float> > vd(0,box,bc,ghost);
 
 		// Put particles on a grid creating a Grid iterator
 		auto it = vd.getGridIterator(sz);
@@ -1284,7 +1284,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_cell_verlet_test )
 
 		// Create a verlet list for each particle
 
-		VerletList<3,float,FAST,shift<3,float>> verlet = vd.getVerlet(third_dist);
+		VerletList<3,float,Mem_fast<>,shift<3,float>> verlet = vd.getVerlet(third_dist);
 
 		bool correct = true;
 
@@ -1868,10 +1868,6 @@ BOOST_AUTO_TEST_CASE( vector_of_vector_dist )
 }
 
 
-#include "vector_dist_cell_list_tests.hpp"
-#include "vector_dist_NN_tests.hpp"
-#include "vector_dist_complex_prp_unit_test.hpp"
 
 BOOST_AUTO_TEST_SUITE_END()
 
-#endif /* VECTOR_DIST_UNIT_TEST_HPP_ */
diff --git a/src/Vector/tests/vector_dist_util_unit_tests.hpp b/src/Vector/tests/vector_dist_util_unit_tests.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f830e13fc558c0ef4da49e80995079c3c783f730
--- /dev/null
+++ b/src/Vector/tests/vector_dist_util_unit_tests.hpp
@@ -0,0 +1,57 @@
+/*
+ * vector_dist_util_unit_tests.hpp
+ *
+ *  Created on: Feb 14, 2018
+ *      Author: i-bird
+ */
+
+#ifndef SRC_VECTOR_TESTS_VECTOR_DIST_UTIL_UNIT_TESTS_HPP_
+#define SRC_VECTOR_TESTS_VECTOR_DIST_UTIL_UNIT_TESTS_HPP_
+
+
+/*! \brief Count local and non local
+ *
+ * \param vd distributed vector
+ * \param it iterator
+ * \param bc boundary conditions
+ * \param box domain box
+ * \param dom_ext domain + ghost box
+ * \param l_cnt local particles counter
+ * \param nl_cnt non local particles counter
+ * \param n_out out of domain + ghost particles counter
+ *
+ */
+template<unsigned int dim,typename vector_dist> inline void count_local_n_local(vector_dist & vd, vector_dist_iterator & it, size_t (& bc)[dim] , Box<dim,float> & box, Box<dim,float> & dom_ext, size_t & l_cnt, size_t & nl_cnt, size_t & n_out)
+{
+	const CartDecomposition<dim,float> & ct = vd.getDecomposition();
+
+	while (it.isNext())
+	{
+		auto key = it.get();
+		// Check if it is in the domain
+		if (box.isInsideNP(vd.getPos(key)) == true)
+		{
+			// Check if local
+			if (ct.isLocalBC(vd.getPos(key),bc) == true)
+				l_cnt++;
+			else
+				nl_cnt++;
+		}
+		else
+		{
+			nl_cnt++;
+		}
+
+		Point<dim,float> xp = vd.getPos(key);
+
+		// Check that all particles are inside the Domain + Ghost part
+		if (dom_ext.isInside(xp) == false)
+				n_out++;
+
+		++it;
+	}
+}
+
+
+
+#endif /* SRC_VECTOR_TESTS_VECTOR_DIST_UTIL_UNIT_TESTS_HPP_ */
diff --git a/src/Vector/vector_dist.hpp b/src/Vector/vector_dist.hpp
index 77e6c99dbea89183de465bd4cd5fe94c7a366962..a96fa1628bd671a21ef12c77655bf80ec4399e6d 100644
--- a/src/Vector/vector_dist.hpp
+++ b/src/Vector/vector_dist.hpp
@@ -35,6 +35,8 @@
 #include "NN/CellList/ParticleIt_Cells.hpp"
 #include "NN/CellList/ProcKeys.hpp"
 
+#define DEC_GRAN(gr) ((size_t)gr << 32)
+
 #define VECTOR_DIST_ERROR_OBJECT std::runtime_error("Runtime vector distributed error");
 
 #ifdef SE_CLASS3
@@ -51,8 +53,6 @@
 #define NO_ID false
 #define ID true
 
-#define DEC_GRAN(gr) ((size_t)gr << 32)
-
 // Perform a ghost get or a ghost put
 #define GET	1
 #define PUT 2
@@ -61,8 +61,13 @@
 #define NO_GHOST 0
 #define WITH_GHOST 2
 
+#define GCL_NON_SYMMETRIC 0
+#define GCL_SYMMETRIC 1
+#define GCL_HILBERT 2
+
+
 //! General function t get a cell-list
-template<unsigned int dim, typename St, typename CellL, typename Vector>
+template<unsigned int dim, typename St, typename CellL, typename Vector, unsigned int impl>
 struct gcl
 {
 	/*! \brief Get the Cell list based on the type
@@ -81,8 +86,8 @@ struct gcl
 };
 
 //! General function t get a cell-list
-template<unsigned int dim, typename St, typename Vector, typename Mem_type>
-struct gcl<dim,St,CellList_gen<dim, St, Process_keys_hilb,Mem_type, shift<dim, St> >,Vector>
+template<unsigned int dim, typename St, typename CellL, typename Vector>
+struct gcl<dim,St,CellL,Vector,GCL_HILBERT>
 {
 	/*! \brief Get the Cell list based on the type
 	 *
@@ -93,20 +98,53 @@ struct gcl<dim,St,CellList_gen<dim, St, Process_keys_hilb,Mem_type, shift<dim, S
 	 * \return the constructed cell-list
 	 *
 	 */
-	static inline CellList_gen<dim, St, Process_keys_hilb, Mem_type, shift<dim, St> > get(Vector & vd, const St & r_cut, const Ghost<dim,St> & g)
+	static inline CellL get(Vector & vd, const St & r_cut, const Ghost<dim,St> & g)
 	{
 		return vd.getCellList_hilb(r_cut,g);
 	}
 };
 
-#define CELL_MEMFAST(dim,St) CellList_gen<dim, St, Process_keys_lin, Mem_fast, shift<dim, St> >
-#define CELL_MEMBAL(dim,St) CellList_gen<dim, St, Process_keys_lin, Mem_bal, shift<dim, St> >
-#define CELL_MEMMW(dim,St) CellList_gen<dim, St, Process_keys_lin, Mem_mw, shift<dim, St> >
+//! General function to get a cell-list (symmetric specialization)
+template<unsigned int dim, typename St, typename CellL, typename Vector>
+struct gcl<dim,St,CellL,Vector,GCL_SYMMETRIC>
+{
+	/*! \brief Get the Cell list based on the type
+	 *
+	 * \param vd Distributed vector
+	 * \param r_cut Cut-off radius
+	 * \param g Ghost
+	 *
+	 * \return the constructed cell-list
+	 *
+	 */
+	static inline CellL get(Vector & vd, const St & r_cut, const Ghost<dim,St> & g)
+	{
+		return vd.getCellListSym(r_cut);
+	}
+};
+
+#define CELL_MEMFAST(dim,St) CellList_gen<dim, St, Process_keys_lin, Mem_fast<>, shift<dim, St> >
+#define CELL_MEMBAL(dim,St) CellList_gen<dim, St, Process_keys_lin, Mem_bal<>, shift<dim, St> >
+#define CELL_MEMMW(dim,St) CellList_gen<dim, St, Process_keys_lin, Mem_mw<>, shift<dim, St> >
 
-#define CELL_MEMFAST_HILB(dim,St) CellList_gen<dim, St, Process_keys_hilb, Mem_fast, shift<dim, St> >
-#define CELL_MEMBAL_HILB(dim,St) CellList_gen<dim, St, Process_keys_hilb, Mem_bal, shift<dim, St> >
-#define CELL_MEMMW_HILB(dim,St) CellList_gen<dim, St, Process_keys_hilb, Mem_mw, shift<dim, St> >
+#define CELL_MEMFAST_HILB(dim,St) CellList_gen<dim, St, Process_keys_hilb, Mem_fast<>, shift<dim, St> >
+#define CELL_MEMBAL_HILB(dim,St) CellList_gen<dim, St, Process_keys_hilb, Mem_bal<>, shift<dim, St> >
+#define CELL_MEMMW_HILB(dim,St) CellList_gen<dim, St, Process_keys_hilb, Mem_mw<>, shift<dim, St> >
 
+#define VERLET_MEMFAST(dim,St) VerletList<dim,St,Mem_fast<>,shift<dim,St> >
+#define VERLET_MEMBAL(dim,St)  VerletList<dim,St,Mem_bal<>,shift<dim,St> >
+#define VERLET_MEMMW(dim,St)   VerletList<dim,St,Mem_mw<>,shift<dim,St> >
+
+#define VERLET_MEMFAST_INT(dim,St) VerletList<dim,St,Mem_fast<unsigned int>,shift<dim,St> >
+#define VERLET_MEMBAL_INT(dim,St)  VerletList<dim,St,Mem_bal<unsigned int>,shift<dim,St> >
+#define VERLET_MEMMW_INT(dim,St)   VerletList<dim,St,Mem_mw<unsigned int>,shift<dim,St> >
+
+enum reorder_opt
+{
+	NO_REORDER = 0,
+	HILBERT = 1,
+	LINEAR = 2
+};
 
 /*! \brief Distributed vector
  *
@@ -133,13 +171,19 @@ struct gcl<dim,St,CellList_gen<dim, St, Process_keys_hilb,Mem_type, shift<dim, S
  *
  */
 
-template<unsigned int dim, typename St, typename prop, typename Decomposition = CartDecomposition<dim,St>, typename Memory = HeapMemory>
-class vector_dist : public vector_dist_comm<dim,St,prop,Decomposition,Memory>
+template<unsigned int dim,
+         typename St,
+		 typename prop,
+		 typename layout = typename memory_traits_lin<prop>::type,
+		 template <typename> class layout_base = memory_traits_lin,
+		 typename Decomposition = CartDecomposition<dim,St>,
+		 typename Memory = HeapMemory>
+class vector_dist : public vector_dist_comm<dim,St,prop,layout,layout_base,Decomposition,Memory>
 {
 public:
 
 	//! Self type
-	typedef vector_dist<dim,St,prop,Decomposition,Memory> self;
+	typedef vector_dist<dim,St,prop,layout,layout_base,Decomposition,Memory> self;
 
 	//! property object
 	typedef prop value_type;
@@ -151,11 +195,11 @@ private:
 
 	//! Particle position vector, (It has 2 elements) the first has real particles assigned to a processor
 	//! the second element contain unassigned particles
-	openfpm::vector<Point<dim, St>> v_pos;
+	openfpm::vector<Point<dim, St>,Memory> v_pos;
 
 	//! Particle properties vector, (It has 2 elements) the first has real particles assigned to a processor
 	//! the second element contain unassigned particles
-	openfpm::vector<prop> v_prp;
+	openfpm::vector<prop,Memory,layout,layout_base> v_prp;
 
 	//! Virtual cluster
 	Vcluster & v_cl;
@@ -231,6 +275,51 @@ private:
 		}
 	}
 
+	/*! \brief Reorder the particles following a space filling curve
+	 *
+	 * \param v_pos_dest reordered vector of positions
+	 * \param v_prp_dest reordered vector of properties
+	 * \param h_it iterator over the cells in space filling curve order
+	 * \param cell_list cell-list used to look up the particles of each cell
+	 *
+	 */
+	template<typename CellL, typename sfc_it>
+	void reorder_sfc(openfpm::vector<Point<dim,St>> & v_pos_dest,
+						 openfpm::vector<prop> & v_prp_dest,
+						 sfc_it & h_it,
+						 CellL & cell_list)
+	{
+		v_pos_dest.resize(v_pos.size());
+		v_prp_dest.resize(v_prp.size());
+
+		//Index for v_pos_dest
+		size_t count = 0;
+
+		grid_key_dx<dim> ksum;
+
+		for (size_t i = 0; i < dim ; i++)
+		{ksum.set_d(i,cell_list.getPadding(i));}
+
+		while (h_it.isNext())
+		{
+		  auto key = h_it.get();
+		  key += ksum;
+
+		  size_t lin = cell_list.getGrid().LinId(key);
+
+		  // for each particle in the Cell "lin"
+		  for (size_t i = 0; i < cell_list.getNelements(lin); i++)
+		  {
+			  //reorder
+			  auto v = cell_list.get(lin,i);
+			  v_pos_dest.get(count) = v_pos.get(v);
+			  v_prp_dest.get(count) = v_prp.get(v);
+
+			  count++;
+		  }
+		  ++h_it;
+		}
+	}
 
 public:
 
@@ -247,9 +336,9 @@ public:
 	 * \return itself
 	 *
 	 */
-	vector_dist<dim,St,prop,Decomposition,Memory> & operator=(const vector_dist<dim,St,prop,Decomposition,Memory> & v)
+	vector_dist<dim,St,prop,layout,layout_base,Decomposition,Memory> & operator=(const vector_dist<dim,St,prop,layout,layout_base,Decomposition,Memory> & v)
 	{
-		static_cast<vector_dist_comm<dim,St,prop,Decomposition,Memory> *>(this)->operator=(static_cast<vector_dist_comm<dim,St,prop,Decomposition,Memory>>(v));
+		static_cast<vector_dist_comm<dim,St,prop,layout,layout_base,Decomposition,Memory> *>(this)->operator=(static_cast<vector_dist_comm<dim,St,prop,layout,layout_base,Decomposition,Memory>>(v));
 
 		g_m = v.g_m;
 		v_pos = v.v_pos;
@@ -271,9 +360,9 @@ public:
 	 * \return itself
 	 *
 	 */
-	vector_dist<dim,St,prop,Decomposition,Memory> & operator=(vector_dist<dim,St,prop,Decomposition,Memory> && v)
+	vector_dist<dim,St,prop,layout,layout_base,Decomposition,Memory> & operator=(vector_dist<dim,St,prop,layout,layout_base,Decomposition,Memory> && v)
 	{
-		static_cast<vector_dist_comm<dim,St,prop,Decomposition,Memory> *>(this)->operator=(static_cast<vector_dist_comm<dim,St,prop,Decomposition,Memory> >(v));
+		static_cast<vector_dist_comm<dim,St,prop,layout,layout_base,Decomposition,Memory> *>(this)->operator=(static_cast<vector_dist_comm<dim,St,prop,layout,layout_base,Decomposition,Memory> >(v));
 
 		g_m = v.g_m;
 		v_pos.swap(v.v_pos);
@@ -294,8 +383,8 @@ public:
 	 * \param v vector to copy
 	 *
 	 */
-	vector_dist(const vector_dist<dim,St,prop,Decomposition,Memory> & v)
-	:vector_dist_comm<dim,St,prop,Decomposition,Memory>(v.getDecomposition()),v_cl(v.v_cl) SE_CLASS3_VDIST_CONSTRUCTOR
+	vector_dist(const vector_dist<dim,St,prop,layout,layout_base,Decomposition,Memory> & v)
+	:vector_dist_comm<dim,St,prop,layout,layout_base,Decomposition,Memory>(v.getDecomposition()),v_cl(v.v_cl) SE_CLASS3_VDIST_CONSTRUCTOR
 	{
 #ifdef SE_CLASS2
 		check_new(this,8,VECTOR_DIST_EVENT,4);
@@ -309,7 +398,7 @@ public:
 	 * \param v vector to copy
 	 *
 	 */
-	vector_dist(vector_dist<dim,St,prop,Decomposition,Memory> && v) noexcept
+	vector_dist(vector_dist<dim,St,prop,layout,layout_base,Decomposition,Memory> && v) noexcept
 	:v_cl(v.v_cl) SE_CLASS3_VDIST_CONSTRUCTOR
 	{
 #ifdef SE_CLASS2
@@ -330,7 +419,7 @@ public:
 	 *
 	 */
 	vector_dist(const Decomposition & dec, size_t np) :
-	vector_dist_comm<dim,St,prop,Decomposition,Memory>(dec), v_cl(create_vcluster()) SE_CLASS3_VDIST_CONSTRUCTOR
+	vector_dist_comm<dim,St,prop,layout,layout_base,Decomposition,Memory>(dec), v_cl(create_vcluster()) SE_CLASS3_VDIST_CONSTRUCTOR
 	{
 #ifdef SE_CLASS2
 		check_new(this,8,VECTOR_DIST_EVENT,4);
@@ -874,7 +963,7 @@ public:
 	 * \return the Cell list
 	 *
 	 */
-	template<typename CellL = CellList<dim, St, Mem_fast, shift<dim, St> > > CellL getCellListSym(St r_cut)
+	template<typename CellL = CellList<dim, St, Mem_fast<>, shift<dim, St> > > CellL getCellListSym(St r_cut)
 	{
 #ifdef SE_CLASS1
 		if (!(opt & BIND_DEC_TO_GHOST))
@@ -918,7 +1007,7 @@ public:
 	 * \return the Cell list
 	 *
 	 */
-	template<typename CellL = CellList_gen<dim, St, Process_keys_lin, Mem_fast, shift<dim, St> > >
+	template<typename CellL = CellList_gen<dim, St, Process_keys_lin, Mem_fast<>, shift<dim, St> > >
 	CellL getCellList(St r_cut, bool no_se3 = false)
 	{
 #ifdef SE_CLASS3
@@ -945,7 +1034,7 @@ public:
 	 * \return the Cell list
 	 *
 	 */
-	template<typename CellL = CellList_gen<dim, St, Process_keys_hilb, Mem_fast, shift<dim, St> > >
+	template<typename CellL = CellList_gen<dim, St, Process_keys_hilb, Mem_fast<>, shift<dim, St> > >
 	CellL getCellList_hilb(St r_cut)
 	{
 #ifdef SE_CLASS3
@@ -995,7 +1084,7 @@ public:
 		}
 		else
 		{
-			CellL cli_tmp = gcl<dim,St,CellL,self>::get(*this,r_cut,getDecomposition().getGhost());
+			CellL cli_tmp = gcl<dim,St,CellL,self,GCL_NON_SYMMETRIC>::get(*this,r_cut,getDecomposition().getGhost());
 
 			cell_list.swap(cli_tmp);
 		}
@@ -1008,7 +1097,7 @@ public:
 	 * \param cell_list Cell list to update
 	 *
 	 */
-	template<typename CellL = CellList<dim, St, Mem_fast, shift<dim, St> > > void updateCellListSym(CellL & cell_list)
+	template<typename CellL = CellList<dim, St, Mem_fast<>, shift<dim, St> > > void updateCellListSym(CellL & cell_list)
 	{
 #ifdef SE_CLASS3
 		se3.getNN();
@@ -1032,7 +1121,7 @@ public:
 		}
 		else
 		{
-			CellL cli_tmp = gcl<dim,St,CellL,self>::get(*this,r_cut,getDecomposition().getGhost());
+			CellL cli_tmp = gcl<dim,St,CellL,self,GCL_SYMMETRIC>::get(*this,r_cut,getDecomposition().getGhost());
 
 			cell_list.swap(cli_tmp);
 		}
@@ -1054,7 +1143,7 @@ public:
 	 * \return the CellList
 	 *
 	 */
-	template<typename CellL = CellList_gen<dim, St, Process_keys_lin, Mem_fast, shift<dim, St> > >
+	template<typename CellL = CellList_gen<dim, St, Process_keys_lin, Mem_fast<>, shift<dim, St> > >
 	CellL getCellList(St r_cut, const Ghost<dim, St> & enlarge, bool no_se3 = false)
 	{
 #ifdef SE_CLASS3
@@ -1073,7 +1162,8 @@ public:
 		// Processor bounding box
 		cl_param_calculate(pbox, div, r_cut, enlarge);
 
-		cell_list.Initialize(pbox, div, g_m);
+		cell_list.Initialize(pbox, div);
+		cell_list.set_gm(g_m);
 		cell_list.set_ndec(getDecomposition().get_ndec());
 
 		updateCellList(cell_list,no_se3);
@@ -1096,7 +1186,7 @@ public:
 	 * \return The Cell-list
 	 *
 	 */
-	template<typename CellL = CellList_gen<dim, St, Process_keys_hilb, Mem_fast, shift<dim, St> > > CellL getCellList_hilb(St r_cut, const Ghost<dim, St> & enlarge)
+	template<typename CellL = CellList_gen<dim, St, Process_keys_hilb, Mem_fast<>, shift<dim, St> > > CellL getCellList_hilb(St r_cut, const Ghost<dim, St> & enlarge)
 	{
 #ifdef SE_CLASS3
 		se3.getNN();
@@ -1113,7 +1203,8 @@ public:
 		// Processor bounding box
 		cl_param_calculate(pbox,div, r_cut, enlarge);
 
-		cell_list.Initialize(pbox, div, g_m);
+		cell_list.Initialize(pbox, div);
+		cell_list.set_gm(g_m);
 		cell_list.set_ndec(getDecomposition().get_ndec());
 
 		updateCellList(cell_list);
@@ -1128,13 +1219,14 @@ public:
 	 * \return the verlet list
 	 *
 	 */
-	VerletList<dim,St,FAST,shift<dim,St> > getVerletSym(St r_cut)
+	template <typename VerletL = VerletList<dim,St,Mem_fast<>,shift<dim,St> >>
+	VerletL getVerletSym(St r_cut)
 	{
 #ifdef SE_CLASS3
 		se3.getNN();
 #endif
 
-		VerletList<dim,St,FAST,shift<dim,St>> ver;
+		VerletL ver;
 
 		// Processor bounding box
 		Box<dim, St> pbox = getDecomposition().getProcessorBounds();
@@ -1153,7 +1245,8 @@ public:
 	 * \return the verlet list
 	 *
 	 */
-	VerletList<dim,St,FAST,shift<dim,St> > getVerletCrs(St r_cut)
+	template <typename VerletL = VerletList<dim,St,Mem_fast<>,shift<dim,St> >>
+	VerletL getVerletCrs(St r_cut)
 	{
 #ifdef SE_CLASS1
 		if (!(opt & BIND_DEC_TO_GHOST))
@@ -1167,7 +1260,7 @@ public:
 		se3.getNN();
 #endif
 
-		VerletList<dim,St,FAST,shift<dim,St>> ver;
+		VerletL ver;
 
 		// Processor bounding box
 		Box<dim, St> pbox = getDecomposition().getProcessorBounds();
@@ -1205,13 +1298,14 @@ public:
 	 * \return a VerletList object
 	 *
 	 */
-	VerletList<dim,St,FAST,shift<dim,St> > getVerlet(St r_cut)
+	template <typename VerletL = VerletList<dim,St,Mem_fast<>,shift<dim,St> >>
+	VerletL getVerlet(St r_cut)
 	{
 #ifdef SE_CLASS3
 		se3.getNN();
 #endif
 
-		VerletList<dim,St,FAST,shift<dim,St>> ver;
+		VerletL ver;
 
 		// get the processor bounding box
 		Box<dim, St> bt = getDecomposition().getProcessorBounds();
@@ -1238,7 +1332,7 @@ public:
 	 * \param opt option like VL_SYMMETRIC and VL_NON_SYMMETRIC or VL_CRS_SYMMETRIC
 	 *
 	 */
-	void updateVerlet(VerletList<dim,St,FAST,shift<dim,St> > & ver, St r_cut, size_t opt = VL_NON_SYMMETRIC)
+	template<typename Mem_type> void updateVerlet(VerletList<dim,St,Mem_type,shift<dim,St> > & ver, St r_cut, size_t opt = VL_NON_SYMMETRIC)
 	{
 #ifdef SE_CLASS3
 		se3.getNN();
@@ -1256,9 +1350,9 @@ public:
 				ver.update(getDecomposition().getDomain(),r_cut,v_pos,g_m, opt);
 			else
 			{
-				VerletList<dim,St,FAST,shift<dim,St> > ver_tmp;
+				VerletList<dim,St,Mem_type,shift<dim,St> > ver_tmp;
 
-				ver_tmp = getVerlet(r_cut);
+				ver_tmp = getVerlet<VerletList<dim,St,Mem_type,shift<dim,St> >>(r_cut);
 				ver.swap(ver);
 			}
 		}
@@ -1297,9 +1391,9 @@ public:
 			}
 			else
 			{
-				VerletList<dim,St,FAST,shift<dim,St> > ver_tmp;
+				VerletList<dim,St,Mem_type,shift<dim,St> > ver_tmp;
 
-				ver_tmp = getVerletCrs(r_cut);
+				ver_tmp = getVerletCrs<VerletList<dim,St,Mem_type,shift<dim,St> >>(r_cut);
 				ver.swap(ver_tmp);
 			}
 		}
@@ -1315,9 +1409,9 @@ public:
 				ver.update(getDecomposition().getDomain(),r_cut,v_pos,g_m, opt);
 			else
 			{
-				VerletList<dim,St,FAST,shift<dim,St> > ver_tmp;
+				VerletList<dim,St,Mem_type,shift<dim,St> > ver_tmp;
 
-				ver_tmp = getVerlet(r_cut);
+				ver_tmp = getVerlet<VerletList<dim,St,Mem_type,shift<dim,St> >>(r_cut);
 				ver.swap(ver_tmp);
 			}
 		}
@@ -1331,9 +1425,10 @@ public:
 	 * \param m an order of a hilbert curve
 	 *
 	 */
-	template<typename CellL=CellList_gen<dim,St,Process_keys_lin,Mem_fast,shift<dim,St> > > void reorder (int32_t m)
+	template<typename CellL=CellList_gen<dim,St,Process_keys_lin,Mem_bal<>,shift<dim,St> > >
+	void reorder (int32_t m, reorder_opt opt = reorder_opt::HILBERT)
 	{
-		reorder(m,getDecomposition().getGhost());
+		reorder<CellL>(m,getDecomposition().getGhost(),opt);
 	}
 
 
@@ -1349,7 +1444,8 @@ public:
 	 * \param enlarge In case of padding particles the cell list must be enlarged, like a ghost this parameter say how much must be enlarged
 	 *
 	 */
-	template<typename CellL=CellList_gen<dim,St,Process_keys_lin,Mem_fast,shift<dim,St> > > void reorder(int32_t m, const Ghost<dim,St> & enlarge)
+	template<typename CellL=CellList_gen<dim,St,Process_keys_lin,Mem_bal<>,shift<dim,St> > >
+	void reorder(int32_t m, const Ghost<dim,St> & enlarge, reorder_opt opt = reorder_opt::HILBERT)
 	{
 		// reset the ghost part
 		v_pos.resize(g_m);
@@ -1373,7 +1469,8 @@ public:
 			div[i] = 1 << m;
 		}
 
-		cell_list.Initialize(pbox,div,g_m);
+		cell_list.Initialize(pbox,div);
+		cell_list.set_gm(g_m);
 
 		// for each particle add the particle to the cell list
 
@@ -1396,38 +1493,24 @@ public:
 		openfpm::vector<Point<dim,St>> v_pos_dest;
 		openfpm::vector<prop> v_prp_dest;
 
-		v_pos_dest.resize(v_pos.size());
-		v_prp_dest.resize(v_prp.size());
-
-		//hilberts curve iterator
-		grid_key_dx_iterator_hilbert<dim> h_it(m);
-
-		//Index for v_pos_dest
-		size_t count = 0;
-
-		grid_key_dx<dim> ksum;
-
-		for (size_t i = 0; i < dim ; i++)
-			ksum.set_d(i,cell_list.getPadding(i));
-
-		while (h_it.isNext())
+		if (opt == reorder_opt::HILBERT)
 		{
-		  auto key = h_it.get();
-		  key += ksum;
+			grid_key_dx_iterator_hilbert<dim> h_it(m);
 
-		  size_t lin = cell_list.getGrid().LinId(key);
-
-		  // for each particle in the Cell "lin"
-		  for (size_t i = 0; i < cell_list.getNelements(lin); i++)
-		  {
-			  //reorder
-			  auto v = cell_list.get(lin,i);
-			  v_pos_dest.get(count) = v_pos.get(v);
-			  v_prp_dest.get(count) = v_prp.get(v);
+			reorder_sfc<CellL,grid_key_dx_iterator_hilbert<dim>>(v_pos_dest,v_prp_dest,h_it,cell_list);
+		}
+		else if (opt == reorder_opt::LINEAR)
+		{
+			grid_sm<dim,void> gs(div);
+			grid_key_dx_iterator<dim> h_it(gs);
 
-			  count++;
-		  }
-		  ++h_it;
+			reorder_sfc<CellL,grid_key_dx_iterator<dim>>(v_pos_dest,v_prp_dest,h_it,cell_list);
+		}
+		else
+		{
+			// We do nothing: the second swap (after this branch) nullifies the first
+			v_pos.swap(v_pos_dest);
+			v_prp.swap(v_prp_dest);
 		}
 
 		v_pos.swap(v_pos_dest);
@@ -1614,7 +1697,7 @@ public:
 	 */
 	inline Decomposition & getDecomposition()
 	{
-		return vector_dist_comm<dim,St,prop,Decomposition,Memory>::getDecomposition();
+		return vector_dist_comm<dim,St,prop,layout,layout_base,Decomposition,Memory>::getDecomposition();
 	}
 
 	/*! \brief Get the decomposition
@@ -1624,7 +1707,7 @@ public:
 	 */
 	inline const Decomposition & getDecomposition() const
 	{
-		return vector_dist_comm<dim,St,prop,Decomposition,Memory>::getDecomposition();
+		return vector_dist_comm<dim,St,prop,layout,layout_base,Decomposition,Memory>::getDecomposition();
 	}
 
 	/*! \brief It move all the particles that does not belong to the local processor to the respective processor
@@ -1639,9 +1722,17 @@ public:
 	 *
 	 *
 	 */
-	template<unsigned int ... prp> void map_list()
+	template<unsigned int ... prp> void map_list(size_t opt = NONE)
 	{
-		this->template map_list_<prp...>(v_pos,v_prp,g_m);
+#ifdef SE_CLASS3
+		se3.map_pre();
+#endif
+
+		this->template map_list_<prp...>(v_pos,v_prp,g_m,opt);
+
+#ifdef SE_CLASS3
+		se3.map_post();
+#endif
 	}
 
 
@@ -1655,13 +1746,13 @@ public:
 	 *
 	 *
 	 */
-	template<typename obp = KillParticle> void map()
+	template<typename obp = KillParticle> void map(size_t opt = NONE)
 	{
 #ifdef SE_CLASS3
 		se3.map_pre();
 #endif
 
-		this->template map_<obp>(v_pos,v_prp,g_m);
+		this->template map_<obp>(v_pos,v_prp,g_m,opt);
 
 #ifdef SE_CLASS3
 		se3.map_post();
@@ -2134,7 +2225,7 @@ public:
 	 * \return Particle iterator
 	 *
 	 */
-	template<typename vrl> openfpm::vector_key_iterator_seq<typename vrl::local_index_t> getParticleIteratorCRS(vrl & NN)
+	template<typename vrl> openfpm::vector_key_iterator_seq<typename vrl::Mem_type_type::loc_index> getParticleIteratorCRS(vrl & NN)
 	{
 #ifdef SE_CLASS1
 		if (!(opt & BIND_DEC_TO_GHOST))
@@ -2145,7 +2236,7 @@ public:
 #endif
 
 		// First we check that
-		return openfpm::vector_key_iterator_seq<typename vrl::local_index_t>(NN.getParticleSeq());
+		return openfpm::vector_key_iterator_seq<typename vrl::Mem_type_type::loc_index>(NN.getParticleSeq());
 	}
 
 	/*! \brief Return from which cell we have to start in case of CRS interation
diff --git a/src/Vector/vector_dist_comm.hpp b/src/Vector/vector_dist_comm.hpp
index 758f51c070f79820ee183fc4cc42f760f0df9d34..1be8d771c5d5ffba3f68e90592f209ce18f1ea1b 100644
--- a/src/Vector/vector_dist_comm.hpp
+++ b/src/Vector/vector_dist_comm.hpp
@@ -17,6 +17,8 @@
 
 #define BIND_DEC_TO_GHOST 1
 
+#define MAP_LOCAL 2
+
 /*! \brief compute the communication options from the ghost_get/put options
  *
  *
@@ -42,7 +44,13 @@ inline static size_t compute_options(size_t opt)
  *
  */
 
-template<unsigned int dim, typename St, typename prop, typename Decomposition = CartDecomposition<dim,St>, typename Memory = HeapMemory>
+template<unsigned int dim,
+         typename St,
+		 typename prop,
+		 typename layout,
+		 template <typename> class layout_base,
+		 typename Decomposition = CartDecomposition<dim,St>,
+		 typename Memory = HeapMemory>
 class vector_dist_comm
 {
 	//! Number of units for each sub-domain
@@ -262,7 +270,8 @@ class vector_dist_comm
 	 * \param v_prp vector of particles properties
 	 *
 	 */
-	void local_ghost_from_opart(openfpm::vector<Point<dim, St>> & v_pos, openfpm::vector<prop> & v_prp)
+	void local_ghost_from_opart(openfpm::vector<Point<dim, St>> & v_pos,
+			                    openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp)
 	{
 		// get the shift vectors
 		const openfpm::vector<Point<dim, St>> & shifts = dec.getShiftVectors();
@@ -289,7 +298,9 @@ class vector_dist_comm
 	 * \param g_m ghost marker
 	 *
 	 */
-	void local_ghost_from_dec(openfpm::vector<Point<dim, St>> & v_pos, openfpm::vector<prop> & v_prp, size_t g_m)
+	void local_ghost_from_dec(openfpm::vector<Point<dim, St>> & v_pos,
+			                  openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
+							  size_t g_m)
 	{
 		o_part_loc.clear();
 
@@ -310,7 +321,7 @@ class vector_dist_comm
 				{
 					if (box_f.get(i).get(j).isInside(v_pos.get(key)) == true)
 					{
-						size_t lin_id = box_cmb.get(i).lin();
+						size_t lin_id = dec.convertShift(box_cmb.get(i));
 
 						o_part_loc.add();
 						o_part_loc.template get<0>(o_part_loc.size()-1) = key;
@@ -390,7 +401,10 @@ class vector_dist_comm
 	 * \param opt options
 	 *
 	 */
-	void add_loc_particles_bc(openfpm::vector<Point<dim, St>> & v_pos, openfpm::vector<prop> & v_prp ,size_t & g_m, size_t opt)
+	void add_loc_particles_bc(openfpm::vector<Point<dim, St>> & v_pos,
+			                  openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp ,
+							  size_t & g_m,
+							  size_t opt)
 	{
 		// Create the shift boxes
 		createShiftBox();
@@ -403,9 +417,9 @@ class vector_dist_comm
 		else
 		{
 			if (opt & SKIP_LABELLING)
-				local_ghost_from_opart(v_pos,v_prp);
+			{local_ghost_from_opart(v_pos,v_prp);}
 			else
-				local_ghost_from_dec(v_pos,v_prp,g_m);
+			{local_ghost_from_dec(v_pos,v_prp,g_m);}
 		}
 	}
 
@@ -430,7 +444,7 @@ class vector_dist_comm
 			// Buffer must retained and survive the destruction of the
 			// vector
 			if (hsmem.get(i).ref() == 0)
-				hsmem.get(i).incRef();
+			{hsmem.get(i).incRef();}
 
 			// Set the memory for retain the send buffer
 			g_pos_send.get(i).setMemory(hsmem.get(i));
@@ -532,7 +546,9 @@ class vector_dist_comm
 	 * \param g_send_prp Send buffer to fill
 	 *
 	 */
-	template<typename send_vector, typename prp_object, int ... prp> void fill_send_ghost_prp_buf(openfpm::vector<prop> & v_prp, openfpm::vector<send_vector> & g_send_prp)
+	template<typename send_vector, typename prp_object, int ... prp>
+	void fill_send_ghost_prp_buf(openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
+			                     openfpm::vector<send_vector> & g_send_prp)
 	{
 		// create a number of send buffers equal to the near processors
 		g_send_prp.resize(g_opart.size());
@@ -559,7 +575,7 @@ class vector_dist_comm
 			for (size_t j = 0; j < g_opart.get(i).size(); j++)
 			{
 				// source object type
-				typedef encapc<1, prop, typename openfpm::vector<prop>::layout_type> encap_src;
+				typedef encapc<1, prop, typename openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base>::layout_type> encap_src;
 				// destination object type
 				typedef encapc<1, prp_object, typename openfpm::vector<prp_object>::layout_type> encap_dst;
 
@@ -578,7 +594,11 @@ class vector_dist_comm
 	 * \param m_prp sending buffer for properties
 	 *
 	 */
-	void fill_send_map_buf(openfpm::vector<Point<dim, St>> & v_pos, openfpm::vector<prop> & v_prp, openfpm::vector<size_t> & prc_sz_r, openfpm::vector<openfpm::vector<Point<dim,St>>> & m_pos, openfpm::vector<openfpm::vector<prop>> & m_prp)
+	void fill_send_map_buf(openfpm::vector<Point<dim, St>> & v_pos,
+			               openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
+						   openfpm::vector<size_t> & prc_sz_r,
+						   openfpm::vector<openfpm::vector<Point<dim,St>>> & m_pos,
+						   openfpm::vector<openfpm::vector<prop>> & m_prp)
 	{
 		m_prp.resize(prc_sz_r.size());
 		m_pos.resize(prc_sz_r.size());
@@ -621,7 +641,12 @@ class vector_dist_comm
 	 * \param m_prp sending buffer for properties
 	 *
 	 */
-	template<typename prp_object,int ... prp> void fill_send_map_buf_list(openfpm::vector<Point<dim, St>> & v_pos, openfpm::vector<prop> & v_prp, openfpm::vector<size_t> & prc_sz_r, openfpm::vector<openfpm::vector<Point<dim,St>>> & m_pos, openfpm::vector<openfpm::vector<prp_object>> & m_prp)
+	template<typename prp_object,int ... prp>
+	void fill_send_map_buf_list(openfpm::vector<Point<dim, St>> & v_pos,
+			                    openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
+								openfpm::vector<size_t> & prc_sz_r,
+								openfpm::vector<openfpm::vector<Point<dim,St>>> & m_pos,
+								openfpm::vector<openfpm::vector<prp_object>> & m_prp)
 	{
 		m_prp.resize(prc_sz_r.size());
 		m_pos.resize(prc_sz_r.size());
@@ -658,7 +683,10 @@ class vector_dist_comm
 	 * \param prc_sz For each processor the number of particles to send
 	 *
 	 */
-	template<typename obp> void labelParticleProcessor(openfpm::vector<Point<dim, St>> & v_pos, openfpm::vector<aggregate<size_t,size_t,size_t>> & lbl_p, openfpm::vector<size_t> & prc_sz)
+	template<typename obp>
+	void labelParticleProcessor(openfpm::vector<Point<dim, St>> & v_pos,
+								openfpm::vector<aggregate<size_t,size_t,size_t>> & lbl_p,
+								openfpm::vector<size_t> & prc_sz)
 	{
 		// reset lbl_p
 		lbl_p.clear();
@@ -680,9 +708,9 @@ class vector_dist_comm
 
 			// Check if the particle is inside the domain
 			if (dec.getDomain().isInside(v_pos.get(key)) == true)
-				p_id = dec.processorIDBC(v_pos.get(key));
+			{p_id = dec.processorIDBC(v_pos.get(key));}
 			else
-				p_id = obp::out(key, v_cl.getProcessUnitID());
+			{p_id = obp::out(key, v_cl.getProcessUnitID());}
 
 			// Particle to move
 			if (p_id != v_cl.getProcessUnitID())
@@ -720,7 +748,10 @@ class vector_dist_comm
 	 * \param g_m ghost marker
 	 *
 	 */
-	void labelParticlesGhost(openfpm::vector<Point<dim, St>> & v_pos, openfpm::vector<prop> & v_prp, openfpm::vector<size_t> & prc, size_t & g_m)
+	void labelParticlesGhost(openfpm::vector<Point<dim, St>> & v_pos,
+			                 openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
+							 openfpm::vector<size_t> & prc,
+							 size_t & g_m)
 	{
 		// Buffer that contain for each processor the id of the particle to send
 		g_opart.clear();
@@ -784,7 +815,7 @@ class vector_dist_comm
 	static void * message_alloc_map(size_t msg_i, size_t total_msg, size_t total_p, size_t i, size_t ri, void * ptr)
 	{
 		// cast the pointer
-		vector_dist_comm<dim, St, prop, Decomposition, Memory> * vd = static_cast<vector_dist_comm<dim, St, prop, Decomposition, Memory> *>(ptr);
+		vector_dist_comm<dim, St, prop,layout,layout_base, Decomposition, Memory> * vd = static_cast<vector_dist_comm<dim, St, prop, layout, layout_base, Decomposition, Memory> *>(ptr);
 
 		vd->recv_mem_gm.resize(vd->v_cl.getProcessingUnits());
 		vd->recv_mem_gm.get(i).resize(msg_i);
@@ -799,7 +830,7 @@ public:
 	 * \param v vector to copy
 	 *
 	 */
-	vector_dist_comm(const vector_dist_comm<dim,St,prop,Decomposition,Memory> & v)
+	vector_dist_comm(const vector_dist_comm<dim,St,prop,layout,layout_base,Decomposition,Memory> & v)
 	:v_cl(create_vcluster()),dec(create_vcluster()),lg_m(0)
 	{
 		this->operator=(v);
@@ -901,27 +932,15 @@ public:
 			cl_param_calculateSym<dim,St>(box,cd_sm,g,pad);
 
 			for (size_t i = 0 ; i < dim ; i++)
-				div[i] = cd_sm.getDiv()[i] - 2*pad;
+			{div[i] = cd_sm.getDiv()[i] - 2*pad;}
+
+			// Create the sub-domains
+			dec.setParameters(div, box, bc, g, gdist);
 		}
 		else
 		{
-			// Create a valid decomposition of the space
-			// Get the number of processor and calculate the number of sub-domain
-			// for decomposition
-			size_t n_proc = v_cl.getProcessingUnits();
-			size_t n_sub = n_proc * getDecompositionGranularity();
-
-			// Calculate the maximum number (before merging) of sub-domain on
-			// each dimension
-
-			for (size_t i = 0; i < dim; i++)
-			{
-				div[i] = openfpm::math::round_big_2(pow(n_sub, 1.0 / dim));
-			}
+			dec.setGoodParameters(box, bc, g, getDecompositionGranularity(), gdist);
 		}
-
-		// Create the sub-domains
-		dec.setParameters(div, box, bc, g, gdist);
 		dec.decompose();
 	}
 
@@ -935,7 +954,11 @@ public:
 	 * \param g_m marker between real and ghost particles
 	 *
 	 */
-	template<int ... prp> inline void ghost_get_(openfpm::vector<Point<dim, St>> & v_pos, openfpm::vector<prop> & v_prp, size_t & g_m, size_t opt = WITH_POSITION)
+	template<int ... prp> inline
+	void ghost_get_(openfpm::vector<Point<dim, St>> & v_pos,
+			        openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
+					size_t & g_m,
+					size_t opt = WITH_POSITION)
 	{
 		// Sending property object
 		typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object;
@@ -953,7 +976,7 @@ public:
 
 		// Label all the particles
 		if ((opt & SKIP_LABELLING) == false)
-			labelParticlesGhost(v_pos,v_prp,prc_g_opart,g_m);
+		{labelParticlesGhost(v_pos,v_prp,prc_g_opart,g_m);}
 
 		// Send and receive ghost particle information
 		{
@@ -969,10 +992,10 @@ public:
                 {
                 	size_t opt_ = compute_options(opt);
                 	op_ssend_gg_recv_merge opm(g_m);
-                    v_cl.SSendRecvP_op<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
+                    v_cl.SSendRecvP_op<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
                 }
                 else
-                	v_cl.SSendRecvP<send_vector,decltype(v_prp),prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte);
+                {v_cl.SSendRecvP<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte);}
 
                 // fill g_opart_sz
                 g_opart_sz.resize(prc_g_opart.size());
@@ -1033,9 +1056,12 @@ public:
 	 * \param v_pos vector of particle positions
 	 * \param v_prp vector of particle properties
 	 * \param g_m ghost marker
+	 * \param opt options
 	 *
 	 */
-	template<unsigned int ... prp> void map_list_(openfpm::vector<Point<dim, St>> & v_pos, openfpm::vector<prop> & v_prp, size_t & g_m)
+	template<unsigned int ... prp>
+	void map_list_(openfpm::vector<Point<dim, St>> & v_pos,
+				   openfpm::vector<prop> & v_prp, size_t & g_m, size_t opt = NONE)
 	{
 		typedef KillParticle obp;
 
@@ -1065,6 +1091,17 @@ public:
 			}
 		}
 
+		// Handle the MAP_LOCAL receive option
+
+		if (opt & MAP_LOCAL)
+		{
+			// if the map is local we indicate that we receive only from the neighborhood processors
+
+			prc_recv_map.clear();
+			for (size_t i = 0 ; i < dec.getNNProcessors() ; i++)
+			{prc_recv_map.add(dec.IDtoProc(i));}
+		}
+
 		// Sending property object
 		typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object;
 
@@ -1075,8 +1112,8 @@ public:
 
 		fill_send_map_buf_list<prp_object,prp...>(v_pos,v_prp,prc_sz_r, m_pos, m_prp);
 
-		v_cl.SSendRecv(m_pos,v_pos,prc_r,prc_recv_map,recv_sz_map);
-		v_cl.SSendRecvP<openfpm::vector<prp_object>,decltype(v_prp),prp...>(m_prp,v_prp,prc_r,prc_recv_map,recv_sz_map);
+		v_cl.SSendRecv(m_pos,v_pos,prc_r,prc_recv_map,recv_sz_map,opt);
+		v_cl.SSendRecvP<openfpm::vector<prp_object>,decltype(v_prp),layout_base,prp...>(m_prp,v_prp,prc_r,prc_recv_map,recv_sz_map,opt);
 
 		// mark the ghost part
 
@@ -1096,7 +1133,10 @@ public:
 	 * \param g_m ghost marker
 	 *
 	 */
-	template<typename obp = KillParticle> void map_(openfpm::vector<Point<dim, St>> & v_pos, openfpm::vector<prop> & v_prp, size_t & g_m)
+	template<typename obp = KillParticle>
+	void map_(openfpm::vector<Point<dim, St>> & v_pos,
+			  openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
+			  size_t & g_m, size_t opt = NONE)
 	{
 		// Processor communication size
 		openfpm::vector<size_t> prc_sz(v_cl.getProcessingUnits());
@@ -1131,8 +1171,8 @@ public:
 
 		fill_send_map_buf(v_pos,v_prp, prc_sz_r, m_pos, m_prp);
 
-		v_cl.SSendRecv(m_pos,v_pos,prc_r,prc_recv_map,recv_sz_map);
-		v_cl.SSendRecv(m_prp,v_prp,prc_r,prc_recv_map,recv_sz_map);
+		v_cl.SSendRecv(m_pos,v_pos,prc_r,prc_recv_map,recv_sz_map,opt);
+		v_cl.SSendRecv(m_prp,v_prp,prc_r,prc_recv_map,recv_sz_map,opt);
 
 		// mark the ghost part
 
@@ -1166,7 +1206,7 @@ public:
 	 * \return iteself
 	 *
 	 */
-	vector_dist_comm<dim,St,prop,Decomposition,Memory> & operator=(const vector_dist_comm<dim,St,prop,Decomposition,Memory> & vc)
+	vector_dist_comm<dim,St,prop,layout,layout_base,Decomposition,Memory> & operator=(const vector_dist_comm<dim,St,prop,layout,layout_base,Decomposition,Memory> & vc)
 	{
 		dec = vc.dec;
 
@@ -1180,7 +1220,7 @@ public:
 	 * \return itself
 	 *
 	 */
-	vector_dist_comm<dim,St,prop,Decomposition,Memory> & operator=(vector_dist_comm<dim,St,prop,Decomposition,Memory> && vc)
+	vector_dist_comm<dim,St,prop,layout,layout_base,Decomposition,Memory> & operator=(vector_dist_comm<dim,St,prop,layout,layout_base,Decomposition,Memory> && vc)
 	{
 		dec = vc.dec;
 
@@ -1198,7 +1238,11 @@ public:
 	 * \param opt options
 	 *
 	 */
-	template<template<typename,typename> class op, int ... prp> void ghost_put_(openfpm::vector<Point<dim, St>> & v_pos, openfpm::vector<prop> & v_prp, size_t & g_m, size_t opt)
+	template<template<typename,typename> class op, int ... prp>
+	void ghost_put_(openfpm::vector<Point<dim, St>> & v_pos,
+					openfpm::vector<prop> & v_prp,
+					size_t & g_m,
+					size_t opt)
 	{
 		// Sending property object
 		typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object;
@@ -1215,12 +1259,12 @@ public:
 			size_t opt_ = compute_options(opt);
 
 			op_ssend_recv_merge<op> opm(g_opart);
-			v_cl.SSendRecvP_op<op_ssend_recv_merge<op>,send_vector,decltype(v_prp),prp...>(g_send_prp,v_prp,prc_recv_get,opm,prc_g_opart,g_opart_sz,opt_);
+			v_cl.SSendRecvP_op<op_ssend_recv_merge<op>,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_recv_get,opm,prc_g_opart,g_opart_sz,opt_);
 		}
 		else
 		{
 			op_ssend_recv_merge<op> opm(g_opart);
-			v_cl.SSendRecvP_op<op_ssend_recv_merge<op>,send_vector,decltype(v_prp),prp...>(g_send_prp,v_prp,prc_recv_get,opm,prc_recv_put,recv_sz_put);
+			v_cl.SSendRecvP_op<op_ssend_recv_merge<op>,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_recv_get,opm,prc_recv_put,recv_sz_put);
 		}
 
 		// process also the local replicated particles
diff --git a/src/Vector/vector_dist_dlb_test.hpp b/src/Vector/vector_dist_dlb_test.hpp
index e7d5817d46a118159c074b89456a800abe6a21c1..71788a95f5ced8664649077f376f64ba8d7bf937 100644
--- a/src/Vector/vector_dist_dlb_test.hpp
+++ b/src/Vector/vector_dist_dlb_test.hpp
@@ -8,125 +8,16 @@
 #ifndef SRC_VECTOR_VECTOR_DIST_DLB_TEST_HPP_
 #define SRC_VECTOR_VECTOR_DIST_DLB_TEST_HPP_
 
-BOOST_AUTO_TEST_SUITE( vector_dist_dlb_test )
-
-template<typename vector_type> void test_dlb_vector()
-{
-	Vcluster & v_cl = create_vcluster();
-
-	if (v_cl.getProcessingUnits() > 8)
-		return;
-
-	Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
-	Ghost<3,float> g(0.1);
-	size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};
-
-	vector_type vd(0,domain,bc,g,DEC_GRAN(2048));
-
-	// Only processor 0 initialy add particles on a corner of a domain
-
-	if (v_cl.getProcessUnitID() == 0)
-	{
-		for(size_t i = 0 ; i < 50000 ; i++)
-		{
-			vd.add();
-
-			vd.getLastPos()[0] = ((float)rand())/RAND_MAX * 0.3;
-			vd.getLastPos()[1] = ((float)rand())/RAND_MAX * 0.3;
-			vd.getLastPos()[2] = ((float)rand())/RAND_MAX * 0.3;
-		}
-	}
-
-	vd.map();
-	vd.template ghost_get<>();
-
-	ModelSquare md;
-	md.factor = 10;
-	vd.addComputationCosts(md);
-	vd.getDecomposition().decompose();
-	vd.map();
-
-
-	vd.addComputationCosts(md);
-
-	openfpm::vector<size_t> loads;
-	size_t load = vd.getDecomposition().getDistribution().getProcessorLoad();
-	v_cl.allGather(load,loads);
-	v_cl.execute();
-
-	for (size_t i = 0 ; i < loads.size() ; i++)
-	{
-		float load_f = load;
-		float load_fc = loads.get(i);
-
-		BOOST_REQUIRE_CLOSE(load_f,load_fc,7.0);
-	}
-
-	BOOST_REQUIRE(vd.size_local() != 0);
-
-	Point<3,float> v({1.0,1.0,1.0});
-
-	for (size_t i = 0 ; i < 25 ; i++)
-	{
-		// move the particles by 0.1
-
-		auto it = vd.getDomainIterator();
-
-		while (it.isNext())
-		{
-			auto p = it.get();
-
-			vd.getPos(p)[0] += v.get(0) * 0.09;
-			vd.getPos(p)[1] += v.get(1) * 0.09;
-			vd.getPos(p)[2] += v.get(2) * 0.09;
-
-			++it;
-		}
-		vd.map();
-
-		ModelSquare md;
-		vd.addComputationCosts(md);
-		vd.getDecomposition().redecompose(200);
-		vd.map();
-
-		BOOST_REQUIRE(vd.size_local() != 0);
-
-		vd.template ghost_get<>();
-
-		vd.addComputationCosts(md);
-
-		openfpm::vector<size_t> loads;
-		size_t load = vd.getDecomposition().getDistribution().getProcessorLoad();
-		v_cl.allGather(load,loads);
-		v_cl.execute();
-
-		for (size_t i = 0 ; i < loads.size() ; i++)
-		{
-			float load_f = load;
-			float load_fc = loads.get(i);
-
-			BOOST_REQUIRE_CLOSE(load_f,load_fc,10.0);
-		}
-	}
-}
+#include "DLB/LB_Model.hpp"
+#include "vector_dist.hpp"
 
+BOOST_AUTO_TEST_SUITE( vector_dist_dlb_test )
 
-template<typename vector_type> void test_dlb_multi_phase_vector()
+template<typename vector_type>
+void mp_test_template(vector_type & vd0, vector_type & vd1, vector_type & vd2, vector_type & vd3)
 {
 	Vcluster & v_cl = create_vcluster();
 
-	if (v_cl.getProcessingUnits() > 8)
-		return;
-
-	Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
-	Ghost<3,float> g(0.1);
-	size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};
-
-	vector_type vd0(0,domain,bc,g,DEC_GRAN(2048));
-	vector_type vd1(0,domain,bc,g,DEC_GRAN(2048));
-	vector_type vd2(0,domain,bc,g,DEC_GRAN(2048));
-	vector_type vd3(0,domain,bc,g,DEC_GRAN(2048));
-
 	// Only processor 0 initialy add particles on a corner of a domain
 
 	if (v_cl.getProcessUnitID() == 0)
@@ -320,6 +211,156 @@ template<typename vector_type> void test_dlb_multi_phase_vector()
 	}
 }
 
+template<typename vector_type> void test_dlb_vector()
+{
+	Vcluster & v_cl = create_vcluster();
+
+	if (v_cl.getProcessingUnits() > 8)
+		return;
+
+	Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
+	Ghost<3,float> g(0.1);
+	size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};
+
+	vector_type vd(0,domain,bc,g,DEC_GRAN(2048));
+
+	// Only processor 0 initially adds particles, in a corner of the domain
+
+	if (v_cl.getProcessUnitID() == 0)
+	{
+		for(size_t i = 0 ; i < 50000 ; i++)
+		{
+			vd.add();
+
+			vd.getLastPos()[0] = ((float)rand())/RAND_MAX * 0.3;
+			vd.getLastPos()[1] = ((float)rand())/RAND_MAX * 0.3;
+			vd.getLastPos()[2] = ((float)rand())/RAND_MAX * 0.3;
+		}
+	}
+
+	vd.map();
+	vd.template ghost_get<>();
+
+	ModelSquare md;
+	md.factor = 10;
+	vd.addComputationCosts(md);
+	vd.getDecomposition().decompose();
+	vd.map();
+
+
+	vd.addComputationCosts(md);
+
+	openfpm::vector<size_t> loads;
+	size_t load = vd.getDecomposition().getDistribution().getProcessorLoad();
+	v_cl.allGather(load,loads);
+	v_cl.execute();
+
+	for (size_t i = 0 ; i < loads.size() ; i++)
+	{
+		float load_f = load;
+		float load_fc = loads.get(i);
+
+		BOOST_REQUIRE_CLOSE(load_f,load_fc,7.0);
+	}
+
+	BOOST_REQUIRE(vd.size_local() != 0);
+
+	Point<3,float> v({1.0,1.0,1.0});
+
+	for (size_t i = 0 ; i < 25 ; i++)
+	{
+		// move the particles by 0.09 along each axis
+
+		auto it = vd.getDomainIterator();
+
+		while (it.isNext())
+		{
+			auto p = it.get();
+
+			vd.getPos(p)[0] += v.get(0) * 0.09;
+			vd.getPos(p)[1] += v.get(1) * 0.09;
+			vd.getPos(p)[2] += v.get(2) * 0.09;
+
+			++it;
+		}
+		vd.map();
+
+		ModelSquare md;
+		vd.addComputationCosts(md);
+		vd.getDecomposition().redecompose(200);
+		vd.map();
+
+		BOOST_REQUIRE(vd.size_local() != 0);
+
+		vd.template ghost_get<>();
+
+		vd.addComputationCosts(md);
+
+		openfpm::vector<size_t> loads;
+		size_t load = vd.getDecomposition().getDistribution().getProcessorLoad();
+		v_cl.allGather(load,loads);
+		v_cl.execute();
+
+		for (size_t i = 0 ; i < loads.size() ; i++)
+		{
+			float load_f = load;
+			float load_fc = loads.get(i);
+
+			BOOST_REQUIRE_CLOSE(load_f,load_fc,10.0);
+		}
+	}
+}
+
+
+template<typename vector_type> void test_dlb_multi_phase_vector()
+{
+	Vcluster & v_cl = create_vcluster();
+
+	if (v_cl.getProcessingUnits() > 8)
+		return;
+
+	Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
+	Ghost<3,float> g(0.1);
+	size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};
+
+	vector_type vd0(0,domain,bc,g,DEC_GRAN(2048));
+	vector_type vd1(0,domain,bc,g,DEC_GRAN(2048));
+	vector_type vd2(0,domain,bc,g,DEC_GRAN(2048));
+	vector_type vd3(0,domain,bc,g,DEC_GRAN(2048));
+
+	mp_test_template(vd0,vd1,vd2,vd3);
+}
+
+
+
+template<typename vector_type> void test_dlb_multi_phase_v_vector()
+{
+	Vcluster & v_cl = create_vcluster();
+
+	if (v_cl.getProcessingUnits() > 8)
+		return;
+
+	Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
+	Ghost<3,float> g(0.1);
+	size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};
+
+	openfpm::vector<vector_type> v_phases;
+	{
+		vector_type vd0(0,domain,bc,g,DEC_GRAN(2048));
+		v_phases.add(vd0);
+		v_phases.add(vector_type(vd0.getDecomposition(),0));
+		v_phases.add(vector_type(vd0.getDecomposition(),0));
+		v_phases.add(vector_type(vd0.getDecomposition(),0));
+	}
+
+	auto & vd0 = v_phases.get(0);
+	auto & vd1 = v_phases.get(1);
+	auto & vd2 = v_phases.get(2);
+	auto & vd3 = v_phases.get(3);
+
+	mp_test_template(vd0,vd1,vd2,vd3);
+}
+
 BOOST_AUTO_TEST_CASE( vector_dist_dlb_test_part )
 {
 	test_dlb_vector<vector_dist<3,float,aggregate<float>>>();
@@ -330,9 +371,19 @@ BOOST_AUTO_TEST_CASE( vector_dist_dlb_multi_phase_test_part )
 	test_dlb_multi_phase_vector<vector_dist<3,float,aggregate<float>>>();
 }
 
+BOOST_AUTO_TEST_CASE( vector_dist_dlb_multi_phase_v_test_part )
+{
+	test_dlb_multi_phase_v_vector<vector_dist<3,float,aggregate<float>>>();
+}
+
 BOOST_AUTO_TEST_CASE( vector_dist_dlb_metis_test_part )
 {
-	test_dlb_vector<vector_dist<3,float,aggregate<float>,CartDecomposition<3,float,HeapMemory,MetisDistribution<3,float>>>>();
+	test_dlb_vector<vector_dist<3,
+	                            float,
+								aggregate<float>,
+								memory_traits_lin<aggregate<float>>::type,
+								memory_traits_lin,
+	                            CartDecomposition<3,float,HeapMemory,MetisDistribution<3,float>>>>();
 }
 
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/Vector/vector_dist_multiphase_functions.hpp b/src/Vector/vector_dist_multiphase_functions.hpp
index 3d13593baa561a1cf212a7947c32211c49576eac..c9c9d235f97ccecf54cbd662072edbac4910256a 100644
--- a/src/Vector/vector_dist_multiphase_functions.hpp
+++ b/src/Vector/vector_dist_multiphase_functions.hpp
@@ -11,9 +11,11 @@
 #include "NN/CellList/CellListM.hpp"
 #include "NN/VerletList/VerletListM.hpp"
 
-template<typename Vector, typename CL, typename T> VerletList<Vector::dims,typename Vector::stype,FAST,shift<Vector::dims,typename Vector::stype>> createVerlet(Vector & v, Vector & v1, CL & cl, T r_cut)
+template<typename Vector, typename CL, typename T>
+VerletList<Vector::dims,typename Vector::stype,Mem_fast<>,shift<Vector::dims,typename Vector::stype>>
+createVerlet(Vector & v, Vector & v1, CL & cl, T r_cut)
 {
-	VerletList<Vector::dims,typename Vector::stype,FAST,shift<Vector::dims,typename Vector::stype>> ver;
+	VerletList<Vector::dims,typename Vector::stype,Mem_fast<>,shift<Vector::dims,typename Vector::stype>> ver;
 
 	ver.Initialize(cl,r_cut,v.getPosVector(),v1.getPosVector(),v.size_local());
 
@@ -72,9 +74,11 @@ template<unsigned int nbit, typename Vector, typename T> CellListM<Vector::dims,
 
 /////// Symmetric version
 
-template<typename Vector,typename CL, typename T> VerletList<Vector::dims,typename Vector::stype,FAST,shift<Vector::dims,typename Vector::stype>> createVerletSym(Vector & v, Vector & v1, CL & cl, T r_cut)
+template<typename Vector,typename CL, typename T>
+VerletList<Vector::dims,typename Vector::stype,Mem_fast<>,shift<Vector::dims,typename Vector::stype>>
+createVerletSym(Vector & v, Vector & v1, CL & cl, T r_cut)
 {
-	VerletList<Vector::dims,typename Vector::stype,FAST,shift<Vector::dims,typename Vector::stype>> ver;
+	VerletList<Vector::dims,typename Vector::stype,Mem_fast<>,shift<Vector::dims,typename Vector::stype>> ver;
 
 	ver.Initialize(cl,r_cut,v.getPosVector(),v1.getPosVector(),v.size_local());
 
@@ -88,7 +92,7 @@ template<unsigned int sh_byte, typename Vector, typename Vector1 ,typename CL, t
 	openfpm::vector<pos_v<Vector::dims,typename Vector::stype>> v_phases;
 
 	for (size_t i = 0 ; i < phases.size() ; i++)
-		v_phases.add(pos_v<Vector::dims,typename Vector::stype>(phases.get(i).getPosVector()));
+	{v_phases.add(pos_v<Vector::dims,typename Vector::stype>(phases.get(i).getPosVector()));}
 
 	ver.Initialize(cl,pp,r_cut,v.getPosVector(),v_phases,v.size_local(),VL_SYMMETRIC);
 
diff --git a/src/Vector/vector_dist_ofb.hpp b/src/Vector/vector_dist_ofb.hpp
index ec56d7bbbeec179e491e876c6cb64ec518fd6e2f..5e17e1ec52b56369d77f54dad6a38107dc3250c8 100644
--- a/src/Vector/vector_dist_ofb.hpp
+++ b/src/Vector/vector_dist_ofb.hpp
@@ -42,7 +42,7 @@ struct KillParticleWithWarning
 	 */
 	static size_t out(size_t pp_id, size_t p_id)
 	{
-		std::cerr << "Warning: " << __FILE__ << ":" << __LINE__ << " out of bound particle detected ";
+		std::cerr << "Warning: " << __FILE__ << ":" << __LINE__ << " out of bound particle detected " << std::endl; // std::endl ends the line; __FILE__/__LINE__ expand to this statement's own location
 
 		return -1;
 	}
@@ -78,7 +78,7 @@ struct Error
 	 */
 	static size_t out(size_t pp_id, size_t p_id)
 	{
-		std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " out of bound particle detected ";
+		std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " out of bound particle detected " << std::endl;
 
 		exit(-1);
 
diff --git a/src/gargabe.hpp b/src/gargabe.hpp
index a82d59ed33d473deeee74dafadc2a1e9e0c9895a..9f9fbe2778e6b1f49b66e083d0f0e3ee5a404196 100644
--- a/src/gargabe.hpp
+++ b/src/gargabe.hpp
@@ -1159,4 +1159,67 @@ fix_ie_g_box();
 //////////////////////
 
 
+/*		Point<dim,long int> p;
+		p.get(0) = 0;
+		p.get(1) = 81;
+		p.get(2) = 79;
+		if (ib.isInside(p))
+		{
+			int debug = 0;
+			debug++;
+		}
+
+		for (size_t i = 0 ; i < dim ; i++)
+		{
+			if (sub_domain.getLow(i) == ib_dom.getLow(i) &&
+				(sub_domain_other.getHigh(i) == sub_domain.getLow(i) || cmb.c[i] == 1))
+			{
+				if (g.getHigh(i) != INVALID_GHOST && (ib.getHigh(i) - ib.getLow(i) + 1) > g.getHigh(i))
+				{
+					ib.setHigh(i,ib.getLow(i) + g.getHigh(i) - 1);
+				}
+			}
+
+			if (sub_domain.getHigh(i) == ib_dom.getHigh(i) &&
+				(sub_domain_other.getLow(i) == sub_domain.getHigh(i) || cmb.c[i] == 1))
+			{
+				if (g.getLow(i) != -INVALID_GHOST && (ib.getHigh(i) - ib.getLow(i) + 1) > abs(g.getLow(i)))
+				{
+					ib.setLow(i, g.getHigh(i) - g.getLow(i) + 1);
+				}
+			}
+		}
+
+		// This is a special case because a domain intersects itself through
+		// periodicity
+		if (sub_domain == sub_domain_other)
+		{
+			for (size_t i = 0 ; i < dim ; i++)
+			{
+				if (sub_domain.getLow(i) == ib_dom.getLow(i) &&
+					sub_domain.getLow(i) == domain.getLow(i) &&
+					sub_domain_other.getHigh(i) == domain.getHigh(i) &&
+					cmb.c[i] == 1)
+				{
+					if (g.getHigh(i) != INVALID_GHOST && (ib.getHigh(i) - ib.getLow(i) + 1) > g.getHigh(i))
+					{
+						ib.setHigh(i,ib.getLow(i) + g.getHigh(i) - 1);
+					}
+				}
+
+				if (sub_domain.getHigh(i) == ib_dom.getHigh(i) &&
+					sub_domain.getHigh(i) == domain.getHigh(i) &&
+					sub_domain_other.getLow(i) == sub_domain.getHigh(i) &&
+					cmb.c[i] == -1)
+				{
+					if (g.getLow(i) != -INVALID_GHOST && (ib.getHigh(i) - ib.getLow(i) + 1) > abs(g.getLow(i)))
+					{
+						ib.setLow(i, g.getHigh(i) - g.getLow(i) + 1);
+					}
+				}
+			}
+		}*/
+
+//////////////////////////
+
 #endif /* GARGABE_HPP_ */
diff --git a/src/main.cpp b/src/main.cpp
index 0c2e4403cf2ddb336370e7cd6fd4dd745775f114..64710443d94f49dced0147604d52275ea5872273 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -42,12 +42,9 @@ int main(int argc, char* argv[])
 
 #include "unit_test_init_cleanup.hpp"
 #include "Graph/CartesianGraphFactory_unit_test.hpp"
-#include "Decomposition/CartDecomposition_unit_test.hpp"
 #include "Decomposition/ORB_unit_test.hpp"
 #include "Decomposition/Distribution/metis_util_unit_test.hpp"
 #include "Decomposition/dec_optimizer_unit_test.hpp"
-#include "Vector/vector_dist_unit_test.hpp"
-#include "Vector/vector_dist_HDF5_chckpnt_restart_test.hpp"
 #include "Grid/grid_dist_id_HDF5_chckpnt_restart_test.hpp"
 #include "Decomposition/Distribution/Distribution_unit_tests.hpp"
 #include "Grid/Iterators/grid_dist_id_iterators_unit_tests.hpp"
@@ -56,7 +53,6 @@ int main(int argc, char* argv[])
 #include "Graph/DistGraphFactory.hpp"
 #include "Decomposition/nn_processor_unit_test.hpp"
 #include "Grid/staggered_grid_dist_unit_test.hpp"
-#include "Vector/vector_dist_MP_unit_tests.hpp"
 #include "Vector/se_class3_vector_unit_tests.hpp"
 #include "Vector/vector_dist_dlb_test.hpp"
 #include "Decomposition/Domain_NN_calculator_cart_unit_test.hpp"