diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index cfcd437516c382f8e0d51aaf2262cb3b5e4d4870..48fb17ac42c1e62d639fb062df3cb19682904b5b 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,117 +1,119 @@
 
 
-# docker_centos_build:
-#   stage: build
-#   tags:
-#     - docker
-#   image: 'ubuntu:20.04'
-#   artifacts:
-#     paths:
-#       - ./build/src/pdata
-#       - ./build/openfpm_numerics/src/numerics
-#       - ./openfpm_numerics/test
-#   script:
-#     - apt-get update
-#     - DEBIAN_FRONTEND="noninteractive" apt-get -y install tzdata
-#     - apt-get -y install cmake wget git g++ gfortran python2  python-is-python3
-#     - mkdir -p /root/openfpm_dependencies/openfpm_pdata/base
-#     - mkdir /root/.ssh && chmod 700 /root/.ssh
-#     - cp id_rsa.pub /root/.ssh/id_rsa.pub && chmod 644 /root/.ssh/id_rsa.pub
-#     - cp id_rsa /root/.ssh/id_rsa && chmod 600 /root/.ssh/id_rsa
-#     - ssh-keyscan -H git.mpi-cbg.de >> ~/.ssh/known_hosts
-#     - ./build.sh $CI_PROJECT_DIR unused pdata 0 base
-#   cache:
-#     key: $CI_COMMIT_REF_SLUG
-#     paths:
-#       - /root/openfpm_dependencies/openfpm_pdata/base
+docker_centos_build:
+  stage: build
+  tags:
+    - ubuntu-docker
+  image: 'ubuntu:20.04'
+  artifacts:
+    paths:
+      - ./build/src/pdata
+      - ./build/openfpm_numerics/src/numerics
+      - ./openfpm_numerics/test
+  script:
+    - apt-get update
+    - DEBIAN_FRONTEND="noninteractive" apt-get -y install tzdata
+    - apt-get -y install cmake wget git g++ gfortran python2 python-is-python3
+    - mkdir -p /root/openfpm_dependencies/openfpm_pdata/base
+    - mkdir /root/.ssh && chmod 700 /root/.ssh
+    - cp id_rsa.pub /root/.ssh/id_rsa.pub && chmod 644 /root/.ssh/id_rsa.pub
+    - cp id_rsa /root/.ssh/id_rsa && chmod 600 /root/.ssh/id_rsa
+    - ssh-keyscan -H git.mpi-cbg.de >> ~/.ssh/known_hosts
+    - pwd
+    - cat base/test_file
+#    - ./build.sh $CI_PROJECT_DIR unused pdata 0 base &> out
+  cache:
+    key: $CI_COMMIT_REF_SLUG
+    paths:
+      - base/
 
-centos_build:
-  stage: build
-  tags:
-    - centos
-  artifacts:
-    paths:
-      - ./build/src/pdata
-      - ./build/openfpm_numerics/src/numerics
-      - ./openfpm_numerics/test
-  script:
-    - ./build.sh $CI_PROJECT_DIR unused pdata 0 $CI_COMMIT_REF_NAME
+#centos_build:
+#  stage: build
+#  tags:
+#    - centos
+#  artifacts:
+#    paths:
+#      - ./build/src/pdata
+#      - ./build/openfpm_numerics/src/numerics
+#      - ./openfpm_numerics/test
+#  script:
+#    - ./build.sh $CI_PROJECT_DIR unused pdata 0 $CI_COMMIT_REF_NAME
 
-centos_run:
-  stage: test
-  tags:
-    - centos
-  dependencies:
-    - centos_build
-  script:
-    - export OMP_NUM_THREADS=1
-    - ./run.sh $CI_PROJECT_DIR unused 1 pdata 0 $CI_COMMIT_REF_NAME
-    - ./run.sh $CI_PROJECT_DIR unused 2 pdata 0 $CI_COMMIT_REF_NAME
-    - ./run.sh $CI_PROJECT_DIR unused 3 pdata 0 $CI_COMMIT_REF_NAME
-    - export OMP_NUM_THREADS=8
-    - ./run.sh $CI_PROJECT_DIR unused 1 pdata 0 $CI_COMMIT_REF_NAME
-    - export OMP_NUM_THREADS=1
-    - cd openfpm_numerics
-    - ./run.sh $CI_PROJECT_DIR unused 1 0 0 numerics $CI_COMMIT_REF_NAME
-    - ./run.sh $CI_PROJECT_DIR unused 2 0 0 numerics $CI_COMMIT_REF_NAME
-    - ./run.sh $CI_PROJECT_DIR unused 3 0 0 numerics $CI_COMMIT_REF_NAME
-    - ./run.sh $CI_PROJECT_DIR unused 4 0 0 numerics $CI_COMMIT_REF_NAME
+#centos_run:
+#  stage: test
+#  tags:
+#    - centos
+#  dependencies:
+#    - centos_build
+#  script:
+#    - export OMP_NUM_THREADS=1
+#    - ./run.sh $CI_PROJECT_DIR unused 1 pdata 0 $CI_COMMIT_REF_NAME
+#    - ./run.sh $CI_PROJECT_DIR unused 2 pdata 0 $CI_COMMIT_REF_NAME
+#    - ./run.sh $CI_PROJECT_DIR unused 3 pdata 0 $CI_COMMIT_REF_NAME
+#    - export OMP_NUM_THREADS=8
+#    - ./run.sh $CI_PROJECT_DIR unused 1 pdata 0 $CI_COMMIT_REF_NAME
+#    - export OMP_NUM_THREADS=1
+#    - cd openfpm_numerics
+#    - ./run.sh $CI_PROJECT_DIR unused 1 0 0 numerics $CI_COMMIT_REF_NAME
+#    - ./run.sh $CI_PROJECT_DIR unused 2 0 0 numerics $CI_COMMIT_REF_NAME
+#    - ./run.sh $CI_PROJECT_DIR unused 3 0 0 numerics $CI_COMMIT_REF_NAME
+#    - ./run.sh $CI_PROJECT_DIR unused 4 0 0 numerics $CI_COMMIT_REF_NAME
 
-mac_build:
-  variables:
-    GIT_STRATEGY: empty
-  stage: build
-  tags:
-    - mac
-  artifacts:
-    paths:
-      - ./build/src/pdata
-      - ./build/openfpm_numerics/src/numerics
-      - ./openfpm_numerics/test
-  script:
-      - ./build.sh $CI_PROJECT_DIR unused pdata 0  $CI_COMMIT_REF_NAME
+#mac_build:
+#  variables:
+#    GIT_STRATEGY: empty
+#  stage: build
+#  tags:
+#    - mac
+#  artifacts:
+#    paths:
+#      - ./build/src/pdata
+#      - ./build/openfpm_numerics/src/numerics
+#      - ./openfpm_numerics/test
+#  script:
+#      - ./build.sh $CI_PROJECT_DIR unused pdata 0  $CI_COMMIT_REF_NAME
 
-mac_run:
-  stage: test
-  tags:
-    - mac
-  dependencies:
-    - mac_build
-  script:
-    - ./run.sh $CI_PROJECT_DIR unused 1 0 0 $CI_COMMIT_REF_NAME
-    - ./run.sh $CI_PROJECT_DIR unused 2 0 0 $CI_COMMIT_REF_NAME
-    - cd openfpm_numerics
-    - ./run.sh $CI_PROJECT_DIR unused 2 0 0 numerics $CI_COMMIT_REF_NAME
+#mac_run:
+#  stage: test
+#  tags:
+#    - mac
+#  dependencies:
+#    - mac_build
+#  script:
+#    - ./run.sh $CI_PROJECT_DIR unused 1 0 0 $CI_COMMIT_REF_NAME
+#    - ./run.sh $CI_PROJECT_DIR unused 2 0 0 $CI_COMMIT_REF_NAME
+#    - cd openfpm_numerics
+#    - ./run.sh $CI_PROJECT_DIR unused 2 0 0 numerics $CI_COMMIT_REF_NAME
 
-ubuntu_build:
-  stage: build
-  tags:
-    - ubuntu
-  artifacts:
-    paths:
-      - ./build/src/pdata
-      - ./build/openfpm_numerics/src/numerics
-      - ./openfpm_numerics/test
-  script:
-    - ./build.sh $CI_PROJECT_DIR unused pdata 0 $CI_COMMIT_REF_NAME
+#ubuntu_build:
+#  stage: build
+#  tags:
+#    - ubuntu
+#  artifacts:
+#    paths:
+#      - ./build/src/pdata
+#      - ./build/openfpm_numerics/src/numerics
+#      - ./openfpm_numerics/test
+#  script:
+#    - ./build.sh $CI_PROJECT_DIR unused pdata 0 $CI_COMMIT_REF_NAME
 
-ubuntu_run:
-  stage: test
-  tags:
-    - ubuntu
-  dependencies:
-    - ubuntu_build
-  script:
-    - export OMP_NUM_THREADS=1
-    - ./run.sh $CI_PROJECT_DIR unused 1 pdata 0 $CI_COMMIT_REF_NAME
-    - ./run.sh $CI_PROJECT_DIR unused 2 pdata 0 $CI_COMMIT_REF_NAME
-    - ./run.sh $CI_PROJECT_DIR unused 3 pdata 0 $CI_COMMIT_REF_NAME
-    - export OMP_NUM_THREADS=8
-    - ./run.sh $CI_PROJECT_DIR unused 1 pdata 0 $CI_COMMIT_REF_NAME
-    - export OMP_NUM_THREADS=1
-    - cd openfpm_numerics
-    - ./run.sh $CI_PROJECT_DIR unused 1 0 0 numerics $CI_COMMIT_REF_NAME
-    - ./run.sh $CI_PROJECT_DIR unused 2 0 0 numerics $CI_COMMIT_REF_NAME
-    - ./run.sh $CI_PROJECT_DIR unused 3 0 0 numerics $CI_COMMIT_REF_NAME
-    - ./run.sh $CI_PROJECT_DIR unused 4 0 0 numerics $CI_COMMIT_REF_NAME
+#ubuntu_run:
+#  stage: test
+#  tags:
+#    - ubuntu
+#  dependencies:
+#    - ubuntu_build
+#  script:
+#    - export OMP_NUM_THREADS=1
+#    - ./run.sh $CI_PROJECT_DIR unused 1 pdata 0 $CI_COMMIT_REF_NAME
+#    - ./run.sh $CI_PROJECT_DIR unused 2 pdata 0 $CI_COMMIT_REF_NAME
+#    - ./run.sh $CI_PROJECT_DIR unused 3 pdata 0 $CI_COMMIT_REF_NAME
+#    - export OMP_NUM_THREADS=8
+#    - ./run.sh $CI_PROJECT_DIR unused 1 pdata 0 $CI_COMMIT_REF_NAME
+#    - export OMP_NUM_THREADS=1
+#    - cd openfpm_numerics
+#    - ./run.sh $CI_PROJECT_DIR unused 1 0 0 numerics $CI_COMMIT_REF_NAME
+#    - ./run.sh $CI_PROJECT_DIR unused 2 0 0 numerics $CI_COMMIT_REF_NAME
+#    - ./run.sh $CI_PROJECT_DIR unused 3 0 0 numerics $CI_COMMIT_REF_NAME
+#    - ./run.sh $CI_PROJECT_DIR unused 4 0 0 numerics $CI_COMMIT_REF_NAME
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8b9bc5364ff2113893e7647ad7a0046b48661815..415e0170da88763c6ca05687f8cacd152a037488 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -78,7 +78,9 @@ find_package(LAPACK)
 find_package(Eigen3)
 find_package(SuiteSparse OPTIONAL_COMPONENTS UMFPACK)
 find_package(Vc)
-find_package(OpenMP)
+if (NOT CUDA_ON_BACKEND STREQUAL "HIP")
+	find_package(OpenMP)
+endif()
 if (CUDA_ON_BACKEND STREQUAL "HIP" AND NOT HIP_FOUND)
         find_package(HIP)
 endif()
diff --git a/example/Numerics/Sussman_redistancing/example_sussman_disk/main.cpp b/example/Numerics/Sussman_redistancing/example_sussman_disk/main.cpp
index 3725674c4cb484200a33c24bdaea03c878735bee..68e4a4c471715b682c919048aafe63dd6ad21f0c 100644
--- a/example/Numerics/Sussman_redistancing/example_sussman_disk/main.cpp
+++ b/example/Numerics/Sussman_redistancing/example_sussman_disk/main.cpp
@@ -1,5 +1,5 @@
 //
-// Created by jstark on 2020-05-18.
+// Created by jstark on 2020-05-18. Updated on 2022-01-05.
 //
 /**
  * @file example_sussman_disk/main.cpp
@@ -125,6 +125,7 @@
 //! @cond [Initialization and output folder] @endcond
 int main(int argc, char* argv[])
 {
+	typedef double phi_type;
 	//	initialize library
 	openfpm_init(&argc, &argv);
 	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -169,7 +170,7 @@ int main(int argc, char* argv[])
 	 * * Define the grid size in terms of number of grid points per dimension
 	 * * Create a Box that defines our domain
 	 * * Create a Ghost object that will define the extension of the ghost part
-	 * * Create a 2D grid with two properties of type double, one for the pre-redistancing Phi_initial and one, where
+	 * * Create a 2D grid with two properties of type phi_type, one for the pre-redistancing Phi_initial and one where
 	 *   the post-redistancing Phi_SDF should be written to.
 	 * * Set some property names (optionally. These names show up when opening the grid vtk in Paraview.)
 	 *
@@ -181,10 +182,10 @@ int main(int argc, char* argv[])
 	// Prop1: store the initial Phi;
 	// Prop2: here the re-initialized Phi (signed distance function) will be written to in the re-distancing step
 	size_t sz[grid_dim] = {128, 128}; // Grid size in terms of number of grid points per dimension
-	Box<grid_dim, double> box({0.0, 0.0}, {5.0, 5.0}); // 2D
+	Box<grid_dim, phi_type> box({0.0, 0.0}, {5.0, 5.0}); // 2D
 	Ghost<grid_dim, long int> ghost(0);
-	typedef aggregate<double, double> props;
-	typedef grid_dist_id<grid_dim, double, props > grid_in_type;
+	typedef aggregate<phi_type, phi_type> props;
+	typedef grid_dist_id<grid_dim, phi_type, props > grid_in_type;
 	grid_in_type g_dist(sz, box, ghost);
 	g_dist.setPropNames({"Phi_0", "Phi_SDF"});
 	//! @cond [Grid creation] @endcond
@@ -210,7 +211,7 @@ int main(int argc, char* argv[])
 	 */
 	//! @cond [Get disk] @endcond
 	// Now we initialize the grid with a disk. Outside the disk, the value of Phi_0 will be -1, inside +1.
-	double radius = 1.0; // Radius of the disk
+	phi_type radius = 1.0; // Radius of the disk
 	init_grid_with_disk<Phi_0_grid>(g_dist, radius, 2.5, 2.5); // Initialize disk onto grid, centered at (2.5, 2.5)
 	
 	g_dist.write(path_output + "/grid_disk_preRedistancing_radius" + std::to_string((int)radius) , FORMAT_BINARY); // Save the disk as vtk file
@@ -247,7 +248,8 @@ int main(int argc, char* argv[])
 	 *                                        w.r.t the previous iteration and the residual is printed (Default: false).
 	 * * \p print_steadyState_iter: If true, the number of the steady-state-iteration, the corresponding change
 	 *                              w.r.t the previous iteration and the residual is printed (Default: false).
-	 * * \p save_temp_grid: If true, save the temporary grid as hdf5 that can be reloaded onto a grid (Default: false).
+	 * * \p save_temp_grid: If true, save the temporary grid every interval_check_convergence iterations as hdf5, so
+	 *   that it can be reloaded onto a grid (Default: false).
 	 *
 	 *
 	 * @snippet example/Numerics/Sussman_redistancing/example_sussman_disk/main.cpp Redistancing options
@@ -256,7 +258,7 @@ int main(int argc, char* argv[])
 	//! @cond [Redistancing options] @endcond
 	// Now we want to convert the initial Phi into a signed distance function (SDF) with magnitude of gradient = 1.
 	// For the initial re-distancing we use the Sussman method. First of all, we can set some redistancing options.
-	Redist_options redist_options;
+	Redist_options<phi_type> redist_options;
 	redist_options.min_iter                             = 1e3;
 	redist_options.max_iter                             = 1e4;
 	
@@ -293,7 +295,8 @@ int main(int argc, char* argv[])
 	 * @snippet example/Numerics/Sussman_redistancing/example_sussman_disk/main.cpp Run redistancing
 	 */
 	//! @cond [Run redistancing] @endcond
-	RedistancingSussman<grid_in_type> redist_obj(g_dist, redist_options);   // Instantiation of Sussman-redistancing class
+	RedistancingSussman<grid_in_type, phi_type> redist_obj(g_dist, redist_options);   // Instantiation of
+	// Sussman-redistancing class
 	// std::cout << "dt = " << redist_obj.get_time_step() << std::endl;
 	// Run the redistancing. In the <> brackets provide property-index where 1.) your initial Phi is stored and 2.)
 	// where the resulting SDF should be written to.
@@ -325,9 +328,9 @@ int main(int argc, char* argv[])
 	// Minimum is 1 property, to which the Phi_SDF can be written
 	// In this example we chose 3 properties. The 1st for the Phi_SDF, the 2nd for the gradient of phi and the 3rd for
 	// the magnitude of the gradient
-	typedef aggregate<double, double[grid_dim], double> props_nb;
-	typedef vector_dist<grid_dim, double, props_nb> vd_type;
-	Ghost<grid_dim, double> ghost_vd(0);
+	typedef aggregate<phi_type, phi_type[grid_dim], phi_type> props_nb;
+	typedef vector_dist<grid_dim, phi_type, props_nb> vd_type;
+	Ghost<grid_dim, phi_type> ghost_vd(0);
 	vd_type vd_narrow_band(0, box, bc, ghost_vd);
 	vd_narrow_band.setPropNames({"Phi_SDF", "Phi_grad", "Phi_magnOfGrad"});
 	//! @cond [Initialize narrow band] @endcond
@@ -345,14 +348,15 @@ int main(int argc, char* argv[])
 	 * belonged to the narrow band.
 	 *
 	 * Depending on the type of the variable which you define, the width can be either set in terms of number of
-	 * grid points (size_t), physical width (double) or extension of narrow band as physical width inside of object
-	 * and outside the object (double, double).
+	 * grid points (size_t), physical width (phi_type), or extension of the narrow band as physical width inside
+	 * and outside the object (phi_type, phi_type).
 	 *
 	 * @snippet example/Numerics/Sussman_redistancing/example_sussman_disk/main.cpp Instantiate narrow band
 	 */
 	//! @cond [Instantiate narrow band] @endcond
 	size_t thickness_of_narrowBand_in_grid_points = 6;
-	NarrowBand<grid_in_type> narrowBand(g_dist, thickness_of_narrowBand_in_grid_points); // Instantiation of NarrowBand class
+	NarrowBand<grid_in_type, phi_type> narrowBand(g_dist, thickness_of_narrowBand_in_grid_points); // Instantiation of
+	// NarrowBand class
 	//! @cond [Instantiate narrow band] @endcond
 	
 	/**
diff --git a/example/Numerics/Sussman_redistancing/example_sussman_images_2D/main.cpp b/example/Numerics/Sussman_redistancing/example_sussman_images_2D/main.cpp
index d17017f826bf433da5ece3fe3a3656c0611d0ec5..0f29601447fd0dae63bd04d227936c5c023caf6b 100644
--- a/example/Numerics/Sussman_redistancing/example_sussman_images_2D/main.cpp
+++ b/example/Numerics/Sussman_redistancing/example_sussman_images_2D/main.cpp
@@ -1,5 +1,5 @@
 //
-// Created by jstark on 2020-05-18.
+// Created by jstark on 2020-05-18. Updated on 2022-01-05.
 //
 /**
  * @file example_sussman_images_2D/main.cpp
@@ -77,8 +77,8 @@ int main(int argc, char* argv[])
 //  std::string image_name = "triangle";
 //  std::string image_name = "face";
 	std::string image_name = "dolphin";
-
-
+	
+	typedef double phi_type;
 	//	initialize library
 	openfpm_init(&argc, &argv);
 	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -86,7 +86,7 @@ int main(int argc, char* argv[])
 	std::string cwd                     = get_cwd();
 	const std::string path_output       = cwd + "/output_" + image_name;
 //	const std::string path_output       = cwd + "/output_face";
-
+	
 	create_directory_if_not_exist(path_output);
 	
 	//////////////////////////////////////////////////////////////////////////////////////////////
@@ -95,7 +95,7 @@ int main(int argc, char* argv[])
 	const std::string path_input        ="input/";
 	const std::string path_to_image     = path_input + image_name + ".bin";
 	const std::string path_to_size      = path_input + "size_" + image_name + ".csv";
-
+	
 	/*
 	 * in case of 2D (single image):
 	 */
@@ -127,7 +127,7 @@ int main(int argc, char* argv[])
 	 *
 	 */
 	//! @cond [Refinement] @endcond
-	const double refinement []          = {0.5, 0.5};   // (2D) factor by which grid should be finer / coarser as
+	const phi_type refinement []          = {0.5, 0.5};   // (2D) factor by which grid should be finer / coarser than
 	// underlying image in each dimension (e.g. to get isotropic grid from anisotropic image resolution)
 	//! @cond [Refinement] @endcond
 	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -161,7 +161,7 @@ int main(int argc, char* argv[])
 	
 	//  Array with grid dimensions after refinement. This size-array will be used for the grid creation.
 	const size_t sz[grid_dim] = {(size_t)std::round(stack_size[x] * refinement[x]), (size_t)std::round(stack_size[y] *
-							refinement[y])}; // 2D
+	                             refinement[y])}; // 2D
 	//! @cond [Size] @endcond
 	// Here we create a 2D grid that stores 2 properties:
 	// Prop1: store the initial Phi;
@@ -180,24 +180,27 @@ int main(int argc, char* argv[])
 	 *
 	 */
 	//! @cond [Redistancing] @endcond
-	Box<grid_dim, double> box({0.0, 0.0}, {5.0, 5.0}); // 2D
+	Box<grid_dim, phi_type> box({0.0, 0.0}, {5.0, 5.0}); // 2D
 	
 	Ghost<grid_dim, long int> ghost(0);
-	typedef aggregate<double, double> props;
-	typedef grid_dist_id<grid_dim, double, props > grid_in_type;
+	typedef aggregate<phi_type, phi_type> props;
+	typedef grid_dist_id<grid_dim, phi_type, props > grid_in_type;
 	grid_in_type g_dist(sz, box, ghost);
 	g_dist.setPropNames({"Phi_0", "Phi_SDF"});
 	
+	// initialize complete grid including ghost layer with -1
+	init_grid_and_ghost<Phi_0_grid>(g_dist, -1.0);
+	
 	// Now we can initialize the grid with the pixel values from the image stack
 	load_pixel_onto_grid<Phi_0_grid>(g_dist, path_to_image, stack_size);
 	g_dist.write(path_output + "/grid_from_images_initial", FORMAT_BINARY); // Save the initial grid as vtk file
-
-
+	
+	
 	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 	// Now we want to convert the initial Phi into a signed distance function (SDF) with magnitude of gradient = 1
 	// For the initial re-distancing we use the Sussman method
 	// 1.) Set some redistancing options (for details see example sussman disk or sphere)
-	Redist_options redist_options;
+	Redist_options<phi_type> redist_options;
 	redist_options.min_iter                             = 1e3;
 	redist_options.max_iter                             = 1e4;
 	
@@ -212,7 +215,8 @@ int main(int argc, char* argv[])
 	redist_options.print_steadyState_iter               = true;
 	redist_options.save_temp_grid                       = true;
 	
-	RedistancingSussman<grid_in_type> redist_obj(g_dist, redist_options);   // Instantiation of Sussman-redistancing class
+	RedistancingSussman<grid_in_type, phi_type> redist_obj(g_dist, redist_options);   // Instantiation of Sussman-redistancing
+	// class
 //	std::cout << "dt = " << redist_obj.get_time_step() << std::endl;
 	// Run the redistancing. in the <> brackets provide property-index where 1.) your initial Phi is stored and 2.) where the resulting SDF should be written to.
 	redist_obj.run_redistancing<Phi_0_grid, Phi_SDF_grid>();
@@ -226,13 +230,14 @@ int main(int argc, char* argv[])
 	// Minimum is 1 property, to which the Phi_SDF can be written
 	// In this example we chose 3 properties. The 1st for the Phi_SDF, the 2nd for the gradient of phi and the 3rd for
 	// the magnitude of the gradient
-	typedef aggregate<double, Point<grid_dim, double>, double> props_nb;
-	typedef vector_dist<grid_dim, double, props_nb> vd_type;
-        Ghost<grid_dim, double> ghost_vd(0);
-        vd_type vd_narrow_band(0, box, bc, ghost_vd);
+	typedef aggregate<phi_type, Point<grid_dim, phi_type>, phi_type> props_nb;
+	typedef vector_dist<grid_dim, phi_type, props_nb> vd_type;
+	Ghost<grid_dim, phi_type> ghost_vd(0);
+	vd_type vd_narrow_band(0, box, bc, ghost_vd);
 	vd_narrow_band.setPropNames({"Phi_SDF", "Phi_grad", "Phi_magnOfGrad"});
 	
-	NarrowBand<grid_in_type> narrowBand(g_dist, redist_options.width_NB_in_grid_points); // Instantiation of NarrowBand class
+	NarrowBand<grid_in_type, phi_type> narrowBand(g_dist, redist_options.width_NB_in_grid_points); // Instantiation of
+	// NarrowBand class
 	
 	// Get the narrow band. You can decide, if you only want the Phi_SDF saved to your particles or
 	// if you also want the gradients or gradients and magnitude of gradient.
@@ -244,7 +249,7 @@ int main(int argc, char* argv[])
 	narrowBand.get_narrow_band<Phi_SDF_grid, Phi_SDF_vd, Phi_grad_vd, Phi_magnOfGrad_vd>(g_dist, vd_narrow_band);
 	
 	vd_narrow_band.write(path_output + "/vd_narrow_band_images", FORMAT_BINARY); // Save particles as vtk file
-
+	
 	openfpm_finalize(); // Finalize openFPM library
 	return 0;
 }
diff --git a/example/Numerics/Sussman_redistancing/example_sussman_images_3D/main.cpp b/example/Numerics/Sussman_redistancing/example_sussman_images_3D/main.cpp
index cbec16e6fb67a3e1a9a0496795fe25d48b24a9d2..cb75ca91b292bb25d15e4efb1a0083868f669f53 100644
--- a/example/Numerics/Sussman_redistancing/example_sussman_images_3D/main.cpp
+++ b/example/Numerics/Sussman_redistancing/example_sussman_images_3D/main.cpp
@@ -35,7 +35,7 @@
 /**
  * @page example_sussman_images_3D Images 3D
  *
- * ## Include ## {#e2d_img_include}
+ * ## Include ## {#e3d_img_include}
  *
  * These are the header files that we need to include:
  *
@@ -55,7 +55,7 @@
 /**
  * @page example_sussman_images_3D Images 3D
  *
- * ## Initialization, indices and output folder ## {#e2d_img_init}
+ * ## Initialization, indices and output folder ## {#e3d_img_init}
  *
  * Again we start with
  * * Initializing OpenFPM
@@ -74,6 +74,7 @@
 //! @cond [Initialization] @endcond
 int main(int argc, char* argv[])
 {
+	typedef double phi_type;
 	//	initialize library
 	openfpm_init(&argc, &argv);
 	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -108,7 +109,7 @@ int main(int argc, char* argv[])
 	/**
 	 * @page example_sussman_images_3D Images 3D
 	 *
-	 * ## Set refinement factor ## {#e2d_img_refine}
+	 * ## Set refinement factor ## {#e3d_img_refine}
 	 *
 	 * If we want that the grid has a different resolution as the image stack, we can set a refinement factor. In the
 	 * refinement array, we define by which factor the grid resolution should be changed w.r.t. the image stack
@@ -120,8 +121,8 @@ int main(int argc, char* argv[])
 	 *
 	 */
 	//! @cond [Refinement] @endcond
-	const double refinement []          = {1.0, 1.0, 1.0};      // without refinement
-//	const double refinement []          = {0.8, 1.5, 2.0};      // factors by which grid should be finer as underlying image stack in each dimension (e.g. to get isotropic grid from anisotropic stack resolution)
+	const phi_type refinement []          = {1.0, 1.0, 1.0};      // without refinement
+//	const phi_type refinement []          = {0.8, 1.5, 2.0};      // factors by which grid should be finer than the underlying image stack in each dimension (e.g. to get isotropic grid from anisotropic stack resolution)
 	//! @cond [Refinement] @endcond
 	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 	//	read the stack size (number of pixel values per dimension) from a binary file
@@ -129,7 +130,7 @@ int main(int argc, char* argv[])
 	/**
 	 * @page example_sussman_images_3D Images 3D
 	 *
-	 * ## Get size from image_size.csv ## {#e2d_img_size}
+	 * ## Get size from image_size.csv ## {#e3d_img_size}
 	 *
 	 * Here we read the stack size (number of pixel values per dimension) from a csv file. Alternatively, you can
 	 * directly define the stack size as: \p std::vector<size_t> \p stack_size \p {#pixels \p in \p x, \p #pixels \p in
@@ -162,7 +163,7 @@ int main(int argc, char* argv[])
 	/**
 	 * @page example_sussman_images_3D Images 3D
 	 *
-	 * ## Run Sussman redistancing and get narrow band ## {#e2d_img_redist}
+	 * ## Run Sussman redistancing and get narrow band ## {#e3d_img_redist}
 	 *
 	 * Once we have loaded the geometrical object from the 3D stack onto the grid, we can perform Sussman
 	 * redistancing and get the narrow band the same way as it is explained in detail here: @ref
@@ -173,14 +174,17 @@ int main(int argc, char* argv[])
 	 *
 	 */
 	//! @cond [Redistancing] @endcond
-	Box<grid_dim, double> box({0.0, 0.0, 0.0}, {5.0, 5.0, 5.0}); // 3D
+	Box<grid_dim, phi_type> box({0.0, 0.0, 0.0}, {5.0, 5.0, 5.0}); // 3D
 
 	Ghost<grid_dim, long int> ghost(0);
-	typedef aggregate<double, double> props;
-	typedef grid_dist_id<grid_dim, double, props > grid_in_type;
+	typedef aggregate<phi_type, phi_type> props;
+	typedef grid_dist_id<grid_dim, phi_type, props > grid_in_type;
 	grid_in_type g_dist(sz, box, ghost);
 	g_dist.setPropNames({"Phi_0", "Phi_SDF"});
 	
+	// initialize complete grid including ghost layer with -1
+	init_grid_and_ghost<Phi_0_grid>(g_dist, -1.0);
+	
 	// Now we can initialize the grid with the pixel values from the image stack
 	load_pixel_onto_grid<Phi_0_grid>(g_dist, path_to_zstack, stack_size);
 	g_dist.write(path_output + "/grid_from_images_initial", FORMAT_BINARY); // Save the initial grid as vtk file
@@ -190,7 +194,7 @@ int main(int argc, char* argv[])
 	// Now we want to convert the initial Phi into a signed distance function (SDF) with magnitude of gradient = 1
 	// For the initial re-distancing we use the Sussman method
 	// 1.) Set some redistancing options (for details see example sussman disk or sphere)
-	Redist_options redist_options;
+	Redist_options<phi_type> redist_options;
 	redist_options.min_iter                             = 1e3;
 	redist_options.max_iter                             = 1e4;
 	
@@ -204,7 +208,8 @@ int main(int argc, char* argv[])
 	redist_options.print_current_iterChangeResidual     = true;
 	redist_options.print_steadyState_iter               = true;
 	redist_options.save_temp_grid                       = true;
-	RedistancingSussman<grid_in_type> redist_obj(g_dist, redist_options);   // Instantiation of Sussman-redistancing class
+	RedistancingSussman<grid_in_type, phi_type> redist_obj(g_dist, redist_options);   // Instantiation of Sussman-redistancing
+	// class
 //	std::cout << "dt = " << redist_obj.get_time_step() << std::endl;
 	// Run the redistancing. in the <> brackets provide property-index where 1.) your initial Phi is stored and 2.) where the resulting SDF should be written to.
 	redist_obj.run_redistancing<Phi_0_grid, Phi_SDF_grid>();
@@ -218,13 +223,14 @@ int main(int argc, char* argv[])
 	// Minimum is 1 property, to which the Phi_SDF can be written
 	// In this example we chose 3 properties. The 1st for the Phi_SDF, the 2nd for the gradient of phi and the 3rd for
 	// the magnitude of the gradient
-	typedef aggregate<double, Point<grid_dim, double>, double> props_nb;
-	typedef vector_dist<grid_dim, double, props_nb> vd_type;
-        Ghost<grid_dim, double> ghost_vd(0);
+	typedef aggregate<phi_type, Point<grid_dim, phi_type>, phi_type> props_nb;
+	typedef vector_dist<grid_dim, phi_type, props_nb> vd_type;
+        Ghost<grid_dim, phi_type> ghost_vd(0);
         vd_type vd_narrow_band(0, box, bc, ghost_vd);	
         vd_narrow_band.setPropNames({"Phi_SDF", "Phi_grad", "Phi_magnOfGrad"});
 	
-	NarrowBand<grid_in_type> narrowBand(g_dist, redist_options.width_NB_in_grid_points); // Instantiation of NarrowBand class
+	NarrowBand<grid_in_type, phi_type> narrowBand(g_dist, redist_options.width_NB_in_grid_points); // Instantiation of
+	// NarrowBand class
 	
 	// Get the narrow band. You can decide, if you only want the Phi_SDF saved to your particles or
 	// if you also want the gradients or gradients and magnitude of gradient.
@@ -245,7 +251,7 @@ int main(int argc, char* argv[])
 /**
  * @page example_sussman_images_3D Images 3D
  *
- * ## Full code ## {#e2d_img_full}
+ * ## Full code ## {#e3d_img_full}
  *
  * @include example/Numerics/Sussman_redistancing/example_sussman_images_3D/main.cpp
  */
diff --git a/example/Numerics/Sussman_redistancing/example_sussman_sphere/main.cpp b/example/Numerics/Sussman_redistancing/example_sussman_sphere/main.cpp
index 966094a693f1b01130c8fdd9a9a8303bcab4e1f2..68d9d8aebc8bf21c8464af79b89974ecfde004b8 100644
--- a/example/Numerics/Sussman_redistancing/example_sussman_sphere/main.cpp
+++ b/example/Numerics/Sussman_redistancing/example_sussman_sphere/main.cpp
@@ -85,6 +85,7 @@
 //! @cond [Initialization and output folder] @endcond
 int main(int argc, char* argv[])
 {
+	typedef double phi_type;
 	//	initialize library
 	openfpm_init(&argc, &argv);
 	// Set current working directory, define output paths and create folders where output will be saved
@@ -129,7 +130,7 @@ int main(int argc, char* argv[])
 	 * * Define the grid size in terms of number of grid points per dimension
 	 * * Create a Box that defines our domain
 	 * * Create a Ghost object that will define the extension of the ghost part
-	 * * Create a 3D grid with two properties of type double, one for the pre-redistancing Phi_initial and one, where
+	 * * Create a 3D grid with two properties of type phi_type, one for the pre-redistancing Phi_initial and one where
 	 *   the post-redistancing Phi_SDF should be written to.
 	 * * Set some property names (optionally. These names show up when opening the grid vtk in Paraview.)
 	 *
@@ -141,10 +142,10 @@ int main(int argc, char* argv[])
 	// Prop1: store the initial Phi;
 	// Prop2: here the re-initialized Phi (signed distance function) will be written to in the re-distancing step
 	const size_t sz[grid_dim] = {128, 128, 128};
-	Box<grid_dim, double> box({0.0, 0.0, 0.0}, {10.0, 10.0, 10.0});
+	Box<grid_dim, phi_type> box({0.0, 0.0, 0.0}, {10.0, 10.0, 10.0});
 	Ghost<grid_dim, long int> ghost(0);
-	typedef aggregate<double, double> props;
-	typedef grid_dist_id<grid_dim, double, props > grid_in_type;
+	typedef aggregate<phi_type, phi_type> props;
+	typedef grid_dist_id<grid_dim, phi_type, props > grid_in_type;
 	grid_in_type g_dist(sz, box, ghost);
 	g_dist.setPropNames({"Phi_0", "Phi_SDF"});
 	//! @cond [Grid creation] @endcond
@@ -170,7 +171,7 @@ int main(int argc, char* argv[])
 	 */
 	//! @cond [Get sphere] @endcond
 	// Now we initialize the grid with a filled sphere. Outside the sphere, the value of Phi_0 will be -1, inside +1.
-	double radius = 1.0; // Radius of the sphere
+	phi_type radius = 1.0; // Radius of the sphere
 	init_grid_with_sphere<Phi_0_grid>(g_dist, radius, 5, 5, 5); // Initialize sphere onto grid, centered at (5, 5, 5)
 
 	g_dist.write(path_output + "/grid_initial_sphere_preRedistancing_radius" + std::to_string((int)radius) , FORMAT_BINARY); // Save the sphere as vtk file
@@ -217,7 +218,7 @@ int main(int argc, char* argv[])
 	//! @cond [Redistancing options] @endcond
 	// Now we want to convert the initial Phi into a signed distance function (SDF) with magnitude of gradient = 1.
 	// For the initial re-distancing we use the Sussman method. First of all, we can set some redistancing options.
-	Redist_options redist_options;
+	Redist_options<phi_type> redist_options;
 	redist_options.min_iter                             = 1e3;
 	redist_options.max_iter                             = 1e4;
 	
@@ -253,7 +254,8 @@ int main(int argc, char* argv[])
 	 * @snippet example/Numerics/Sussman_redistancing/example_sussman_sphere/main.cpp Run redistancing
 	 */
 	//! @cond [Run redistancing] @endcond
-	RedistancingSussman<grid_in_type> redist_obj(g_dist, redist_options);   // Instantiation of Sussman-redistancing class
+	RedistancingSussman<grid_in_type, phi_type> redist_obj(g_dist, redist_options);   // Instantiation of
+	// Sussman-redistancing class
     //	std::cout << "dt = " << redist_obj.get_time_step() << std::endl;
     // Run the redistancing. in the <> brackets provide property-index where 1.) your initial Phi is stored and 2.) 
     // where the resulting SDF should be written to.
@@ -285,9 +287,9 @@ int main(int argc, char* argv[])
 	// Minimum is 1 property, to which the Phi_SDF can be written
 	// In this example we chose 3 properties. The 1st for the Phi_SDF, the 2nd for the gradient of phi and the 3rd for
 	// the magnitude of the gradient
-	typedef aggregate<double, Point<grid_dim, double>, double> props_nb;
-	typedef vector_dist<grid_dim, double, props_nb> vd_type;
-        Ghost<grid_dim, double> ghost_vd(0);
+	typedef aggregate<phi_type, Point<grid_dim, phi_type>, phi_type> props_nb;
+	typedef vector_dist<grid_dim, phi_type, props_nb> vd_type;
+        Ghost<grid_dim, phi_type> ghost_vd(0);
         vd_type vd_narrow_band(0, box, bc, ghost_vd);
 	vd_narrow_band.setPropNames({"Phi_SDF", "Phi_grad", "Phi_magnOfGrad"});
 	//! @cond [Initialize narrow band] @endcond
@@ -305,14 +307,15 @@ int main(int argc, char* argv[])
 	 * belonged to the narrow band.
 	 *
 	 * Depending on the type of the variable which you define, the width can be either set in terms of number of
-	 * grid points (size_t), physical width (double) or extension of narrow band as physical width inside of object
-	 * and outside the object (double, double).
+	 * grid points (size_t), physical width (phi_type), or extension of the narrow band as physical width inside
+	 * and outside the object (phi_type, phi_type).
 	 *
 	 * @snippet example/Numerics/Sussman_redistancing/example_sussman_sphere/main.cpp Instantiate narrow band
 	 */
 	//! @cond [Instantiate narrow band] @endcond
 	size_t thickness_of_narrowBand_in_grid_points = 6;
-	NarrowBand<grid_in_type> narrowBand(g_dist, redist_options.width_NB_in_grid_points); // Instantiation of NarrowBand class
+	NarrowBand<grid_in_type, phi_type> narrowBand(g_dist, redist_options.width_NB_in_grid_points); // Instantiation of
+	// NarrowBand class
 
 //! @cond [Instantiate narrow band] @endcond
 	
diff --git a/example/Performance/memBW/Makefile b/example/Performance/memBW/Makefile
index 869ea6a98e15487f863e86f4f3504a1d54f32c01..e78251aa5bc7a11f8d382e013340e0dbdf8c8ffc 100644
--- a/example/Performance/memBW/Makefile
+++ b/example/Performance/memBW/Makefile
@@ -53,7 +53,7 @@ memBW: $(OBJ)
 all: memBW
 
 run: memBW
-	mpirun --oversubscribe -np 2 ./memBW
+	mpirun --oversubscribe -np 1 ./memBW
 
 .PHONY: clean all run
 
diff --git a/example/Performance/memBW/main.cu b/example/Performance/memBW/main.cu
index eb0b8c5ef0685df55efc8425163054e00b3d9ac8..ffcf93ae0fad1d97a312801b28af7f3e969abc0b 100644
--- a/example/Performance/memBW/main.cu
+++ b/example/Performance/memBW/main.cu
@@ -154,7 +154,7 @@ int main(int argc, char *argv[])
     openfpm::vector<double> res;
     res.resize(100);
 
-    for (int i = 0 ; i < 110 ; i++)
+/*    for (int i = 0 ; i < 110 ; i++)
     {
         cudaDeviceSynchronize();
         timer t;
@@ -206,7 +206,7 @@ int main(int argc, char *argv[])
     double dev_read_tls = 0.0;
     standard_deviation(res,mean_read_tls,dev_read_tls);
 
-    check_read(in,out);
+    check_read(in,out);*/
 
     //////////////
 
@@ -407,6 +407,52 @@ int main(int argc, char *argv[])
 
     check_read(in,out);
 
+    /////// BASE 1 core
+
+    for (int i = 0 ; i < 110 ; i++)
+    {
+        timer t;
+        t.start();
+
+        float * out_s = (float *)out.getDeviceBuffer<0>();
+        float * out_v = (float *)out.getDeviceBuffer<1>();
+        float * out_m = (float *)out.getDeviceBuffer<2>();
+        float * in_v = (float *)in.getDeviceBuffer<0>();
+
+        int stride = out.capacity();
+
+        // Single-core host baseline: plain lambda over a flat element index.
+        // Note: this assumes the device buffers are host-accessible (CUDA-on-CPU backend).
+        auto lamb_arr_red = [out_s,out_v,out_m,in_v,stride] (int p)
+        {
+            float a = out_s[p];
+
+            float b = out_v[p + 0*stride];
+            float c = out_v[p + 1*stride];
+
+            float d = out_m[p + 0*2*stride + 0*stride];
+            float e = out_m[p + 0*2*stride + 1*stride];
+            float f = out_m[p + 1*2*stride + 0*stride];
+            float g = out_m[p + 1*2*stride + 1*stride];
+
+            float h = in_v[p + 0*stride];
+            in_v[p + 1*stride] = a+b+c+d+e+f+g+h;
+        };
+
+        for (int p = 0 ; p < nele ; p++)
+        {
+            lamb_arr_red(p);
+        }
+
+        t.stop();
+
+        if (i >=10)
+        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}
+
+        std::cout << "Time ARR: " << t.getwct() << std::endl;
+        std::cout << "BW 1-CORE ARR: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s"  << std::endl;
+    }
+
     ///////////////////
 
     #ifdef CUDIFY_USE_CUDA
@@ -441,8 +487,8 @@ int main(int argc, char *argv[])
 
     #endif
 
-    std::cout << "Average READ with TLS: " << mean_read_tls << "  deviation: " << dev_read_tls << std::endl;
-    std::cout << "Average WRITE with TLS: " << mean_write_tls << "  deviation: " << dev_write_tls << std::endl;
+//    std::cout << "Average READ with TLS: " << mean_read_tls << "  deviation: " << dev_read_tls << std::endl;
+//    std::cout << "Average WRITE with TLS: " << mean_write_tls << "  deviation: " << dev_write_tls << std::endl;
 
     std::cout << "Average READ with lamb: " << mean_read_lamb << "  deviation: " << dev_read_lamb << std::endl;
     std::cout << "Average WRITE with lamb: " << mean_write_lamb << "  deviation: " << dev_write_lamb << std::endl;
diff --git a/example/Performance/memBW_multi/Makefile b/example/Performance/memBW_multi/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..0bc4b91aeba230d1d4010000a0ca2b0995086cb7
--- /dev/null
+++ b/example/Performance/memBW_multi/Makefile
@@ -0,0 +1,62 @@
+include ../../example.mk
+
+### This is a trick to avoid "Command not found" if you do not have the NVCC compiler. In practice the normal C++ compiler is used;
+### internally the example disables its code via the preprocessor if it is not compiled with nvcc
+CUDA_CC=
+CUDA_CC_LINK=
+
+CC=mpic++
+ifdef HIP
+        CUDA_CC=hipcc
+        CUDA_OPTIONS= -D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0
+        LIBS_SELECT=$(LIBS)
+        CC=hipcc
+	CUDA_CC_LINK=hipcc
+else
+	ifdef CUDA_ON_CPU
+        	CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+        	INCLUDE_PATH_NVCC=
+        	CUDA_CC_LINK=mpic++
+        	CUDA_OPTIONS=-D__NVCC__ -DCUDART_VERSION=11000
+        	LIBS_SELECT=$(LIBS)
+	else
+        	ifeq (, $(shell which nvcc))
+                	CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+                	INCLUDE_PATH_NVCC=
+                	CUDA_CC_LINK=mpic++
+			LIBS_SELECT=$(LIBS)
+        	else
+                	CUDA_CC=nvcc -ccbin=mpic++
+                	CUDA_CC_LINK=nvcc -ccbin=mpic++
+			LIBS_SELECT=$(LIBS_NVCC)
+        	endif
+	endif
+endif
+
+CC=mpic++
+
+LDIR =
+
+OBJ = main.o
+
+memBW:
+
+%.o: %.cu
+	$(CUDA_CC) -O3 -g  $(CUDA_OPTIONS) $(OPT) -c --std=c++14  -o $@ $< $(INCLUDE_PATH_NVCC)
+
+%.o: %.cpp
+	$(CC) -g -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
+
+memBW: $(OBJ)
+	$(CUDA_CC_LINK) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT)
+
+all: memBW
+
+run: memBW
+	mpirun --oversubscribe -np 2 ./memBW
+
+.PHONY: clean all run
+
+clean:
+	rm -f *.o *~ core memBW
+
diff --git a/example/Performance/memBW_multi/config.cfg b/example/Performance/memBW_multi/config.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..699be429e147cd40187be6ce345ef2f060f59fbc
--- /dev/null
+++ b/example/Performance/memBW_multi/config.cfg
@@ -0,0 +1,2 @@
+[pack]
+files = main.cu Makefile
diff --git a/example/Performance/memBW_multi/main.cu b/example/Performance/memBW_multi/main.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7669858bb4d7a1dc49203a8d664e9f13ac453349
--- /dev/null
+++ b/example/Performance/memBW_multi/main.cu
@@ -0,0 +1,516 @@
+#ifdef __NVCC__
+
+#include "Vector/map_vector.hpp"
+#include "util/stat/common_statistics.hpp"
+
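+// Total number of grid points: 512 * 512 * 256 (matches the grid size sz set in main)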
+#define NELEMENTS 67108864
+
+//! Memory bandwidth with small calculations
+template<typename vector_type, typename vector_type2>
+__global__ void translate_fill_prop_write(vector_type vd_out, vector_type2 vd_in)
+{
+    grid_key_dx<3,int> p({blockIdx.x * blockDim.x + threadIdx.x,
+                          blockIdx.y * blockDim.y + threadIdx.y,
+                          blockIdx.z * blockDim.z + threadIdx.z});
+
+	float a = vd_in.template get<0>(p)[0];
+
+	vd_out.template get<0>(p) = a;
+
+	vd_out.template get<1>(p)[0] = a;
+	vd_out.template get<1>(p)[1] = a;
+
+	vd_out.template get<2>(p)[0][0] = a;
+	vd_out.template get<2>(p)[0][1] = a;
+	vd_out.template get<2>(p)[1][0] = a;
+    vd_out.template get<2>(p)[1][1] = a;
+    vd_in.template get<0>(p)[1] = a;
+}
+
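+//! Memory bandwidth benchmark: read back all properties and store their sum into vd_in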
+template<typename vector_type, typename vector_type2>
+__global__ void translate_fill_prop_read(vector_type vd_out, vector_type2 vd_in)
+{
+    grid_key_dx<3,int> p({blockIdx.x * blockDim.x + threadIdx.x,
+        blockIdx.y * blockDim.y + threadIdx.y,
+        blockIdx.z * blockDim.z + threadIdx.z});
+
+	float a = vd_out.template get<0>(p);
+
+	float b = vd_out.template get<1>(p)[0];
+	float c = vd_out.template get<1>(p)[1];
+
+	float d = vd_out.template get<2>(p)[0][0];
+	float e = vd_out.template get<2>(p)[0][1];
+	float f = vd_out.template get<2>(p)[1][0];
+	float g = vd_out.template get<2>(p)[1][1];
+    
+	float h = vd_in.template get<0>(p)[0];
+	vd_in.template get<0>(p)[1] = a+b+c+d+e+f+g+h;
+}
+
+
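+//! Verify on the host that the write benchmark copied the input value into every output component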
+template<typename in_type, typename out_type>
+void check_write(in_type & in, out_type & out)
+{
+    out.template deviceToHost<0,1,2>();
+    in.template deviceToHost<0>();
+
+    bool success = true;
+    auto it = in.getIterator();
+    while (it.isNext())
+    {
+        auto i = it.get();
+
+        float a = in.template get<0>(i)[0];
+
+        if (i.get(0) == 2 && i.get(1) == 2 && i.get(2) == 2)
+        {
+            success &= a != 0;
+        }
+
+        success &= out.template get<0>(i) == a;
+
+        success &= out.template get<1>(i)[0] == a;
+        success &= out.template get<1>(i)[1] == a;
+
+        success &= out.template get<2>(i)[0][0] == a;
+        success &= out.template get<2>(i)[0][1] == a;
+        success &= out.template get<2>(i)[1][0] == a;
+        success &= out.template get<2>(i)[1][1] == a;
+
+        success &= in.template get<0>(i)[1] == a;
+
+        if (success == false)
+        {
+            std::cout << "FAIL " << a << "   " << i.to_string() << "  " << out.template get<0>(i) << " " << out.template get<1>(i)[0]
+                                                          << " " << out.template get<1>(i)[1] << " " << out.template get<2>(i)[0][0]
+                                                          << " " << out.template get<2>(i)[0][1] << " " << out.template get<2>(i)[1][0]
+                                                          << " " << out.template get<2>(i)[1][1]
+                                                          << std::endl;
+            break;
+        }
+
+        ++it;
+    }
+
+    if (success == false)
+    {
+            std::cout << "FAIL WRITE" << std::endl;
+            exit(1);
+    }
+}
+
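+//! Verify on the host that the read benchmark stored the correct sum in the second component of the input property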
+template<typename in_type, typename out_type>
+void check_read(in_type & in, out_type & out)
+{
+    out.template deviceToHost<0,1,2>();
+    in.template deviceToHost<0>();
+
+    bool success = true;
+    auto it = in.getIterator();
+    while (it.isNext())
+    {
+        auto i = it.get();
+
+        float a = out.template get<0>(i);
+
+        if (i.get(0) == 2 && i.get(1) == 2 && i.get(2) == 2)
+        {
+            success &= a != 0;
+        }
+
+        float b = out.template get<1>(i)[0];
+        float c = out.template get<1>(i)[1];
+
+        float d = out.template get<2>(i)[0][0];
+        float e = out.template get<2>(i)[0][1];
+        float f = out.template get<2>(i)[1][0];
+        float g = out.template get<2>(i)[1][1];
+
+        float h = in.template get<0>(i)[0];
+
+        success &= in.template get<0>(i)[1] == (a+b+c+d+e+f+g+h);
+
+        if (success == false)
+        {
+            std::cout << "FAIL READ " << i.to_string() << "   " << in.template get<0>(i)[1] << " != " << a+b+c+d+e+f+g+h << std::endl;
+            exit(1);
+        }
+
+        ++it;
+    }
+}
+
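+//! Fill the input and output grids with deterministic values derived from the grid key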
+template<typename vector_type, typename vector_type2>
+__global__ void initialize_buff(vector_type vd_out, vector_type2 vd_in)
+{
+    grid_key_dx<3,int> i({blockIdx.x * blockDim.x + threadIdx.x,
+        blockIdx.y * blockDim.y + threadIdx.y,
+        blockIdx.z * blockDim.z + threadIdx.z});
+
+    vd_in.template get<0>(i)[0] = i.get(0) + i.get(1) + i.get(2);
+    vd_in.template get<0>(i)[1] = i.get(0) + i.get(1) + i.get(2)+100.0;
+
+    vd_out.template get<0>(i) = i.get(0) + i.get(1) + i.get(2)+200.0;
+
+    vd_out.template get<1>(i)[0] = i.get(0) + i.get(1) + i.get(2);
+    vd_out.template get<1>(i)[1] = i.get(0) + i.get(1) + i.get(2)+100.0;
+
+    vd_out.template get<2>(i)[0][0] = i.get(0) + i.get(1) + i.get(2);
+    vd_out.template get<2>(i)[0][1] = i.get(0) + i.get(1) + i.get(2)+100.0;
+    vd_out.template get<2>(i)[1][0] = i.get(0) + i.get(1) + i.get(2)+200.0;
+    vd_out.template get<2>(i)[1][1] = i.get(0) + i.get(1) + i.get(2)+300.0;
+}
+
+template<typename vin_type, typename vout_type>
+void initialize_buf(vin_type & in, vout_type & out)
+{
+    auto ite = out.getGPUIterator({0,0,0},{511,511,255});
+    CUDA_LAUNCH(initialize_buff,ite,out.toKernel(),in.toKernel());
+}
+
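+//! Benchmark grid memory bandwidth: TLS kernels, device lambdas, raw-array lambdas, and (with CUDA) plain cudaMemcpy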
+int main(int argc, char *argv[])
+{
+    init_wrappers();
+
+    grid_gpu<3,aggregate<float,float[2],float[2][2]>> out;
+    grid_gpu<3,aggregate<float[2]>> in;
+
+    int nele = NELEMENTS;
+
+    size_t sz[3] = {512,512,256};
+    out.resize(sz);
+    in.resize(sz);
+
+    out.setMemory();
+    in.setMemory();
+
+    initialize_buf(in,out);
+
+    // Read write test with TLS
+
+    auto ite = out.getGPUIterator({0,0,0},{511,511,255});
+
+    openfpm::vector<double> res;
+    res.resize(100);
+
+    for (int i = 0 ; i < 110 ; i++)
+    {
+        cudaDeviceSynchronize();
+        timer t;
+        t.start();
+
+
+        CUDA_LAUNCH(translate_fill_prop_write,ite,out.toKernel(),in.toKernel());
+
+        cudaDeviceSynchronize();
+
+        t.stop();
+
+        if (i >=10)
+        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}
+
+        std::cout << "Time: " << t.getwct() << std::endl;
+        std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s"  << std::endl;
+    }
+
+    double mean_write_tls = 0.0;
+    double dev_write_tls = 0.0;
+    standard_deviation(res,mean_write_tls,dev_write_tls);
+
+    check_write(in,out);
+
+    initialize_buf(in,out);
+
+    for (int i = 0 ; i < 110 ; i++)
+    {
+        cudaDeviceSynchronize();
+        timer t;
+        t.start();
+
+
+        CUDA_LAUNCH(translate_fill_prop_read,ite,out.toKernel(),in.toKernel());
+
+        cudaDeviceSynchronize();
+
+        t.stop();
+
+        if (i >=10)
+        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}
+
+        std::cout << "Time: " << t.getwct() << std::endl;
+        std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s"  << std::endl;
+    }
+
+    double mean_read_tls = 0.0;
+    double dev_read_tls = 0.0;
+    standard_deviation(res,mean_read_tls,dev_read_tls);
+
+    check_read(in,out);
+
+    //////////////
+
+    /////////////////////////////////////////// LAMBDA //////////////////////////////////////////
+
+    initialize_buf(in,out);
+
+    for (int i = 0 ; i < 110 ; i++)
+    {
+        cudaDeviceSynchronize();
+        timer t;
+        t.start();
+
+        auto vd_out = out.toKernel();
+        auto vd_in = in.toKernel();
+
+        auto lamb = [vd_out,vd_in] __device__ (dim3 & blockIdx, dim3 & threadIdx)
+        {
+            grid_key_dx<3,int> p({blockIdx.x * blockDim.x + threadIdx.x,
+                blockIdx.y * blockDim.y + threadIdx.y,
+                blockIdx.z * blockDim.z + threadIdx.z});
+
+            float a = vd_in.template get<0>(p)[0];
+
+            vd_out.template get<0>(p) = a;
+
+            vd_out.template get<1>(p)[0] = a;
+            vd_out.template get<1>(p)[1] = a;
+        
+            vd_out.template get<2>(p)[0][0] = a;
+            vd_out.template get<2>(p)[0][1] = a;
+            vd_out.template get<2>(p)[1][0] = a;
+            vd_out.template get<2>(p)[1][1] = a;
+            vd_in.template get<0>(p)[1] = a;
+        };
+
+        CUDA_LAUNCH_LAMBDA(ite, lamb);
+
+        cudaDeviceSynchronize();
+
+        t.stop();
+
+        if (i >=10)
+        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}
+
+        std::cout << "Time: " << t.getwct() << std::endl;
+        std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s"  << std::endl;
+    }
+
+    double mean_write_lamb = 0.0;
+    double dev_write_lamb = 0.0;
+    standard_deviation(res,mean_write_lamb,dev_write_lamb);
+
+    initialize_buf(in,out);
+
+    for (int i = 0 ; i < 110 ; i++)
+    {
+        cudaDeviceSynchronize();
+        timer t;
+        t.start();
+
+
+        auto vd_out = out.toKernel();
+        auto vd_in = in.toKernel();
+
+        auto lamb = [vd_out,vd_in] __device__ (dim3 & blockIdx, dim3 & threadIdx)
+                            {
+                                grid_key_dx<3,int> p({blockIdx.x * blockDim.x + threadIdx.x,
+                                    blockIdx.y * blockDim.y + threadIdx.y,
+                                    blockIdx.z * blockDim.z + threadIdx.z});
+
+                                float a = vd_out.template get<0>(p);
+
+                                float b = vd_out.template get<1>(p)[0];
+                                float c = vd_out.template get<1>(p)[1];
+                            
+                                float d = vd_out.template get<2>(p)[0][0];
+                                float e = vd_out.template get<2>(p)[0][1];
+                                float f = vd_out.template get<2>(p)[1][0];
+                                float g = vd_out.template get<2>(p)[1][1];
+                                
+                                float h = vd_in.template get<0>(p)[0];
+                                vd_in.template get<0>(p)[1] = a+b+c+d+e+f+g+h;
+                            };
+
+        CUDA_LAUNCH_LAMBDA(ite, lamb);
+
+        cudaDeviceSynchronize();
+
+        t.stop();
+
+        if (i >=10)
+        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}
+
+        std::cout << "Time: " << t.getwct() << std::endl;
+        std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s"  << std::endl;
+    }
+
+    double mean_read_lamb = 0.0;
+    double dev_read_lamb = 0.0;
+    standard_deviation(res,mean_read_lamb,dev_read_lamb);
+
+    // Array benchmark
+    initialize_buf(in,out);
+
+    for (int i = 0 ; i < 110 ; i++)
+    {
+        cudaDeviceSynchronize();
+        timer t;
+        t.start();
+
+        float * out_s = (float *)out.getDeviceBuffer<0>();
+        float * out_v = (float *)out.getDeviceBuffer<1>();
+        float * out_m = (float *)out.getDeviceBuffer<2>();
+        float * in_v = (float *)in.getDeviceBuffer<0>();
+        
+        int sz0 = sz[0];
+        int sz1 = sz[1];
+        int sz2 = sz[2];
+        int stride = out.size();
+
+        auto lamb_arr_write = [out_s,out_v,out_m,in_v,sz0,sz1,sz2,stride] __device__ (dim3 & blockIdx, dim3 & threadIdx)
+        {
+            auto p1 = blockIdx.x * blockDim.x + threadIdx.x;
+            auto p2 = blockIdx.y * blockDim.y + threadIdx.y;
+            auto p3 = blockIdx.z * blockDim.z + threadIdx.z;
+
+            float a = in_v[p1 + p2*sz0 + p3*sz0*sz1 + 0*stride];
+
+            out_s[p1 + p2*sz0 + p3*sz0*sz1] = a;
+        
+            out_v[p1 + p2*sz0 + p3*sz0*sz1 + 0*stride] = a;
+            out_v[p1 + p2*sz0 + p3*sz0*sz1 + 1*stride] = a;
+        
+            out_m[p1 + p2*sz0 + p3*sz0*sz1 + 0*2*stride + 0*stride ] = a;
+            out_m[p1 + p2*sz0 + p3*sz0*sz1 + 0*2*stride + 1*stride ] = a;
+            out_m[p1 + p2*sz0 + p3*sz0*sz1 + 1*2*stride + 0*stride ] = a;
+            out_m[p1 + p2*sz0 + p3*sz0*sz1 + 1*2*stride + 1*stride ] = a;
+            in_v[p1 + p2*sz0 + p3*sz0*sz1 + 1*stride] = a;
+        };
+
+        CUDA_LAUNCH_LAMBDA(ite,lamb_arr_write);
+
+        cudaDeviceSynchronize();
+
+        t.stop();
+
+        if (i >=10)
+        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}
+
+        std::cout << "Time ARR: " << t.getwct() << std::endl;
+        std::cout << "BW ARR: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s"  << std::endl;
+    }
+
+    double mean_write_arr = 0.0;
+    double dev_write_arr = 0.0;
+    standard_deviation(res,mean_write_arr,dev_write_arr);
+
+    check_write(in,out);
+
+    for (int i = 0 ; i < 110 ; i++)
+    {
+        cudaDeviceSynchronize();
+        timer t;
+        t.start();
+
+        float * out_s = (float *)out.getDeviceBuffer<0>();
+        float * out_v = (float *)out.getDeviceBuffer<1>();
+        float * out_m = (float *)out.getDeviceBuffer<2>();
+        float * in_v = (float *)in.getDeviceBuffer<0>();
+        
+        int sz0 = sz[0];
+        int sz1 = sz[1];
+        int sz2 = sz[2];
+        int stride = out.size();
+
+        auto lamb_arr_red = [out_s,out_v,out_m,in_v,sz0,sz1,sz2,stride] __device__ (dim3 & blockIdx, dim3 & threadIdx)
+        {
+            auto p1 = blockIdx.x * blockDim.x + threadIdx.x;
+            auto p2 = blockIdx.y * blockDim.y + threadIdx.y;
+            auto p3 = blockIdx.z * blockDim.z + threadIdx.z;
+
+            float a = out_s[p1 + p2*sz0 + p3*sz0*sz1];
+        
+            float b = out_v[p1 + p2*sz0 + p3*sz0*sz1 + 0*stride];
+            float c = out_v[p1 + p2*sz0 + p3*sz0*sz1 + 1*stride];
+        
+            float d = out_m[p1 + p2*sz0 + p3*sz0*sz1 + 0*2*stride + 0*stride];
+            float e = out_m[p1 + p2*sz0 + p3*sz0*sz1 + 0*2*stride + 1*stride];
+            float f = out_m[p1 + p2*sz0 + p3*sz0*sz1 + 1*2*stride + 0*stride];
+            float g = out_m[p1 + p2*sz0 + p3*sz0*sz1 + 1*2*stride + 1*stride];
+            
+            float h = in_v[p1 + p2*sz0 + p3*sz0*sz1 + 0*stride];
+            in_v[p1 + p2*sz0 + p3*sz0*sz1 + 1*stride] = a+b+c+d+e+f+g+h;
+        };
+
+        CUDA_LAUNCH_LAMBDA(ite,lamb_arr_red);
+
+        cudaDeviceSynchronize();
+
+        t.stop();
+
+        if (i >=10)
+        {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;}
+
+        std::cout << "Time ARR: " << t.getwct() << std::endl;
+        std::cout << "BW ARR: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s"  << std::endl;
+    }
+
+    double mean_read_arr = 0.0;
+    double dev_read_arr = 0.0;
+    standard_deviation(res,mean_read_arr,dev_read_arr);
+
+    check_read(in,out);
+
+    ///////////////////
+
+    #ifdef CUDIFY_USE_CUDA
+
+    for (int i = 0 ; i < 110 ; i++)
+    {
+        cudaDeviceSynchronize();
+        timer t;
+        t.start();
+
+        float * a = (float *)in.getDeviceBuffer<0>();
+        float * b = (float *)out.getDeviceBuffer<1>();
+
+        cudaMemcpy(a,b,2*NELEMENTS*4,cudaMemcpyDeviceToDevice);
+
+        cudaDeviceSynchronize();
+
+        t.stop();
+
+        if (i >=10)
+        {res.get(i-10) = (double)nele*4*4 / t.getwct() * 1e-9;}
+
+        std::cout << "Time: " << t.getwct() << std::endl;
+        std::cout << "BW: " << (double)nele*4*4 / t.getwct() * 1e-9 << " GB/s"  << std::endl;
+    }    
+
+    double mean_read_mes = 0.0;
+    double dev_read_mes = 0.0;
+    standard_deviation(res,mean_read_mes,dev_read_mes);
+
+    std::cout << "Average measured: " << mean_read_mes << "  deviation: " << dev_read_mes << std::endl;
+
+    #endif
+
+    std::cout << "Average READ with TLS: " << mean_read_tls << "  deviation: " << dev_read_tls << std::endl;
+    std::cout << "Average WRITE with TLS: " << mean_write_tls << "  deviation: " << dev_write_tls << std::endl;
+
+    std::cout << "Average READ with lamb: " << mean_read_lamb << "  deviation: " << dev_read_lamb << std::endl;
+    std::cout << "Average WRITE with lamb: " << mean_write_lamb << "  deviation: " << dev_write_lamb << std::endl;
+
+    std::cout << "Average WRITE with array: " << mean_write_arr << "  deviation: " << dev_write_arr << std::endl;
+    std::cout << "Average READ with array: " << mean_read_arr << "  deviation: " << dev_read_arr << std::endl;
+}
+
+#else
+
+int main(int argc, char *argv[])
+{
+}
+
+#endif
+
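Note on the bandwidth figures printed above: each kernel touches 9 floats per grid point (1 scalar, 2 vector components, 4 tensor components, plus 2 components of in), so the reported number is simply bytes moved over wall-clock time. A minimal sketch of that conversion, reusing the benchmark's own nele and timer t (explanatory only, not part of the patch):

    // bytes moved per iteration: nele points * 9 floats * sizeof(float) bytes
    double bytes  = (double)nele * 9 * sizeof(float);
    // effective bandwidth in GB/s (1e-9 converts B/s to GB/s)
    double bw_gbs = bytes / t.getwct() * 1e-9;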
diff --git a/example/Performance/miniBUDE/main.cu b/example/Performance/miniBUDE/main.cu
index d845ef21f395f59082f687ca046e392cfb301b3c..9bc27e11716006d35300e627d5871cc69c013b92 100644
--- a/example/Performance/miniBUDE/main.cu
+++ b/example/Performance/miniBUDE/main.cu
@@ -606,7 +606,7 @@ void loadParameters(int argc, char *argv[], OpenFPM & _openfpm)
   fclose(file);
 }
 
-#ifndef __APPLE__
+#if !defined(__APPLE__) && !defined(__powerpc64__)
 #include <fenv.h>
 #include <xmmintrin.h>
 #include <pmmintrin.h>
@@ -614,7 +614,7 @@ void loadParameters(int argc, char *argv[], OpenFPM & _openfpm)
 
 int main(int argc, char *argv[])
 {
-#ifndef __APPLE__
+#if !defined(__APPLE__) && !defined(__powerpc64__)
   _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
   _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
 #endif
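The additional !defined(__powerpc64__) guard is needed because the flush-to-zero / denormals-are-zero controls come from the x86 SSE headers (xmmintrin.h, pmmintrin.h) and are not available or meaningful on POWER. A minimal, self-contained sketch of the guarded setup (illustrative only):

    #if !defined(__APPLE__) && !defined(__powerpc64__)
    #include <xmmintrin.h>
    #include <pmmintrin.h>
    #endif

    int main()
    {
    #if !defined(__APPLE__) && !defined(__powerpc64__)
        // denormal handling is an x86/SSE feature; skip it on other architectures
        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
        _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
    #endif
        return 0;
    }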
diff --git a/example/Vector/0_simple/main.cpp b/example/Vector/0_simple/main.cpp
index cb4059b2b9e83058adb008c589b98370743db236..c7f292a091c18ae83fce9b01d196fa0db6a138eb 100644
--- a/example/Vector/0_simple/main.cpp
+++ b/example/Vector/0_simple/main.cpp
@@ -65,9 +65,12 @@
 #include "Vector/vector_dist.hpp"
 //! \cond [inclusion] \endcond
 
+
 int main(int argc, char* argv[])
 {
 
+	std::cout << is_layout_inte<memory_traits_custom_separation_of_buffer<aggregate<int>>>::value << std::endl;
+
 	/*!
 	 * \page Vector_0_simple Vector 0 simple
 	 *
diff --git a/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu b/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu
index bb4757617af0646f583365875fea074239a3f32a..181470d623c1dc2cb7efbd777a557c247a21e9e8 100644
--- a/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu
+++ b/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu
@@ -987,7 +987,7 @@ int main(int argc, char* argv[])
 	{
 		Vcluster<> & v_cl = create_vcluster();
 		timer it_time;
-
+		it_time.start();
 
 		////// Do rebalancing every 200 timesteps
 		it_reb++;
diff --git a/openfpm_io b/openfpm_io
index 3f01780ef8e9772b70db41f38df7ff8d97fc2b6b..037e4db6149cee2a40e647ddfc93f5505a52721f 160000
--- a/openfpm_io
+++ b/openfpm_io
@@ -1 +1 @@
-Subproject commit 3f01780ef8e9772b70db41f38df7ff8d97fc2b6b
+Subproject commit 037e4db6149cee2a40e647ddfc93f5505a52721f
diff --git a/src/Decomposition/CartDecomposition.hpp b/src/Decomposition/CartDecomposition.hpp
index 7c52034ff6e7b6af27fa65cd5be945b8938d8e04..e639816c6515ecc0013cbb450ec0affb763aa3b7 100644
--- a/src/Decomposition/CartDecomposition.hpp
+++ b/src/Decomposition/CartDecomposition.hpp
@@ -157,6 +157,12 @@ public:
 	//! This class admit a class defined on an extended domain
 	typedef CartDecomposition_ext<dim,T,Memory,layout_base,Distribution> extended_type;
 
+	typedef ie_loc_ghost<dim, T,layout_base, Memory> ie_loc_ghost_type;
+
+	typedef nn_prcs<dim, T,layout_base,Memory> nn_prcs_type;
+
+	typedef ie_ghost<dim,T,Memory,layout_base> ie_ghost_type;
+
 protected:
 
 	//! bool that indicate whenever the buffer has been already transfer to device
@@ -1016,6 +1022,47 @@ public:
 		return *this;
 	}
 
+	/*! \brief Copy the element
+	 *
+	 * \param cart element to copy
+	 *
+	 * \return itself
+	 *
+	 */
+	template<typename CartDecomposition2>
+	CartDecomposition<dim,T,Memory, layout_base, Distribution> & operator=(const CartDecomposition2 & cart)
+	{
+		static_cast<ie_loc_ghost<dim,T,layout_base,Memory>*>(this)->operator=(static_cast<const typename CartDecomposition2::ie_loc_ghost_type &>(cart));
+		static_cast<nn_prcs<dim,T,layout_base,Memory>*>(this)->operator=(static_cast<const typename CartDecomposition2::nn_prcs_type &>(cart));
+		static_cast<ie_ghost<dim,T,Memory,layout_base>*>(this)->operator=(static_cast<const typename CartDecomposition2::ie_ghost_type &>(cart));
+
+		sub_domains = cart.private_get_sub_domains();
+		box_nn_processor = cart.private_get_box_nn_processor();
+		fine_s = cart.private_get_fine_s();
+		gr = cart.private_get_gr();
+		gr_dist = cart.private_get_gr_dist();
+		dist = cart.private_get_dist();
+		commCostSet = cart.private_get_commCostSet();
+		cd = cart.private_get_cd();
+		domain = cart.private_get_domain();
+		sub_domains_global = cart.private_get_sub_domains_global();
+
+		for (size_t i = 0 ; i < dim ; i++)
+		{
+			spacing[i] = cart.private_get_spacing(i);
+			magn[i] = cart.private_get_magn(i);
+		};
+
+		ghost = cart.private_get_ghost();
+
+		bbox = cart.private_get_bbox();
+
+		for (size_t i = 0 ; i < dim ; i++)
+		{bc[i] = cart.private_get_bc(i);}
+
+		return *this;
+	}
+
 	/*! \brief Copy the element, move semantic
 	 *
 	 * \param cart element to copy
@@ -2099,6 +2146,16 @@ public:
 		return sub_domains;
 	}
 
+	/*! \brief Return the internal data structure sub_domains
+	 *
+	 * \return sub_domains
+	 *
+	 */
+	const openfpm::vector<SpaceBox<dim, T>> & private_get_sub_domains() const
+	{
+		return sub_domains;
+	}
+
 	/*! \brief Return the internal data structure box_nn_processor
 	 *
 	 * \return box_nn_processor
@@ -2109,6 +2166,16 @@ public:
 		return box_nn_processor;
 	}
 
+	/*! \brief Return the internal data structure box_nn_processor
+	 *
+	 * \return box_nn_processor
+	 *
+	 */
+	const openfpm::vector<openfpm::vector<long unsigned int> > & private_get_box_nn_processor() const
+	{
+		return box_nn_processor;
+	}
+
 	/*! \brief Return the internal data structure fine_s
 	 *
 	 * \return fine_s
@@ -2119,6 +2186,16 @@ public:
 		return fine_s;
 	}
 
+	/*! \brief Return the internal data structure fine_s
+	 *
+	 * \return fine_s
+	 *
+	 */
+	const CellList<dim,T,Mem_fast<Memory,int>,shift<dim,T>> & private_get_fine_s() const
+	{
+		return fine_s;
+	}
+
 	/*! \brief Return the internal data structure gr
 	 *
 	 * \return gr
@@ -2129,6 +2206,16 @@ public:
 		return gr;
 	}
 
+	/*! \brief Return the internal data structure gr
+	 *
+	 * \return gr
+	 *
+	 */
+	const grid_sm<dim, void> & private_get_gr() const
+	{
+		return gr;
+	}
+
 	/*! \brief Return the internal data structure gr_dist
 	 *
 	 * \return gr_dist
@@ -2139,6 +2226,16 @@ public:
 		return gr_dist;
 	}
 
+	/*! \brief Return the internal data structure gr_dist
+	 *
+	 * \return gr_dist
+	 *
+	 */
+	const grid_sm<dim, void> & private_get_gr_dist() const
+	{
+		return gr_dist;
+	}
+
 	/*! \brief Return the internal data structure dist
 	 *
 	 * \return dist
@@ -2149,6 +2246,16 @@ public:
 		return dist;
 	}
 
+	/*! \brief Return the internal data structure dist
+	 *
+	 * \return dist
+	 *
+	 */
+	const Distribution & private_get_dist() const
+	{
+		return dist;
+	}
+
 	/*! \brief Return the internal data structure commCostSet
 	 *
 	 * \return commCostSet
@@ -2159,6 +2266,16 @@ public:
 		return commCostSet;
 	}
 
+	/*! \brief Return the internal data structure commCostSet
+	 *
+	 * \return commCostSet
+	 *
+	 */
+	const bool & private_get_commCostSet() const
+	{
+		return commCostSet;
+	}
+
 	/*! \brief Return the internal data structure cd
 	 *
 	 * \return cd
@@ -2169,6 +2286,16 @@ public:
 		return cd;
 	}
 
+	/*! \brief Return the internal data structure cd
+	 *
+	 * \return cd
+	 *
+	 */
+	const CellDecomposer_sm<dim, T, shift<dim,T>> & private_get_cd() const
+	{
+		return cd;
+	}
+
 	/*! \brief Return the internal data structure domain
 	 *
 	 * \return domain
@@ -2179,6 +2306,16 @@ public:
 		return domain;
 	}
 
+	/*! \brief Return the internal data structure domain
+	 *
+	 * \return domain
+	 *
+	 */
+	const ::Box<dim,T> & private_get_domain() const
+	{
+		return domain;
+	}
+
 	/*! \brief Return the internal data structure sub_domains_global
 	 *
 	 * \return sub_domains_global
@@ -2189,6 +2326,16 @@ public:
 		return sub_domains_global;
 	}
 
+	/*! \brief Return the internal data structure sub_domains_global
+	 *
+	 * \return sub_domains_global
+	 *
+	 */
+	const openfpm::vector<Box_map<dim, T>,Memory,layout_base> & private_get_sub_domains_global() const
+	{
+		return sub_domains_global;
+	}
+
 	/*! \brief Return the internal data structure spacing
 	 *
 	 * \return spacing
@@ -2199,6 +2346,37 @@ public:
 		return spacing[i];
 	}
 
+	/*! \brief Return the internal data structure spacing
+	 *
+	 * \return spacing
+	 *
+	 */
+	const T & private_get_spacing(int i) const
+	{
+		return spacing[i];
+	}
+
+	/*! \brief Return the internal data structure magn
+	 *
+	 * \return magn
+	 *
+	 */
+	T & private_get_magn(int i)
+	{
+		return magn[i];
+	}
+
+	/*! \brief Return the internal data structure magn
+	 *
+	 * \return magn
+	 *
+	 */
+	const T & private_get_magn(int i) const
+	{
+		return magn[i];
+	}
+
+
 	/*! \brief Return the internal data structure ghost
 	 *
 	 * \return ghost
@@ -2209,6 +2387,16 @@ public:
 		return ghost;
 	}
 
+	/*! \brief Return the internal data structure ghost
+	 *
+	 * \return ghost
+	 *
+	 */
+	const Ghost<dim,T> & private_get_ghost() const
+	{
+		return ghost;
+	}
+
 	/*! \brief Return the internal data structure bbox
 	 *
 	 * \return bbox
@@ -2219,6 +2407,16 @@ public:
 		return bbox;
 	}
 
+	/*! \brief Return the internal data structure bbox
+	 *
+	 * \return bbox
+	 *
+	 */
+	const ::Box<dim,T> & private_get_bbox() const
+	{
+		return bbox;
+	}
+
 	/*! \brief Return the internal data structure bc
 	 *
 	 * \return bc
@@ -2228,6 +2426,16 @@ public:
 	{
 		return bc[i];
 	}
+
+	/*! \brief Return the internal data structure bc
+	 *
+	 * \return bc
+	 *
+	 */
+	const size_t & private_get_bc(int i) const
+	{
+		return bc[i];
+	}
 };
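The templated operator= above, together with the new const private_get_* accessors, lets a CartDecomposition be copied from a CartDecomposition with a different Memory / layout_base combination; the templated grid_dist_id constructors further down rely on this. A minimal usage sketch (the concrete type parameters are illustrative assumptions, not taken from a specific test):

	// copy a decomposition across Memory/layout variants
	CartDecomposition<3,float,CudaMemory,memory_traits_inte> dec_gpu(create_vcluster());
	// ... dec_gpu gets decomposed elsewhere ...
	CartDecomposition<3,float,HeapMemory,memory_traits_lin> dec_cpu(create_vcluster());
	dec_cpu = dec_gpu;   // routed through the templated operator= above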
 
 
diff --git a/src/Decomposition/ie_ghost.hpp b/src/Decomposition/ie_ghost.hpp
index 2eadea6bad6427ae702d8ffadb48510c5ccf9a6d..2e8a9633ff7efcd5ded73c6e6c2cfb6420a0fbea 100644
--- a/src/Decomposition/ie_ghost.hpp
+++ b/src/Decomposition/ie_ghost.hpp
@@ -581,6 +581,27 @@ public:
 		return *this;
 	}
 
+	//! Copy operator
+	template<typename Memory2, template<typename> class layout_base2>
+	inline ie_ghost<dim,T,Memory,layout_base> & operator=(const ie_ghost<dim,T,Memory2,layout_base2> & ie)
+	{
+		box_nn_processor_int = ie.private_get_box_nn_processor_int();
+		proc_int_box = ie.private_get_proc_int_box();
+		vb_ext = ie.private_get_vb_ext();
+		vb_int = ie.private_get_vb_int();
+		vb_int_box = ie.private_get_vb_int_box();
+		geo_cell = ie.private_geo_cell();
+		shifts = ie.private_get_shifts();
+		ids_p = ie.private_get_ids_p();
+		ids = ie.private_get_ids();
+		domain = ie.private_get_domain();
+
+		for (int i = 0 ; i < dim ; i++)
+		{bc[i] = ie.private_get_bc(i);}
+
+		return *this;
+	}
+
 
 
 	/*! \brief duplicate this structure changing layout and Memory
@@ -1267,6 +1288,16 @@ public:
 		ids.clear();
 	}
 
+	/*! \brief Return the internal data structure box_nn_processor_int
+	 *
+	 * \return box_nn_processor_int
+	 *
+	 */
+	inline const openfpm::vector< openfpm::vector< Box_proc<dim,T> > > & private_get_box_nn_processor_int() const
+	{
+		return box_nn_processor_int;
+	}
+
 	/*! \brief Return the internal data structure box_nn_processor_int
 	 *
 	 * \return box_nn_processor_int
@@ -1287,6 +1318,16 @@ public:
 		return proc_int_box;
 	}
 
+	/*! \brief Return the internal data structure proc_int_box
+	 *
+	 * \return proc_int_box
+	 *
+	 */
+	inline const openfpm::vector< Box_dom<dim,T> > & private_get_proc_int_box() const
+	{
+		return proc_int_box;
+	}
+
 	/*! \brief Return the internal data structure vb_ext
 	 *
 	 * \return vb_ext
@@ -1297,6 +1338,16 @@ public:
 		return vb_ext;
 	}
 
+	/*! \brief Return the internal data structure vb_ext
+	 *
+	 * \return vb_ext
+	 *
+	 */
+	inline const openfpm::vector<p_box<dim,T> > & private_get_vb_ext() const
+	{
+		return vb_ext;
+	}
+
 	/*! \brief Return the internal data structure vb_int
 	 *
 	 * \return vb_int
@@ -1308,6 +1359,17 @@ public:
 		return vb_int;
 	}
 
+	/*! \brief Return the internal data structure vb_int
+	 *
+	 * \return vb_int
+	 *
+	 */
+	inline const openfpm::vector<aggregate<unsigned int,unsigned int,unsigned int>,Memory,layout_base> &
+	private_get_vb_int() const
+	{
+		return vb_int;
+	}
+
 	/*! \brief Return the internal data structure vb_int_box
 	 *
 	 * \return vb_int_box
@@ -1319,6 +1381,17 @@ public:
 		return vb_int_box;
 	}
 
+	/*! \brief Return the internal data structure vb_int_box
+	 *
+	 * \return vb_int_box
+	 *
+	 */
+	inline const openfpm::vector<Box<dim,T>,Memory,layout_base> &
+	private_get_vb_int_box() const
+	{
+		return vb_int_box;
+	}
+
 	/*! \brief Return the internal data structure proc_int_box
 	 *
 	 * \return proc_int_box
@@ -1330,6 +1403,17 @@ public:
 		return geo_cell;
 	}
 
+	/*! \brief Return the internal data structure geo_cell
+	 *
+	 * \return geo_cell
+	 *
+	 */
+	inline const CellList<dim,T,Mem_fast<Memory,int>,shift<dim,T>> &
+	private_geo_cell() const
+	{
+		return geo_cell;
+	}
+
 	/*! \brief Return the internal data structure shifts
 	 *
 	 * \return shifts
@@ -1341,6 +1425,17 @@ public:
 		return shifts;
 	}
 
+	/*! \brief Return the internal data structure shifts
+	 *
+	 * \return shifts
+	 *
+	 */
+	inline const openfpm::vector<Point<dim,T>,Memory,layout_base> &
+	private_get_shifts() const
+	{
+		return shifts;
+	}
+
 	/*! \brief Return the internal data structure ids_p
 	 *
 	 * \return ids_p
@@ -1352,6 +1447,17 @@ public:
 		return ids_p;
 	}
 
+	/*! \brief Return the internal data structure ids_p
+	 *
+	 * \return ids_p
+	 *
+	 */
+	inline const openfpm::vector<std::pair<size_t,size_t>> &
+	private_get_ids_p() const
+	{
+		return ids_p;
+	}
+
 	/*! \brief Return the internal data structure ids_p
 	 *
 	 * \return ids_p
@@ -1363,6 +1469,17 @@ public:
 		return ids;
 	}
 
+	/*! \brief Return the internal data structure ids
+	 *
+	 * \return ids
+	 *
+	 */
+	inline const openfpm::vector<size_t> &
+	private_get_ids() const
+	{
+		return ids;
+	}
+
 	/*! \brief Return the internal data structure domain
 	 *
 	 * \return domain
@@ -1374,6 +1491,22 @@ public:
 		return domain;
 	}
 
+	/*! \brief Return the internal data structure domain
+	 *
+	 * \return domain
+	 *
+	 */
+	inline const Box<dim,T> &
+	private_get_domain() const
+	{
+		return domain;
+	}
+
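+	/*! \brief Return the internal data structure bc
+	 *
+	 * \return the boundary condition for direction i
+	 *
+	 */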
+	size_t private_get_bc(int i) const
+	{
+		return bc[i];
+	}
+
 	/*! \brief Return the internal data structure domain
 	 *
 	 * \return domain
diff --git a/src/Grid/grid_dist_id.hpp b/src/Grid/grid_dist_id.hpp
index 1a0e885f9f6d373d33954d10ab7f1b3ddfba2783..cd006f9c87d42201f138cac99ee95086fd1c7b1f 100644
--- a/src/Grid/grid_dist_id.hpp
+++ b/src/Grid/grid_dist_id.hpp
@@ -1314,10 +1314,11 @@ public:
      * \param ghost Ghost part
      *
      */
-    grid_dist_id(const Decomposition & dec,
+	template<typename Decomposition2>
+    grid_dist_id(const Decomposition2 & dec,
     		     const size_t (& g_sz)[dim],
 				 const Ghost<dim,St> & ghost)
-    :domain(dec.getDomain()),ghost(ghost),ghost_int(INVALID_GHOST),dec(dec),v_cl(create_vcluster()),
+    :domain(dec.getDomain()),ghost(ghost),ghost_int(INVALID_GHOST),dec(create_vcluster()),v_cl(create_vcluster()),
 	 ginfo(g_sz),ginfo_v(g_sz)
 	{
 #ifdef SE_CLASS2
@@ -1368,10 +1369,33 @@ public:
 	:domain(dec.getDomain()),ghost_int(g),dec(create_vcluster()),v_cl(create_vcluster()),
 	 ginfo(g_sz),ginfo_v(g_sz)
 	{
-#ifdef SE_CLASS2
-		check_new(this,8,GRID_DIST_EVENT,4);
-#endif
+		InitializeCellDecomposer(g_sz,dec.periodicity());
+
+		ghost = convert_ghost(g,cd_sm);
+		this->dec = dec.duplicate(ghost);
+
+		// an empty set of boxes
+		openfpm::vector<Box<dim,long int>> empty;
+
+		// Initialize structures
+		InitializeStructures(g_sz,empty,g,false);
+	}
 
+    /*! It constructs a grid of a specified size, defined on a specified Box space, forcing to follow a specified decomposition, and having a specified ghost size
+     *
+     * \param dec Decomposition
+     * \param g_sz grid size on each dimension
+     * \param g Ghost part (given in grid units)
+     *
+     * \warning In very rare case the ghost part can be one point bigger than the one specified
+     *
+     */
+	template<typename Decomposition2>
+	grid_dist_id(const Decomposition2 & dec, const size_t (& g_sz)[dim],
+			     const Ghost<dim,long int> & g)
+	:domain(dec.getDomain()),ghost_int(g),dec(create_vcluster()),v_cl(create_vcluster()),
+	 ginfo(g_sz),ginfo_v(g_sz)
+	{
 		InitializeCellDecomposer(g_sz,dec.periodicity());
 
 		ghost = convert_ghost(g,cd_sm);
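Making the constructor a template on Decomposition2 means a grid can now be built directly on the decomposition of a grid of a different type; the dense-to-sparse conversion test added below exercises exactly this. A minimal sketch, assuming sz, domain, g and pr are defined as in that test:

	grid_dist_id<3, float, aggregate<float,float>> g_dist(sz,domain,g,pr);
	// reuse the same decomposition for a sparse GPU grid of a different type
	sgrid_dist_id_gpu<3,float,aggregate<float,float>> sgdist(g_dist.getDecomposition(),sz,g);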
diff --git a/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu b/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
index e2485f2451594448e57e1cba218fce7812888d71..3e516415a91f7d86224d1cd0dc125036b3f1aff0 100644
--- a/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
+++ b/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
@@ -190,24 +190,24 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_output )
 
 BOOST_AUTO_TEST_CASE( sgrid_gpu_test_save_and_load )
 {
-/*	auto & v_cl = create_vcluster();
+	auto & v_cl = create_vcluster();
 
 	if (v_cl.size() > 3){return;}
 
-	size_t sz[2] = {17,17};
+	size_t sz[2] = {370,370};
 	periodicity<2> bc = {PERIODIC,PERIODIC};
 
 	Ghost<2,long int> g(1);
 
 	Box<2,float> domain({0.0,0.0},{1.0,1.0});
 
-	sgrid_dist_id_gpu<2,float,aggregate<float,float,float[2]>> gdist(sz,domain,g,bc);
+	sgrid_dist_id_gpu<2,float,aggregate<float,float>> gdist(sz,domain,g,bc);
 
 	gdist.template setBackgroundValue<0>(666);
 
 	/////// GPU insert + flush
 
-	Box<2,size_t> box({1,1},{15,15});
+	Box<2,size_t> box({1,1},{350,350});
 	auto it = gdist.getGridIterator(box.getKP1(),box.getKP2());
 
 	/////// GPU Run kernel
@@ -224,13 +224,10 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_save_and_load )
 			        {
 			        	data.template get<0>() = c + i + j;
 						data.template get<1>() = c + 1000 + i + j;
-						
-						data.template get<2>()[0] = i;
-						data.template get<2>()[1] = j;
 			        }
 			        );
 
-	gdist.template flush<smax_<0>,smax_<1>,smax_<2>>(flush_type::FLUSH_ON_DEVICE);
+	gdist.template flush<smax_<0>,smax_<1>>(flush_type::FLUSH_ON_DEVICE);
 
 	gdist.template deviceToHost<0>();
 
@@ -238,9 +235,85 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_save_and_load )
 
 	// Now load
 
-	sgrid_dist_id_gpu<2,float,aggregate<float,float,float[2]>> gdist2(sz,domain,g,bc);
+	sgrid_dist_id_gpu<2,float,aggregate<float,float>> gdist2(sz,domain,g,bc);
+
+	gdist2.load("sgrid_gpu_output_hdf5");
+
+	gdist2.template ghost_get<0,1>(RUN_ON_DEVICE);
+
+	gdist2.deviceToHost<0,1>();
+	gdist.deviceToHost<0,1>();
+
+	bool match = true;
+
+
+	auto it2 = gdist2.getDomainIterator();
+
+	while (it2.isNext())
+	{
+		auto p = it2.get();
+
+		auto key = it2.getGKey(p);
+
+		auto p_xp1 = p.move(0,1);
+		auto p_xm1 = p.move(0,-1);
+		auto p_yp1 = p.move(1,1);
+		auto p_ym1 = p.move(1,-1);
+
+		auto key_xp1 = key.move(0,1);
+		auto key_xm1 = key.move(0,-1);
+		auto key_yp1 = key.move(1,1);
+		auto key_ym1 = key.move(1,-1);
+
+		if (box.isInside(key_xp1.toPoint()))
+		{
+			match &= gdist.template get<0>(p_xp1) == c + key_xp1.get(0) + key_xp1.get(1);
+
+			if (match == false)
+			{
+				std::cout << gdist.template get<0>(p_xp1) << "   " << c + key_xp1.get(0) + key_xp1.get(1) << std::endl;
+				break;
+			}
+		}
+
+		if (box.isInside(key_xm1.toPoint()))
+		{
+			match &= gdist.template get<0>(p_xm1) == c + key_xm1.get(0) + key_xm1.get(1);
+
+			if (match == false)
+			{
+				std::cout << gdist.template get<0>(p_xm1) << "   " << c + key_xm1.get(0) + key_xm1.get(1) << std::endl;
+				break;
+			}
+		}
+
+		if (box.isInside(key_yp1.toPoint()))
+		{
+			match &= gdist.template get<0>(p_yp1) == c + key_yp1.get(0) + key_yp1.get(1);
 
-	gdist2.load("sgrid_gpu_output_hdf5");*/
+			if (match == false)
+			{
+				std::cout << gdist.template get<0>(p_yp1) << "   " << c + key_yp1.get(0) + key_yp1.get(1) << std::endl;
+				break;
+			}
+		}
+
+		if (box.isInside(key_ym1.toPoint()))
+		{
+			match &= gdist.template get<0>(p_ym1) == c + key_ym1.get(0) + key_ym1.get(1);
+
+			if (match == false)
+			{
+				std::cout << gdist.template get<0>(p_ym1) << "   " << c + key_ym1.get(0) + key_ym1.get(1) << std::endl;
+				break;
+			}
+		}
+
+		++it2;
+	}
+
+
+	BOOST_REQUIRE_EQUAL(match,true);
 }
 
 void sgrid_ghost_get(size_t (& sz)[2],size_t (& sz2)[2])
@@ -1207,4 +1280,63 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_conv_background )
 	BOOST_REQUIRE_EQUAL(match,true);
 }
 
+BOOST_AUTO_TEST_CASE( grid_dense_to_sparse_conversion )
+{
+	Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
+
+	// grid size
+	size_t sz[3];
+	sz[0] = 32;
+	sz[1] = 32;
+	sz[2] = 32;
+
+	// Ghost
+	Ghost<3,long int> g(1);
+
+	periodicity<3> pr = {PERIODIC,PERIODIC,PERIODIC};
+
+	// Distributed grid with id decomposition
+	grid_dist_id<3, float, aggregate<float,float>> g_dist(sz,domain,g,pr);
+
+	auto it = g_dist.getDomainIterator();
+
+	while (it.isNext())
+	{
+		auto p = it.get();
+		auto gkey = it.getGKey(p);
+
+		g_dist.template getProp<0>(p) = gkey.get(0) + gkey.get(1) + gkey.get(2);
+		g_dist.template getProp<1>(p) = 3.0*gkey.get(0) + gkey.get(1) + gkey.get(2);
+
+		++it;
+	}
+
+	sgrid_dist_id_gpu<3,float,aggregate<float,float>> sgdist(g_dist.getDecomposition(),sz,g);
+
+	auto it2 = g_dist.getDomainIterator();
+
+	while (it2.isNext())
+	{
+		auto p = it2.get();
+
+		sgdist.template insertFlush<0>(p) = g_dist.template get<0>(p);
+
+		++it2;
+	}
+
+
+	bool check = true;
+
+	auto it3 = g_dist.getDomainIterator();
+
+	while (it3.isNext())
+	{
+		auto p = it3.get();
+
+		check &= sgdist.template getProp<0>(p) == g_dist.template get<0>(p);
+
+		++it3;
+	}
+
+	BOOST_REQUIRE_EQUAL(check,true);
+}
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/Vector/cuda/vector_dist_cuda_funcs.cuh b/src/Vector/cuda/vector_dist_cuda_funcs.cuh
index 78c4c9c5c72a508c9d2c6716643dc714c3e97e5a..eab71639cc73957b6eba5adcc66d3ed8055b6a0d 100644
--- a/src/Vector/cuda/vector_dist_cuda_funcs.cuh
+++ b/src/Vector/cuda/vector_dist_cuda_funcs.cuh
@@ -419,6 +419,17 @@ void remove_marked(vector_type & vd, const int n = 1024)
 
 	// we reuse memory. this give us the possibility to avoid allocation and make the remove faster
 
+	// Reference counter must be set to zero
+
+/*	for (int i = 0 ; i < MAX_NUMER_OF_PROPERTIES ; i++)
+	{
+		for (int j = 0 ; j < exp_tmp2[i].ref() ; j++)
+		{exp_tmp2[i].decRef();}
+	}
+
+	for (int j = 0 ; j < exp_tmp.ref() ; j++)
+	{exp_tmp.decRef();}*/
+
 	vd_pos_new.setMemory(exp_tmp);
 	vd_prp_new.setMemoryArray((CudaMemory *)&exp_tmp2);
 
@@ -436,6 +447,10 @@ void remove_marked(vector_type & vd, const int n = 1024)
 
 	vd.getPosVector().swap_nomode(vd_pos_new);
 	vd.getPropVector().swap_nomode(vd_prp_new);
+
+	// Reset the v_pos_new and vd_prp_new memory reference counters to one
+
+	vd.setReferenceCounterToOne();
 }
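setReferenceCounterToOne() is needed because remove_marked reuses exp_tmp / exp_tmp2 as backing memory and then swaps the new vectors in with swap_nomode, which leaves the shared buffers with a reference count above one. The idea, as a minimal sketch in terms of the ref()/decRef() calls used here (mem stands for any of the reused memory objects, a placeholder name):

	// drop the extra references so the reused buffer is owned exactly once
	while (mem.ref() > 1)
	{mem.decRef();}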
 
 template<unsigned int prp, typename functor, typename particles_type, typename out_type>
diff --git a/src/Vector/vector_dist.hpp b/src/Vector/vector_dist.hpp
index 41b58d3797553927fca2ca10e71fe8b4f3d47497..cdf5e3aca5e11afe4af787b3cbe8d8325ce5af47 100644
--- a/src/Vector/vector_dist.hpp
+++ b/src/Vector/vector_dist.hpp
@@ -221,6 +221,37 @@ struct cell_list_selector<vector,comp_host>
 	}
 };
 
+template<typename vector_type>
+struct decrement_memory
+{
+	//! vector
+	vector_type & v;
+
+
+	/*! \brief constructor
+	 *
+	 *
+	 * \param v vector whose property buffers are decremented
+	 *
+	 */
+	inline decrement_memory(vector_type & v)
+	:v(v)
+	{};
+
+
+	/*!  \brief It decrements the memory reference counter of each property
+	 *
+	 * \param t each member
+	 *
+	 */
+	template<typename T>
+	inline void operator()(T& t) const
+	{
+		v.template getMemory<T::value>().decRef();
+	}
+};
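decrement_memory is meant to be driven by boost::mpl::for_each_ref over the property indices, so that a single pass decrements the memory counter of every property. A minimal usage sketch (v_prp is the property vector it is applied to in setReferenceCounterToOne() below):

	decrement_memory<decltype(v_prp)> m(v_prp);
	boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(v_prp)::value_type::max_prop>>(m);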
+
 /*! \brief Distributed vector
  *
  * This class represent a distributed vector, the distribution of the structure
@@ -624,6 +655,24 @@ public:
 #endif
 	}
 
+	/*! \brief Set the reference counter of the internal memory buffers to one
+	 *
+	 */
+	void setReferenceCounterToOne()
+	{
+		while (v_pos.template getMemory<0>().ref() > 1)
+		{
+			v_pos.template getMemory<0>().decRef();
+		}
+
+		while (v_prp.template getMemory<0>().ref() > 1)
+		{
+			decrement_memory<decltype(v_prp)> m(v_prp);
+
+			boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(v_prp)::value_type::max_prop>>(m);
+		}
+	}
+
 	/*! \brief remove all the elements
 	 *
 	 *
diff --git a/src/Vector/vector_dist_kernel.hpp b/src/Vector/vector_dist_kernel.hpp
index cc42a53e3632d9181c5d14df1b9288bc8aee64dd..b06a0a98c23f50c681a2f77788b01d56aa8535f4 100644
--- a/src/Vector/vector_dist_kernel.hpp
+++ b/src/Vector/vector_dist_kernel.hpp
@@ -12,7 +12,7 @@ constexpr unsigned int POS_PROP = (unsigned int)-1;
 
 #ifdef CUDA_GPU
 
-#define GET_PARTICLE(vd) blockDim.x*blockIdx.x + threadIdx.x; if (blockDim.x*blockIdx.x + threadIdx.x >= static_cast<int>(vd.size_local())) {return;};
+#define GET_PARTICLE(vd) blockDim.x*blockIdx.x + threadIdx.x; if (blockDim.x*blockIdx.x + threadIdx.x >= static_cast<unsigned int>(vd.size_local())) {return;};
 #define GET_PARTICLE_SORT(p,NN) if (blockDim.x*blockIdx.x + threadIdx.x >= NN.get_g_m()) {return;}\
 							  else{p = NN.getDomainSortIds().template get<0>(blockDim.x*blockIdx.x + threadIdx.x);}