diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6c069d5c1dc267c07e8c57e9e6b2403406c9793d..72ceab9a911a4bedf6fb9853784af799c1436b34 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,7 +5,7 @@ if (POLICY CMP0074)
         cmake_policy(SET CMP0074 OLD)
 endif ()
 
-list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake_modules/)
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake_modules/ /opt/rocm/hip/cmake)
 
 set(BOOST_INCLUDE ${Boost_INCLUDE_DIR} CACHE PATH "Include directory for BOOST")
 set(LIBHILBERT_ROOT CACHE PATH "LibHilbert root path")
@@ -14,13 +14,10 @@ set(SE_CLASS3 CACHE BOOL "Activate compilation with SE_CLASS3")
 set(ENABLE_GPU CACHE BOOL "Disable the GPU code independently that a cuda compiler is found")
 set(TEST_PERFORMANCE CACHE BOOL "Enable test performance")
 set(ALPAKA_ROOT CACHE PATH "Alpaka root path")
-set(CUDA_ON_CPU CACHE STRING "Make Cuda work on heap")
+set(HIP_ENABLE CACHE BOOL "Enable HIP compiler")
+set(AMD_ARCH_COMPILE "gfx900" CACHE STRING "AMD gpu architecture used to compile kernels")
 
-if (ENABLE_GPU)
-    set(CUDA_ON_CPU OFF)
-    enable_language(CUDA)
-    find_package(CUDA)
-endif()
+add_subdirectory (../openfpm_devices/ openfpm_devices)
 
 set (CMAKE_CXX_STANDARD 14)
 set (CMAKE_CUDA_STANDARD 14)
@@ -31,7 +28,8 @@ message("Searching Vc in ${Vc_DIR}")
 find_package(Boost 1.72.0 REQUIRED COMPONENTS unit_test_framework iostreams program_options system filesystem OPTIONAL_COMPONENTS fiber context)
 find_package(LibHilbert REQUIRED)
 find_package(Vc REQUIRED)
-
+find_package(HIP)
+find_package(OpenMP)
 
 ###### CONFIG.h FILE ######
 
@@ -47,6 +45,11 @@ if(TEST_PERFORMANCE)
         set(DEFINE_PERFORMANCE_TEST "#define PERFORMANCE_TEST")
 endif()
 
+if(HIP_FOUND)
+        set(DEFINE_HIP_GPU "#define HIP_GPU")
+        set(DEFINE_CUDIFY_USE_HIP "#define CUDIFY_USE_HIP")
+endif()
+
 if (Boost_FOUND)
         set(DEFINE_HAVE_BOOST "#define HAVE_BOOST")
         set(DEFINE_HAVE_BOOST_IOSTREAMS "#define HAVE_BOOST_IOSTREAMS")
@@ -73,6 +76,10 @@ if(CUDA_ON_CPU)
         set(DEFINE_CUDA_GPU "#define CUDA_GPU")
 endif()
 
+if(HIP_FOUND)
+	set(DEFINE_CUDA_GPU "#define CUDA_GPU")
+endif()
+
 if(LIBHILBERT_FOUND)
         set(DEFINE_HAVE_LIBHILBERT "#define HAVE_LIBHILBERT 1")
 else()
@@ -82,9 +89,20 @@ endif()
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/src/config/config_cmake.h.in ${CMAKE_CURRENT_SOURCE_DIR}/src/config/config.h)
 
 
-add_subdirectory (../openfpm_devices/ openfpm_devices)
-
 set(WARNING_SUPPRESSION_AND_OPTION_NVCC ${WARNING_SUPPRESSION_AND_OPTION_NVCC} PARENT_SCOPE)
 set(WARNING_SUPPRESSION_AND_OPTION_NVCC_TEXT ${WARNING_SUPPRESSION_AND_OPTION_NVCC_TEXT} PARENT_SCOPE)
 
 add_subdirectory (src)
+
+set(DEFINE_HAVE_BOOST ${DEFINE_HAVE_BOOST} PARENT_SCOPE)
+set(DEFINE_HAVE_BOOST_IOSTREAMS ${DEFINE_HAVE_BOOST_IOSTREAMS} PARENT_SCOPE)
+set(DEFINE_HAVE_BOOST_PROGRAM_OPTIONS ${DEFINE_HAVE_BOOST_PROGRAM_OPTIONS} PARENT_SCOPE)
+set(DEFINE_HAVE_BOOST_UNIT_TEST_FRAMEWORK ${DEFINE_HAVE_BOOST_UNIT_TEST_FRAMEWORK} PARENT_SCOPE)
+set(DEFINE_HAVE_BOOST_CONTEXT ${DEFINE_HAVE_BOOST_CONTEXT} PARENT_SCOPE)
+set(DEFINE_HAVE_BOOST_FIBER ${DEFINE_HAVE_BOOST_FIBER} PARENT_SCOPE)
+set(DEFINE_HAVE_OPENMP ${DEFINE_HAVE_OPENMP} PARENT_SCOPE)
+set(DEFINE_HAVE_ALPAKA ${DEFINE_HAVE_ALPAKA} PARENT_SCOPE)
+set(DEFINE_CUDA_GPU ${DEFINE_CUDA_GPU} PARENT_SCOPE)
+set(DEFINE_CUDIFY_BACKEND ${DEFINE_CUDIFY_BACKEND} PARENT_SCOPE)
+set(OPTIONAL_BOOST_LIBS ${OPTIONAL_BOOST_LIBS} PARENT_SCOPE)
+
diff --git a/build.sh b/build.sh
index 004afcdee2119b29e69f6ca8a4609c1355002ca2..6a9b7ec3c44ddf6eb63477a83fd0b8d842e8c7b0 100755
--- a/build.sh
+++ b/build.sh
@@ -7,7 +7,6 @@ hostname=$(hostname)
 type_compile=$3
 branch=$4
 
-
 echo "Build on: $hostname with $type_compile branch: $branch"
 
 if [ x"$hostname" == x"cifarm-centos-node.mpi-cbg.de"  ]; then
@@ -30,6 +29,8 @@ if [ ! -d $HOME/openfpm_dependencies/openfpm_data/VCDEVEL ]; then
         ./install_VCDEVEL.sh $HOME/openfpm_dependencies/openfpm_data/ 4
 fi
 
+rm -rf $HOME/openfpm_dependencies/openfpm_data/BOOST # force BOOST rebuild for the 1.75 upgrade — remove once CI caches are refreshed, or the check below never caches
+
 if [ ! -d $HOME/openfpm_dependencies/openfpm_data/BOOST ]; then
 	if [ x"$hostname" == x"cifarm-mac-node" ]; then
 		echo "Compiling for OSX"
@@ -40,6 +41,8 @@ if [ ! -d $HOME/openfpm_dependencies/openfpm_data/BOOST ]; then
 	fi
 fi
 
+./install_CMAKE_on_CI.sh $HOME/openfpm_dependencies/openfpm_data 4
+
 mkdir /tmp/openfpm_data_$type_compile
 mv * .[^.]* /tmp/openfpm_data_$type_compile
 mv /tmp/openfpm_data_$type_compile openfpm_data
@@ -56,7 +59,13 @@ cd "$workspace/openfpm_data"
 pre_command=""
 sh ./autogen.sh
 options="$options --disable-gpu "
-options="$options --with-vcdevel=$HOME/openfpm_dependencies/openfpm_data/VCDEVEL --with-boost=$HOME/openfpm_dependencies/openfpm_data/BOOST  --with-libhilbert=$HOME/openfpm_dependencies/openfpm_data/LIBHILBERT --enable-cuda_on_cpu"
+options="$options --with-vcdevel=$HOME/openfpm_dependencies/openfpm_data/VCDEVEL --with-boost=$HOME/openfpm_dependencies/openfpm_data/BOOST  --with-libhilbert=$HOME/openfpm_dependencies/openfpm_data/LIBHILBERT"
+
+if [ x"$hostname" == x"cifarm-mac-node" ]; then
+	options="$options --enable-cuda-on-cpu"
+else
+	options="$options --with-cuda-on-backend=OpenMP"
+fi
 
 if [ x"$3" == x"SE"  ]; then
   options="$options --enable-se-class1 --enable-se-class2 --enable-se-class3 --with-action-on-error=throw --enable-test-coverage"
diff --git a/configure b/configure
index 878cf86f1a54d6824e530ffe53afc8a1882ac1c2..29474677fd8291e1521bd90ee5887a112929ccfe 100755
--- a/configure
+++ b/configure
@@ -100,6 +100,7 @@ enable_debug
 with_metis
 with_hdf5
 with_libhilbert
+with_cuda_on_backend
 enable_cuda_on_cpu
 enable_scan_coverty
 enable_test_performance
@@ -123,6 +124,8 @@ with_eigen
 with_vcdevel
 enable_gpu
 enable_asan
+enable_garbageinj
+enable_garbageinjv
 '
 
 rm -rf build
@@ -240,7 +243,7 @@ do
        conf_options="$conf_options -DSCAN_COVERTY=ON"
        ;;
     cuda_on_cpu)
-       conf_options="$conf_options -DCUDA_ON_CPU=ON"
+       conf_options="$conf_options -DCUDA_ON_BACKEND=SEQUENTIAL"
        ;;
     test_performance)
        conf_options="$conf_options -DTEST_PERFORMANCE=ON"
@@ -261,6 +264,12 @@ do
     asan)
        conf_options="$conf_options -DENABLE_ASAN=ON"
        ;;
+    garbageinj)
+       conf_options="$conf_options -DENABLE_GARBAGE_INJECTOR=ON"
+       ;;
+    garbageinjv)
+       conf_options="$conf_options -DENABLE_VCLUSTER_GARBAGE_INJECTOR=ON"
+       ;;
     *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--enable-$ac_useropt_orig"
        ac_unrecognized_sep=', '
        ;;
@@ -466,6 +475,9 @@ do
     ac_useropt_orig=$ac_useropt
     ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
     case $ac_useropt in
+      cuda_on_backend)
+      conf_options="$conf_options -DCUDA_ON_BACKEND=$ac_optarg"
+      ;;
       libhilbert)
       conf_options="$conf_options -DLIBHILBERT_ROOT=$ac_optarg"
       ;;
@@ -578,6 +590,7 @@ cd build
 
 ## remove enerything
 echo "Calling cmake ../. $conf_options"
+printf '%s\n' "cmake ../. $conf_options" > cmake_build_options
 rm ../error_code
 DYLD_LIBRARY_PATH=$ld_lib_pathopt cmake ../. $conf_options
 if [ $? != 0 ]; then
@@ -606,6 +619,7 @@ clean:
 
 install:
 	\$(MAKE) -C build \$@
+	script/install_parallel_debugger
 
 pdata:
 	\$(MAKE) -C build \$@
diff --git a/discover_os b/discover_os
new file mode 100755
index 0000000000000000000000000000000000000000..0a5742ed9bba0f52c0fd1d39226159b793e61762
--- /dev/null
+++ b/discover_os
@@ -0,0 +1,39 @@
+#! /bin/bash
+
+function discover_os() {
+platform=unknown
+arch=$(uname -m)
+
+  if [[ "$OSTYPE" == "linux-gnu" ]]; then
+        echo -e "We are on\033[1;34m LINUX \033[0m, with architecture \033[1;34m$arch\033[0m"
+        platform=linux
+  elif [[ "$OSTYPE" == "linux" ]]; then
+        echo -e "We are on\033[1;34m LINUX \033[0m, with architecture \033[1;34m$arch\033[0m"
+        platform=linux
+  elif [[ "$OSTYPE" == "darwin"* ]]; then
+        echo -e "We are on\033[1;34m MAC OSX \033[0m, with architecture \033[1;34m$arch\033[0m"
+        platform=osx
+  elif [[ "$OSTYPE" == "cygwin" ]]; then
+        echo -e "We are on\033[1;34m CYGWIN \033[0m, with architecture \033[1;34m$arch\033[0m"
+	platform=cygwin
+  elif [[ "$OSTYPE" == "msys" ]]; then
+        echo -e "We are on\033[1;34m Microsoft Window \033[0m, with architecture \033[1;34m$arch\033[0m"
+        echo "This platform is not supported"
+        exit 1
+  elif [[ "$OSTYPE" == "win32" ]]; then
+        echo -e "We are on\033[1;34m Microsoft Window \033[0m, with architecture \033[1;34m$arch\033[0m"
+        echo "This platform is not supported"
+        exit 1
+  elif [[ "$OSTYPE" == "freebsd"* ]]; then
+        echo -e "We are on\033[1;34m FREEBSD \033[0m, with architecture \033[1;34m$arch\033[0m"
+        echo "This platform is not supported"
+        exit 1
+  else
+        echo -e "We are on an\033[1;34m unknown OS \033[0m, with architecture \033[1;34m$arch\033[0m"
+        echo "This platform is not supported"
+        exit 1
+  fi
+
+
+}
+
diff --git a/install_BOOST.sh b/install_BOOST.sh
index 723da2c4458ce197211398b6687db9e0bad50675..d080b347e8710c4caa5098d820ec6b15796827d0 100755
--- a/install_BOOST.sh
+++ b/install_BOOST.sh
@@ -1,5 +1,8 @@
 #!/bin/bash 
 
+source discover_os
+discover_os
+
 # check if the directory $1/BOOST exist
 
 if [ -d "$1/BOOST" ]; then
@@ -7,9 +10,10 @@ if [ -d "$1/BOOST" ]; then
   exit 0
 fi
 
-wget http://ppmcore.mpi-cbg.de/upload/boost_1_72_0.tar.bz2
-tar -xvf boost_1_72_0.tar.bz2
-cd boost_1_72_0
+rm -f boost_1_75_0.tar.bz2
+wget http://ppmcore.mpi-cbg.de/upload/boost_1_75_0.tar.bz2
+tar -xvf boost_1_75_0.tar.bz2
+cd boost_1_75_0
 if [ x"$4" != x"" ]; then
 	if [ -f $HOME/user-config.jam ]; then
 		mv $HOME/user-config.jam $HOME/user-config.jam_bck
@@ -22,11 +26,25 @@ if [ x"$4" != x"" ]; then
 fi
 ./bootstrap.sh --with-toolset=$3
 mkdir $1/BOOST
-./b2 -j $2 install --prefix=$1/BOOST
-rm -rf boost_1_72_0
+# Several flavours
+if [ x"$platform" == x"osx" ]; then
+    if [ x"$arch" == x"arm64" ]; then
+        if [ x"$3" == x"" ]; then
+            ./b2 -j $2 install --prefix=$1/BOOST address-model=64 architecture=arm abi=aapcs binary-format=mach-o toolset=clang
+        else
+            ./b2 -j $2 install --prefix=$1/BOOST address-model=64 architecture=arm abi=aapcs binary-format=mach-o toolset=$3
+        fi
+    else
+        ./b2 -j $2 install --prefix=$1/BOOST address-model=64 architecture=x86 abi=sysv binary-format=mach-o toolset=clang # NOTE(review): unlike the arm64 branch, $3 toolset is ignored here — confirm intended
+    fi
+else
+    ./b2 -j $2 install --prefix=$1/BOOST
+fi
+
+rm -rf boost_1_75_0
 
 if [ -f $HOME/user-config.jam_bck ]; then
 	mv $HOME/user-config.jam_bck $HOME/user-config.jam
 fi
-rm -rf boost_1_72_0.tar.bz2
+rm -rf boost_1_75_0.tar.bz2
 
diff --git a/install_CMAKE_on_CI.sh b/install_CMAKE_on_CI.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d7492ad4088d8737727f255b2b042c06f6edd4a1
--- /dev/null
+++ b/install_CMAKE_on_CI.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+if [ -d "$1/CMAKE" ]; then
+   exit 0
+fi
+
+wget https://github.com/Kitware/CMake/releases/download/v3.20.3/cmake-3.20.3.tar.gz
+tar -xvf cmake-3.20.3.tar.gz
+cd cmake-3.20.3
+
+./bootstrap --prefix=$1/CMAKE
+make -j $2
+make install
+
+
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 622e5cf14eab8af3008f1f3f5d3edabd3a5c50d7..4db6568e8a3fb87539b6a66c26e8dd9e28f45bca 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -21,7 +21,7 @@ if (TEST_PERFORMANCE)
                          Vector/performance/vector_performance_test.cu)
 endif ()
 
-if (CUDA_FOUND OR CUDA_ON_CPU)
+if (NOT CUDA_ON_BACKEND STREQUAL "None")
 	set(CUDA_SOURCES ${CUDA_SOURCES}
 	    Vector/map_vector_sparse_unit_tests.cu
             Vector/vector_gpu_unit_tests.cu
@@ -45,31 +45,55 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
 	add_definitions("-DBOOST_MPL_CFG_HAS_TYPEOF")
 endif()
 
-if (CUDA_ON_CPU)
-        add_definitions(-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000 )
-	#        add_definitions(-D_GLIBCXX_PARALLEL)
+if (CUDA_ON_BACKEND STREQUAL "OpenMP" OR CUDA_ON_BACKEND STREQUAL "SEQUENTIAL")
+        set_source_files_properties(${CUDA_SOURCES} PROPERTIES COMPILE_FLAGS "-D__NVCC__ -DCUDART_VERSION=11000")
         set_source_files_properties(${CUDA_SOURCES} PROPERTIES LANGUAGE CXX)
-	set_source_files_properties(isolation.cu PROPERTIES LANGUAGE CXX)
-       	if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
-        	add_definitions("-x c++")
-        endif()
-        if (CUDA_ON_CPU STREQUAL "OpenMP")
-                add_definitions(-DCUDIFY_USE_OPENMP)
+        if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
+                add_definitions("-x c++")
         endif()
 endif()
 
-add_executable(mem_map ${CUDA_SOURCES}
-        main.cpp
-        data_type/aggregate_unit_tests.cpp
-        util/multi_array_openfpm/multi_array_ref_openfpm_unit_test.cpp
-        memory_ly/memory_conf_unit_tests.cpp
-        Space/tests/SpaceBox_unit_tests.cpp
-        Space/Shape/Sphere_unit_test.cpp
+if ( HIP_ENABLE AND HIP_FOUND )
+
+	list(APPEND HIP_HIPCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
+
+	if (CMAKE_BUILD_TYPE STREQUAL "Debug")
+		list(APPEND HIP_HIPCC_FLAGS -O0)
+	endif()
+
+	list(APPEND HIP_HIPCC_FLAGS -D__NVCC__ -D__HIP__  -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0)
+	set_source_files_properties(${CUDA_SOURCES} PROPERTIES LANGUAGE CXX)
+
+	set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+
+        hip_add_executable(mem_map ${CUDA_SOURCES}
+                main.cpp
+                data_type/aggregate_unit_tests.cpp
+                util/multi_array_openfpm/multi_array_ref_openfpm_unit_test.cpp
+                memory_ly/memory_conf_unit_tests.cpp
+                Space/tests/SpaceBox_unit_tests.cpp
+                Space/Shape/Sphere_unit_test.cpp
+                SparseGrid/SparseGrid_unit_tests.cpp
+                SparseGrid/SparseGrid_chunk_copy_unit_tests.cpp
+                Grid/copy_grid_unit_test.cpp NN/Mem_type/Mem_type_unit_tests.cpp
+                Grid/Geometry/tests/grid_smb_tests.cpp)
+
+else()
+
+	add_executable(mem_map ${CUDA_SOURCES}
+        	main.cpp
+        	data_type/aggregate_unit_tests.cpp
+        	util/multi_array_openfpm/multi_array_ref_openfpm_unit_test.cpp
+        	memory_ly/memory_conf_unit_tests.cpp
+        	Space/tests/SpaceBox_unit_tests.cpp
+        	Space/Shape/Sphere_unit_test.cpp
 		SparseGrid/SparseGrid_unit_tests.cpp
 		SparseGrid/SparseGrid_chunk_copy_unit_tests.cpp
-        Grid/copy_grid_unit_test.cpp NN/Mem_type/Mem_type_unit_tests.cpp
+        	Grid/copy_grid_unit_test.cpp NN/Mem_type/Mem_type_unit_tests.cpp
 		Grid/Geometry/tests/grid_smb_tests.cpp)
 
+endif()
+
 set_property(TARGET mem_map PROPERTY CUDA_ARCHITECTURES 60 75)
 
 if (CUDA_FOUND)
@@ -84,6 +108,11 @@ endif()
 
 add_dependencies(mem_map ofpmmemory)
 
+if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+	target_compile_options(mem_map PRIVATE "-Wno-undefined-var-template")
+	target_compile_options(mem_map PRIVATE "-Wno-macro-redefined")
+endif()
+
 if (CMAKE_COMPILER_IS_GNUCC)
     target_compile_options(mem_map PRIVATE "-Wno-deprecated-declarations")
     target_compile_options(mem_map PRIVATE $<$<COMPILE_LANGUAGE:CXX>: -mavx>)
@@ -157,6 +186,7 @@ target_link_libraries(mem_map ${Boost_LIBRARIES})
 target_link_libraries(mem_map -L${LIBHILBERT_LIBRARY_DIRS} ${LIBHILBERT_LIBRARIES})
 target_link_libraries(mem_map ofpmmemory)
 target_link_libraries(mem_map ${Vc_LIBRARIES})
+target_link_libraries(mem_map $<$<BOOL:${OpenMP_CXX_FOUND}>:OpenMP::OpenMP_CXX>)
 
 if (CUDA_FOUND)
 	target_link_libraries(isolation ${Boost_LIBRARIES})
@@ -405,12 +435,13 @@ install(FILES util/multi_array_openfpm/array_openfpm.hpp
 	COMPONENT OpenFPM)
 
 
-install(FILES util/cuda/scan_cuda.cuh
-        util/cuda/ofp_context.hxx
+install(FILES util/cuda/ofp_context.hxx
         util/cuda/kernels.cuh
         util/cuda/scan_ofp.cuh
 	util/cuda/sort_ofp.cuh
 	util/cuda/reduce_ofp.cuh
+	util/cuda/segreduce_ofp.cuh
+	util/cuda/merge_ofp.cuh
         DESTINATION openfpm_data/include/util/cuda
 	COMPONENT OpenFPM)
 
diff --git a/src/Grid/cuda/cuda_grid_gpu_funcs.cuh b/src/Grid/cuda/cuda_grid_gpu_funcs.cuh
index f28b58bda50106d1e5c2dd3596a142fefd67f104..b86d80d061b44504a5cc683d618bd2716b986c26 100644
--- a/src/Grid/cuda/cuda_grid_gpu_funcs.cuh
+++ b/src/Grid/cuda/cuda_grid_gpu_funcs.cuh
@@ -8,8 +8,11 @@
 #ifndef CUDA_GRID_GPU_FUNCS_CUH_
 #define CUDA_GRID_GPU_FUNCS_CUH_
 
+#include "config.h"
+#include "util/cuda_launch.hpp"
 #include "map_grid_cuda_ker.cuh"
 
+
 #if defined(CUDA_GPU) && defined(__NVCC__)
 
 template<unsigned int dim, typename grid_type>
@@ -102,7 +105,7 @@ struct grid_toKernelImpl
 		g.get_data_().disable_manage_memory();
 		g.get_data_().mem = gc.get_internal_data_().mem;
 		// Increment the reference of mem
-		g.get_data_().mem->incRef();
+		//g.get_data_().mem->incRef(); // NOTE(review): incRef disabled — confirm disable_manage_memory() covers ownership and no underflow occurs on the matching decRef/release path
 		g.get_data_().mem_r.bind_ref(gc.get_internal_data_().mem_r);
 		g.get_data_().switchToDevicePtr();
 
diff --git a/src/Grid/cuda/cuda_grid_gpu_tests.cu b/src/Grid/cuda/cuda_grid_gpu_tests.cu
index d26103e01083762fc1eb377aac9c886df228bda4..bb0bb4af8e2a0cad0e04c1287d44b38785f9df65 100644
--- a/src/Grid/cuda/cuda_grid_gpu_tests.cu
+++ b/src/Grid/cuda/cuda_grid_gpu_tests.cu
@@ -32,6 +32,18 @@ BOOST_AUTO_TEST_CASE (gpu_computation_func)
 
 	auto gcf = c3.getGPUIterator(k1,k2);
 
+#ifdef __HIP__
+
+	BOOST_REQUIRE_EQUAL(gcf.thr.x,8ul);
+	BOOST_REQUIRE_EQUAL(gcf.thr.y,8ul);
+	BOOST_REQUIRE_EQUAL(gcf.thr.z,4ul);
+
+	BOOST_REQUIRE_EQUAL(gcf.wthr.x,8ul);
+	BOOST_REQUIRE_EQUAL(gcf.wthr.y,8ul);
+	BOOST_REQUIRE_EQUAL(gcf.wthr.z,16ul);
+
+#else
+
 	BOOST_REQUIRE_EQUAL(gcf.thr.x,16ul);
 	BOOST_REQUIRE_EQUAL(gcf.thr.y,8ul);
 	BOOST_REQUIRE_EQUAL(gcf.thr.z,8ul);
@@ -40,12 +52,26 @@ BOOST_AUTO_TEST_CASE (gpu_computation_func)
 	BOOST_REQUIRE_EQUAL(gcf.wthr.y,8ul);
 	BOOST_REQUIRE_EQUAL(gcf.wthr.z,8ul);
 
+#endif
+
 	grid_key_dx<3> k3({50,50,50});
 	grid_key_dx<3> k4({62,62,62});
 	grid_key_dx<3> k5({60,61,62});
 
 	auto gcf2 = c3.getGPUIterator(k3,k4);
 
+#ifdef __HIP__
+
+	BOOST_REQUIRE_EQUAL(gcf2.thr.x,8ul);
+	BOOST_REQUIRE_EQUAL(gcf2.thr.y,8ul);
+	BOOST_REQUIRE_EQUAL(gcf2.thr.z,4ul);
+
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.x,2ul);
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.y,2ul);
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.z,4ul);
+
+#else
+
 	BOOST_REQUIRE_EQUAL(gcf2.thr.x,13ul);
 	BOOST_REQUIRE_EQUAL(gcf2.thr.y,8ul);
 	BOOST_REQUIRE_EQUAL(gcf2.thr.z,8ul);
@@ -54,6 +80,8 @@ BOOST_AUTO_TEST_CASE (gpu_computation_func)
 	BOOST_REQUIRE_EQUAL(gcf2.wthr.y,2ul);
 	BOOST_REQUIRE_EQUAL(gcf2.wthr.z,2ul);
 
+#endif
+
 	gcf2 = c3.getGPUIterator(k3,k4,511);
 
 	BOOST_REQUIRE_EQUAL(gcf2.thr.x,8ul);
diff --git a/src/Grid/cuda/map_grid_cuda_ker.cuh b/src/Grid/cuda/map_grid_cuda_ker.cuh
index b638288614dbdf13fb3cdb6e84215133edc781f0..442507b12d7366da5ab03d77c7f027a3e125243d 100644
--- a/src/Grid/cuda/map_grid_cuda_ker.cuh
+++ b/src/Grid/cuda/map_grid_cuda_ker.cuh
@@ -303,7 +303,8 @@ public:
 	 * \return an encap_c that is the representation of the object (careful is not the object)
 	 *
 	 */
-	__device__ inline encapc<dim,T_,layout> get_o(const grid_key_dx<dim> & v1)
+	template<typename Tk>
+	__device__ inline encapc<dim,T_,layout> get_o(const grid_key_dx<dim,Tk> & v1)
 	{
 #ifdef SE_CLASS1
 		if (check_bound(v1) == false)
@@ -323,7 +324,8 @@ public:
 	 * \return an encap_c that is the representation of the object (careful is not the object)
 	 *
 	 */
-	__device__ inline const encapc<dim,T_,layout> get_o(const grid_key_dx<dim> & v1) const
+	template<typename Tk>
+	__device__ inline const encapc<dim,T_,layout> get_o(const grid_key_dx<dim,Tk> & v1) const
 	{
 #ifdef SE_CLASS1
 		if (check_bound(v1) == false)
@@ -424,7 +426,7 @@ public:
 	 * \param stop end point
 	 *
 	 */
-	struct ite_gpu<dim> getGPUIterator(grid_key_dx<dim> & key1, grid_key_dx<dim> & key2, size_t n_thr = 1024) const
+	struct ite_gpu<dim> getGPUIterator(grid_key_dx<dim> & key1, grid_key_dx<dim> & key2, size_t n_thr = default_kernel_wg_threads_) const
 	{
 		return getGPUIterator_impl<dim>(g1,key1,key2,n_thr);
 	}
diff --git a/src/Grid/grid_base_implementation.hpp b/src/Grid/grid_base_implementation.hpp
index 9ff639d69398fdc6012962817e54e42937294ee7..889b713237bc217fdd994c4d132ca22262c1c7f1 100644
--- a/src/Grid/grid_base_implementation.hpp
+++ b/src/Grid/grid_base_implementation.hpp
@@ -70,6 +70,37 @@ struct skip_init<true,T>
 
 #ifdef __NVCC__
 
+template<bool active>
+struct copy_ndim_grid_device_active_impl
+	{
+	template<typename grid_type1, typename grid_type2, typename ite_gpu_type>
+	static inline void copy(grid_type1 & g1, grid_type2 & g2, ite_gpu_type & ite)
+	{
+
+	}
+
+	template<typename grid_type1, typename grid_type2, typename ite_gpu_type>
+	static inline void copy_block(grid_type1 & g1, grid_type2 & g2, ite_gpu_type & ite)
+	{
+	}
+};
+
+template<>
+struct copy_ndim_grid_device_active_impl<true>
+{
+	template<typename grid_type1, typename grid_type2, typename ite_gpu_type>
+	static inline void copy(grid_type1 & g1, grid_type2 & g2, ite_gpu_type & ite)
+	{
+		CUDA_LAUNCH((copy_ndim_grid_device<grid_type1::dims,decltype(g1.toKernel())>),ite,g2.toKernel(),g1.toKernel());
+	}
+
+	template<typename grid_type1, typename grid_type2, typename ite_gpu_type>
+	static inline void copy_block(grid_type1 & g1, grid_type2 & g2, ite_gpu_type & ite)
+	{
+		CUDA_LAUNCH((copy_ndim_grid_block_device<grid_type1::dims,decltype(g1.toKernel())>),ite,g2.toKernel(),g1.toKernel());
+	}
+};
+
 template<unsigned int dim, typename ids_type = int>
 struct grid_p
 {
@@ -177,27 +208,6 @@ struct grid_p<1,ids_type>
 
 #endif
 
-template<unsigned int dim>
-bool has_work_gpu(ite_gpu<dim> & ite)
-{
-	size_t tot_work = 1;
-
-	if (dim == 1)
-	{tot_work *= ite.wthr.x * ite.thr.x;}
-	else if(dim == 2)
-	{
-		tot_work *= ite.wthr.x * ite.thr.x;
-		tot_work *= ite.wthr.y * ite.thr.y;
-	}
-	else
-	{
-		tot_work *= ite.wthr.x * ite.thr.x;
-		tot_work *= ite.wthr.y * ite.thr.y;
-		tot_work *= ite.wthr.z * ite.thr.z;
-	}
-
-	return tot_work != 0;
-}
 
 template<unsigned int dim>
 void move_work_to_blocks(ite_gpu<dim> & ite)
@@ -459,6 +469,9 @@ private:
 	{
 #if defined(CUDA_GPU) && defined(__NVCC__)
 
+			// Compile-time check that it makes sense to call a GPU kernel to copy.
+			
+
 			grid_key_dx<dim> start;
 			grid_key_dx<dim> stop;
 
@@ -488,7 +501,7 @@ private:
 				{
 					if (blockSize == 1)
 					{
-						CUDA_LAUNCH((copy_ndim_grid_device<dim,decltype(grid_new.toKernel())>),ite,this->toKernel(),grid_new.toKernel());
+						copy_ndim_grid_device_active_impl<S::isDeviceHostSame() == false>::copy(grid_new,*this,ite);
 					}
 					else
 					{
@@ -496,7 +509,7 @@ private:
 
 						ite.thr.x = blockSize;
 
-						CUDA_LAUNCH((copy_ndim_grid_block_device<dim,decltype(grid_new.toKernel())>),ite,this->toKernel(),grid_new.toKernel());
+						copy_ndim_grid_device_active_impl<S::isDeviceHostSame() == false>::copy_block(grid_new,*this,ite);
 					}
 				}
 			}
@@ -514,7 +527,7 @@ private:
 
 				auto ite = getGPUIterator_impl<1>(g_sm_copy,start,stop);
 
-				CUDA_LAUNCH((copy_ndim_grid_device<dim,decltype(grid_new.toKernel())>),ite,this->toKernel(),grid_new.toKernel());
+				copy_ndim_grid_device_active_impl<S::isDeviceHostSame() == false>::copy(grid_new,*this,ite);
 			}
 #else
 
@@ -730,7 +743,7 @@ public:
 	 * \param stop end point
 	 *
 	 */
-	struct ite_gpu<dim> getGPUIterator(grid_key_dx<dim,long int> & key1, grid_key_dx<dim,long int> & key2, size_t n_thr = 1024) const
+	struct ite_gpu<dim> getGPUIterator(grid_key_dx<dim,long int> & key1, grid_key_dx<dim,long int> & key2, size_t n_thr = default_kernel_wg_threads_) const
 	{
 		return getGPUIterator_impl<dim>(g1,key1,key2,n_thr);
 	}
@@ -875,6 +888,24 @@ public:
 		return layout_base<T>::template get<p>(data_,g1,v1);
 	}
 
+	/*! \brief No blocks here, it return 1
+     * 
+	 * \return 1
+	 */
+	int getBlockEdgeSize()
+	{
+		return 1;
+	}
+
+	/*! \brief No blocks here, it does nothing
+     *
+	 *  \param nb unused
+	 *  \param nt unused
+	 * 
+	 */
+	void setGPUInsertBuffer(unsigned int nb, unsigned int nt)
+	{}
+
 	/*! \brief Get the const reference of the selected element
 	 *
 	 * \param v1 grid_key that identify the element in the grid
diff --git a/src/Grid/grid_key.hpp b/src/Grid/grid_key.hpp
index 5dd194abb53ee42534304721bf48698164d75137..2365757ba6f4b85a2ebcaab9835dab98e58431ce 100644
--- a/src/Grid/grid_key.hpp
+++ b/src/Grid/grid_key.hpp
@@ -441,7 +441,7 @@ public:
 	 * \return a string
 	 *
 	 */
-	std::string to_string()
+	std::string to_string() const
 	{
 		return this->toPointS().toString();
 	}
diff --git a/src/Grid/grid_sm.hpp b/src/Grid/grid_sm.hpp
index ed1e1d9ea776d2e2d4b3c95cf1fb606d0c043bc5..a0588a177acc4edaedaa77916bb1f39c39ae0737 100755
--- a/src/Grid/grid_sm.hpp
+++ b/src/Grid/grid_sm.hpp
@@ -121,11 +121,33 @@ struct ite_gpu
 #endif
 };
 
+template<unsigned int dim>
+bool has_work_gpu(ite_gpu<dim> & ite)
+{
+	size_t tot_work = 1;
+
+	if (dim == 1)
+	{tot_work *= ite.wthr.x * ite.thr.x;}
+	else if(dim == 2)
+	{
+		tot_work *= ite.wthr.x * ite.thr.x;
+		tot_work *= ite.wthr.y * ite.thr.y;
+	}
+	else
+	{
+		tot_work *= ite.wthr.x * ite.thr.x;
+		tot_work *= ite.wthr.y * ite.thr.y;
+		tot_work *= ite.wthr.z * ite.thr.z;
+	}
+
+	return tot_work != 0;
+}
+
 //! Declaration grid_sm
 template<unsigned int N, typename T> class grid_sm;
 
-template<unsigned int dim, typename T2, typename T>
-ite_gpu<dim> getGPUIterator_impl(const grid_sm<dim,T2> & g1, const grid_key_dx<dim,T> & key1, const grid_key_dx<dim,T> & key2, size_t n_thr = 1024);
+template<unsigned int dim, typename grid_sm_type, typename T>
+ite_gpu<dim> getGPUIterator_impl(const grid_sm_type & g1, const grid_key_dx<dim,T> & key1, const grid_key_dx<dim,T> & key2, size_t n_thr = default_kernel_wg_threads_);
 
 //! Declaration print_warning_on_adjustment
 template <unsigned int dim, typename linearizer> class print_warning_on_adjustment;
@@ -733,7 +755,7 @@ public:
 		return grid_key_dx_iterator_sub<N>(*this,start,stop);
 	}
 
-#ifdef CUDA_GPU
+#if defined(CUDA_GPU)
 
 	/*! \brief Get an iterator for the GPU
 	 *
@@ -742,7 +764,7 @@ public:
 	 *
 	 */
 	template<typename T2>
-	struct ite_gpu<N> getGPUIterator(const grid_key_dx<N,T2> & key1, const grid_key_dx<N,T2> & key2, size_t n_thr = 1024) const
+	struct ite_gpu<N> getGPUIterator(const grid_key_dx<N,T2> & key1, const grid_key_dx<N,T2> & key2, size_t n_thr = default_kernel_wg_threads_) const
 	{
 		return getGPUIterator_impl<N>(*this,key1,key2,n_thr);
 	}
@@ -753,7 +775,7 @@ public:
 	 * \param stop end point
 	 *
 	 */
-	struct ite_gpu<N> getGPUIterator(size_t n_thr = 1024) const
+	struct ite_gpu<N> getGPUIterator(size_t n_thr = default_kernel_wg_threads_) const
 	{
 		grid_key_dx<N> k1;
 		grid_key_dx<N> k2;
@@ -812,8 +834,8 @@ public:
 };
 
 
-template<unsigned int dim, typename T2, typename T>
-ite_gpu<dim> getGPUIterator_impl(const grid_sm<dim,T2> & g1, const grid_key_dx<dim,T> & key1, const grid_key_dx<dim,T> & key2, const size_t n_thr)
+template<unsigned int dim, typename grid_sm_type, typename T>
+ite_gpu<dim> getGPUIterator_impl(const grid_sm_type & g1, const grid_key_dx<dim,T> & key1, const grid_key_dx<dim,T> & key2, const size_t n_thr)
 {
 	size_t tot_work = 1;
 	for (size_t i = 0 ; i < dim ; i++)
diff --git a/src/Grid/map_grid.hpp b/src/Grid/map_grid.hpp
index e6965bae03acbbba006cfbde63c92b5e731a2289..b2fa40e037e74cc1b10b5ce948ada6d6cf50a06f 100755
--- a/src/Grid/map_grid.hpp
+++ b/src/Grid/map_grid.hpp
@@ -59,6 +59,15 @@
 typedef HeapMemory CudaMemory;
 #endif
 
+/*! \brief get the type of the SetBlock
+ *
+ *
+ */
+template<typename SGridGpu>
+struct GetSetBlockType
+{
+	typedef typename SGridGpu::device_grid_type::container type;
+};
 
 /*! Stub grid class
  *
@@ -185,12 +194,28 @@ public:
 				rem_copy_opt opt = rem_copy_opt::NONE_OPT)
 	{}
 
+#if defined(__HIP__)
+
+	/*! \brief It copy a grid
+	 *
+	 * \param g grid to copy
+	 *
+	 */
+	__device__ grid_base<dim,T,S> & operator=(const grid_base<dim,T,S> & g)
+	{
+		printf("Error grid_base operator= is not defined in device code\n");
+
+		return *this;
+	}
+
+#endif
+
 	/*! \brief It copy a grid
 	 *
 	 * \param g grid to copy
 	 *
 	 */
-	grid_base<dim,T,S> & operator=(const grid_base<dim,T,S> & g)
+	__host__ grid_base<dim,T,S> & operator=(const grid_base<dim,T,S> & g)
 	{
 		(static_cast<grid_base_impl<dim,T,S, memory_traits_lin> *>(this))->swap(g.duplicate());
 
diff --git a/src/Grid/performance/grid_performance_tests.hpp b/src/Grid/performance/grid_performance_tests.hpp
index e2878f28bfbc18683ae6a64d9af0e109e494dcb5..e3762c181afa51fe85af4e9b507afaad24df05c8 100644
--- a/src/Grid/performance/grid_performance_tests.hpp
+++ b/src/Grid/performance/grid_performance_tests.hpp
@@ -220,7 +220,8 @@ BOOST_AUTO_TEST_CASE(grid_performance_write_report)
 
 	StandardXMLPerformanceGraph("grid_performance_funcs.xml",file_xml_ref,cg);
 
-	addUpdtateTime(cg,1);
+	addUpdateTime(cg,1,"data","grid_performance_funcs");
+	createCommitFile("data");
 
 	cg.write("grid_performance_funcs.html");
 }
diff --git a/src/NN/CellList/CellListIterator_test.hpp b/src/NN/CellList/CellListIterator_test.hpp
index 5e8266d0a98968f8f80ea02f5da1f264ffd19c7b..e20740ee174e8de948343a8d30c9b2ccc95d4f4e 100644
--- a/src/NN/CellList/CellListIterator_test.hpp
+++ b/src/NN/CellList/CellListIterator_test.hpp
@@ -12,6 +12,11 @@
 #include "NN/CellList/ParticleIt_Cells.hpp"
 #include "NN/CellList/ParticleItCRS_Cells.hpp"
 
+#ifdef OPENFPM_PDATA
+#include "VCluster/VCluster.hpp"
+#endif
+
+
 /*! \brief Fill the cell-list with particles in the box 0.0,1.0
  *
  * \param k Number of particles
@@ -96,6 +101,18 @@ BOOST_AUTO_TEST_CASE( celllist_lin_and_iterator_test )
 
 BOOST_AUTO_TEST_CASE( celllist_hilb_and_iterator_test )
 {
+#ifdef OPENFPM_PDATA
+
+	auto & v_cl = create_vcluster();
+
+	std::string c2 = std::string("openfpm_data/test_data/NN_hilb_keys");
+
+#else
+
+	std::string c2 = std::string("test_data/NN_hilb_keys");
+
+#endif
+
 	///////// INPUT DATA //////////
 
 	const size_t dim = 3;
@@ -145,7 +162,7 @@ BOOST_AUTO_TEST_CASE( celllist_hilb_and_iterator_test )
 
 	openfpm::vector<size_t> keys_old;
 
-	keys_old.load("test_data/NN_hilb_keys");
+	keys_old.load(c2);
 
 	for (size_t i = 0; i < keys_old.size(); i++)
 	{
diff --git a/src/NN/CellList/CellList_gpu_test.cu b/src/NN/CellList/CellList_gpu_test.cu
index 5be0523cdee78b88d2bcb249db293b2f8ff2bad2..55ad9a0517b87ecfb372da57c8b7febdac72470e 100644
--- a/src/NN/CellList/CellList_gpu_test.cu
+++ b/src/NN/CellList/CellList_gpu_test.cu
@@ -16,7 +16,6 @@
 #include "CellList.hpp"
 #include "util/boost/boost_array_openfpm.hpp"
 #include  "Point_test.hpp"
-#include "util/cuda/scan_cuda.cuh"
 #include "util/cuda_util.hpp"
 
 BOOST_AUTO_TEST_SUITE( CellList_gpu_test )
@@ -1363,6 +1362,7 @@ void Test_cell_gpu_force(SpaceBox<dim,T> & box, size_t npart, const size_t (& di
 
 	mgpu::ofp_context_t context(mgpu::gpu_context_opt::no_print_props);
 	cl2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m);
+
 	auto & s_t_ns = cl2.getSortToNonSort();
 
 	pl.template hostToDevice<0>();
@@ -1371,6 +1371,7 @@ void Test_cell_gpu_force(SpaceBox<dim,T> & box, size_t npart, const size_t (& di
 
 	// Domain particles
 
+	
 	auto & gdsi = cl2.getDomainSortIds();
 	gdsi.template deviceToHost<0>();
 	s_t_ns.template deviceToHost<0>();
@@ -1497,6 +1498,7 @@ void Test_cell_gpu_force(SpaceBox<dim,T> & box, size_t npart, const size_t (& di
 	}
 
 	BOOST_REQUIRE_EQUAL(check,true);
+
 	}
 }
 
diff --git a/src/NN/CellList/CellList_test.hpp b/src/NN/CellList/CellList_test.hpp
index 1336823e542b51224b64ac7662070711aa3c754f..e93bb76f1cc342c05be9be9c5ceec312f320af0e 100644
--- a/src/NN/CellList/CellList_test.hpp
+++ b/src/NN/CellList/CellList_test.hpp
@@ -395,7 +395,7 @@ template<unsigned int dim, typename T, typename CellS> void Test_NN_iterator_rad
 
 		for (size_t i = 0 ; i < dim ; i++)
 		{
-			vrp.template get<0>(j)[i] = ((float)rand() / RAND_MAX)*(box.getHigh(i) - box.getLow(i)) + box.getLow(i);
+			vrp.template get<0>(j)[i] = ((float)rand() / (float)RAND_MAX)*(box.getHigh(i) - box.getLow(i)) + box.getLow(i);
 		}
 	}
 
diff --git a/src/Packer_Unpacker/Packer.hpp b/src/Packer_Unpacker/Packer.hpp
index dde5c85d1fd351f9d95173223488e7c636f26b42..f7d305066e8c82276409805c92771e7d0a5fb303 100644
--- a/src/Packer_Unpacker/Packer.hpp
+++ b/src/Packer_Unpacker/Packer.hpp
@@ -8,19 +8,21 @@
 #ifndef SRC_PACKER_HPP_
 #define SRC_PACKER_HPP_
 
+#include "util/cudify/cudify.hpp"
+
 #include "util/object_util.hpp"
 //#include "Grid/util.hpp"
 #include "Vector/util.hpp"
 #include "memory/ExtPreAlloc.hpp"
 #include "util/util_debug.hpp"
 
-
 #include "Grid/grid_sm.hpp"
 #include "util/Pack_stat.hpp"
 #include "Pack_selector.hpp"
 #include "has_pack_encap.hpp"
 #include "Packer_util.hpp"
 
+
 template <typename> struct Debug;
 
 /*! \brief Packing class
@@ -458,6 +460,33 @@ public:
 
 ///////////////////////////////////
 
+template<typename Timpl>
+struct copy_packer_chunk_arr_impl
+{
+	template<typename T, typename e_src, typename e_dst>
+	static inline void call(const e_src & src, size_t sub_id, e_dst & dst)
+	{
+		// Remove the reference from the type to copy
+		typedef typename boost::remove_reference<decltype(dst.template get< T::value >())>::type copy_rtype;
+
+		meta_copy<copy_rtype>::meta_copy_(src.template get< T::value >()[sub_id],dst.template get< T::value >());
+	}
+};
+
+template<unsigned int N1, typename Timpl>
+struct copy_packer_chunk_arr_impl<Timpl[N1]>
+{
+	template<typename T, typename e_src, typename e_dst>
+	static inline void call(const e_src & src, size_t sub_id, e_dst & dst)
+	{
+		// Remove the reference from the type to copy
+		typedef typename boost::remove_reference<decltype(dst.template get< T::value >()[0])>::type copy_rtype;
+
+		for (int i = 0 ; i < N1 ; i++)
+		{meta_copy<copy_rtype>::meta_copy_(src.template get< T::value >()[i][sub_id],dst.template get< T::value >()[i]);}
+	}
+};
+
 //! It copy one element of the chunk for each property
 template<typename e_src, typename e_dst>
 struct copy_packer_chunk
@@ -479,8 +508,7 @@ struct copy_packer_chunk
 	 */
 	inline copy_packer_chunk(const e_src & src, size_t sub_id, e_dst & dst)
 	:src(src),dst(dst),sub_id(sub_id)
-	{
-	};
+	{};
 
 
 
@@ -491,7 +519,9 @@ struct copy_packer_chunk
 		// Remove the reference from the type to copy
 		typedef typename boost::remove_reference<decltype(dst.template get< T::value >())>::type copy_rtype;
 
-		meta_copy<copy_rtype>::meta_copy_(src.template get< T::value >()[sub_id],dst.template get< T::value >());
+		// meta_copy<copy_rtype>::meta_copy_(src.template get< T::value >()[sub_id],dst.template get< T::value >());
+
+		copy_packer_chunk_arr_impl<copy_rtype>::template call<T>(src,sub_id,dst);
 	}
 };
 
diff --git a/src/Packer_Unpacker/Packer_util.hpp b/src/Packer_Unpacker/Packer_util.hpp
index 0f0502f2f867e69b70bbe0c401c8defd6e34aba0..7c5c57c2ae9e1996bcbacacaae4484d5bcf58be2 100644
--- a/src/Packer_Unpacker/Packer_util.hpp
+++ b/src/Packer_Unpacker/Packer_util.hpp
@@ -21,6 +21,7 @@ template<typename T, typename Mem, int pack_type=Pack_selector<T>::value > class
 
 #include "Vector/vector_def.hpp"
 
+
 ///////////////////////////////////////////////////////////////////////////////////
 //////////////////////////////// FUNCTORS FOR ENCAP ////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////
@@ -107,6 +108,27 @@ struct call_pack_enc_functor
 	}
 };
 
+template<typename Timp>
+struct call_pack_enc_functor_chunking_impl_arr
+{
+	template<typename T, typename Mem, typename encap>
+	static inline void call(ExtPreAlloc<Mem> & mem,const encap & obj, size_t sub_id, Pack_stat & sts)
+	{
+		Packer<typename Timp::value_type,Mem>::pack(mem,obj.template get<T::value>()[sub_id],sts);
+	}
+};
+
+template<unsigned int N1, typename Timp>
+struct call_pack_enc_functor_chunking_impl_arr<Timp[N1]>
+{
+	template<typename T, typename Mem, typename encap>
+	static inline void call(ExtPreAlloc<Mem> & mem,const encap & obj, size_t sub_id, Pack_stat & sts)
+	{
+		for (int i = 0 ; i < N1 ; i++)
+		{Packer<typename Timp::value_type,Mem>::pack(mem,obj.template get<T::value>()[i][sub_id],sts);}
+	}
+};
+
 //A functor for call_aggregatePack
 template<typename encap, typename Mem>
 struct call_pack_enc_functor_chunking
@@ -132,9 +154,11 @@ struct call_pack_enc_functor_chunking
 	template<typename T>
 	inline void operator()(T& t)
 	{
-		typedef typename boost::mpl::at<typename encap::type,T>::type::value_type obj_type;
+//		typedef typename boost::mpl::at<typename encap::type,T>::type::value_type obj_type;
+
+		call_pack_enc_functor_chunking_impl_arr<typename boost::mpl::at<typename encap::type,T>::type>::template call<T>(mem,obj,sub_id,sts);
 
-		Packer<obj_type,Mem>::pack(mem,obj.template get<T::value>()[sub_id],sts);
+//		Packer<obj_type,Mem>::pack(mem,obj.template get<T::value>()[sub_id],sts);
 	}
 };
 
@@ -193,6 +217,33 @@ struct call_unpack_encap_functor
 	}
 };
 
+template<typename T>
+struct call_unpack_encap_functor_chunking_array_selector
+{
+	template<typename T_value, typename Mem, typename encap>
+	static inline void unpack(ExtPreAlloc<Mem> & mem, const encap & obj, unsigned int sub_id, Unpack_stat & ps)
+	{
+		typedef typename boost::mpl::at<typename encap::type,T_value>::type::value_type obj_type;
+
+		Unpacker<obj_type,Mem>::unpack(mem,obj.template get<T_value::value>()[sub_id],ps);
+	}
+};
+
+template<typename T, unsigned int N1>
+struct call_unpack_encap_functor_chunking_array_selector<T[N1]>
+{
+	template<typename T_value, typename Mem, typename encap>
+	static inline void unpack(ExtPreAlloc<Mem> & mem, const encap & obj, unsigned int sub_id, Unpack_stat & ps)
+	{
+		for (int i = 0 ; i < N1 ; i++)
+		{
+			typedef typename std::remove_all_extents< typename boost::mpl::at<typename encap::type,T_value>::type >::type::value_type obj_type;
+
+			Unpacker<obj_type,Mem>::unpack(mem,obj.template get<T_value::value>()[i][sub_id],ps);
+		}
+	}
+};
+
 //A functor for call_aggregateUnpack
 template<typename encap, typename Mem, int ... prp>
 struct call_unpack_encap_functor_chunking
@@ -212,9 +263,11 @@ struct call_unpack_encap_functor_chunking
 	template<typename T>
 	inline void operator()(T& t)
 	{
-		typedef typename boost::mpl::at<typename encap::type,T>::type::value_type obj_type;
+		typedef typename boost::mpl::at<typename encap::type,T>::type obj_type;
+
+//		Unpacker<obj_type,Mem>::unpack(mem,obj.template get<T::value>()[sub_id],ps);
 
-		Unpacker<obj_type,Mem>::unpack(mem,obj.template get<T::value>()[sub_id],ps);
+		call_unpack_encap_functor_chunking_array_selector<obj_type>::template unpack<T>(mem,obj,sub_id,ps);
 	}
 };
 
diff --git a/src/Point_test.hpp b/src/Point_test.hpp
index c41e0429ee1a321ab6f80acfa76324fbcc76906f..ede644fa7c19308f6103366d3c3cdcb796b7e1ec 100755
--- a/src/Point_test.hpp
+++ b/src/Point_test.hpp
@@ -335,7 +335,7 @@ public:
 	 * \return this
 	 *
 	 */
-	inline Point_test<T> operator= (const Point_test<T> & p)
+	__device__ __host__ inline Point_test<T> & operator= (const Point_test<T> & p)
 	{
 		boost::fusion::at_c<0>(data) = boost::fusion::at_c<0>(p.data);
 		boost::fusion::at_c<1>(data) = boost::fusion::at_c<1>(p.data);
diff --git a/src/Space/Ghost.hpp b/src/Space/Ghost.hpp
index 15c0af2e70ef26696b2ef25f7cd529a9bb8267c3..c5df897c0c8d33e2b72df4e3155b1cb72725539b 100644
--- a/src/Space/Ghost.hpp
+++ b/src/Space/Ghost.hpp
@@ -45,14 +45,14 @@ public:
 	 * \param g ghost
 	 *
 	 */
-	template <typename S> inline Ghost(const Ghost<dim,S> & g)
+/*	template <typename S> inline Ghost(const Ghost<dim,S> & g)
 	{
 		for (size_t i = 0 ; i < dim ; i++)
 		{
 			this->setLow(i,g.getLow(i));
 			this->setHigh(i,g.getHigh(i));
 		}
-	}
+	}*/
 
 	/*! \brief Constructor from initializer list
 	 *
diff --git a/src/Space/Shape/Point.hpp b/src/Space/Shape/Point.hpp
index c6a229a92ed3af91c004c17f8df2bb9ceec71562..6a6cc1df9be84e8debb71474ee6e9c061e5b0ad8 100644
--- a/src/Space/Shape/Point.hpp
+++ b/src/Space/Shape/Point.hpp
@@ -11,6 +11,11 @@
 #include "memory_ly/Encap.hpp"
 #include "Point_operators.hpp"
 
+template<typename T>
+struct native_encapsulated_type
+{
+	typedef T type;
+};
 
 /*! \brief This class implement the point shape in an N-dimensional space
  *
@@ -41,6 +46,7 @@ template<unsigned int dim ,typename T> class Point
 	//! Property id of the point
 	static const unsigned int x = 0;
 
+	typedef typename native_encapsulated_type<T[dim]>::type type_native;
 
 	/*! \brief Evaluate the expression and save the result on the point
 	 *
@@ -199,7 +205,7 @@ template<unsigned int dim ,typename T> class Point
 	 *
 	 */
 
-	__device__ __host__ inline T& operator[](size_t i)
+	__device__ __host__ inline T& operator[](unsigned int i)
 	{
 		return get(i);
 	}
@@ -212,7 +218,7 @@ template<unsigned int dim ,typename T> class Point
 	 *
 	 */
 
-	__device__ __host__ inline const T& operator[](size_t i) const
+	__device__ __host__ inline const T& operator[](unsigned int i) const
 	{
 		return get(i);
 	}
@@ -487,7 +493,8 @@ template<unsigned int dim ,typename T> class Point
 	 * \return itself
 	 *
 	 */
-	__device__ __host__ Point<dim,T> & operator=(const point_expression<T[dim]> & p_exp)
+	template<typename any>
+	__device__ __host__ Point<dim,T> & operator=(const point_expression<any> & p_exp)
 	{
 		p_exp.init();
 
@@ -702,4 +709,21 @@ template<typename T>
 struct is_Point<T, typename Void< typename T::yes_is_point>::type> : std::true_type
 {};
 
+/*! \brief like std::rank, but it also works for openfpm structures like Point, where it returns 1
+ *
+ * \tparam T structure to check
+ *
+ */
+template<typename T, bool is_point = is_Point<T>::value>
+struct rank_gen
+{
+	typedef boost::mpl::int_<std::rank<T>::value> type;
+};
+
+template<typename T>
+struct rank_gen<T,true>
+{
+	typedef boost::mpl::int_<1> type;
+};
+
 #endif
diff --git a/src/Space/Shape/Point_operators.hpp b/src/Space/Shape/Point_operators.hpp
index 402a8c954f95849f0c4546ae329cf187f4aebf66..2ac668869492cb90772cee5ea19f9ca415b3dd21 100644
--- a/src/Space/Shape/Point_operators.hpp
+++ b/src/Space/Shape/Point_operators.hpp
@@ -55,64 +55,99 @@ template<unsigned int dim ,typename T> class Point;
 #define POINT_NEARBYINT 39
 #define POINT_RINT 40
 #define POINT_SUB_UNI 41
+#define POINT_NORM_INF 42
+
 
 /////////////////// Best cast rules ////////////////////////
 
+template<bool cond, typename exp1, typename exp2>
+struct first_or_second_pt
+{
+    typedef typename exp2::coord_type coord_type;
+};
+
+template<typename exp1, typename exp2>
+struct first_or_second_pt<true,exp1,exp2>
+{
+    typedef typename exp1::coord_type coord_type;
+};
+
+template<typename T, typename Sfinae = void>
+struct has_coordtype: std::false_type {};
+
+/*! \brief has_coordtype check if a type has defined a member coord_type
+ *
+ * ### Example
+ *
+ * \snippet util_test.hpp Check has_data
+ *
+ * return true if T::coord_type is a valid type
+ *
+ */
+template<typename T>
+struct has_coordtype<T, typename Void<typename T::coord_type>::type> : std::true_type
+{};
+
 template<typename source1, typename source2>
-struct best_conv
+struct best_conv_impl
 {
 	typedef source1 type;
 };
 
-
 template<typename source2>
-struct best_conv<int,source2>
+struct best_conv_impl<int,source2>
 {
 	typedef source2 type;
 };
 
 template<typename source2>
-struct best_conv<long int,source2>
+struct best_conv_impl<long int,source2>
 {
 	typedef source2 type;
 };
 
 template<typename source2>
-struct best_conv<unsigned int,source2>
+struct best_conv_impl<unsigned int,source2>
 {
 	typedef source2 type;
 };
 
 template<typename source2>
-struct best_conv<unsigned long int,source2>
+struct best_conv_impl<unsigned long int,source2>
 {
 	typedef source2 type;
 };
 
 template<typename source1>
-struct best_conv<source1,int>
+struct best_conv_impl<source1,int>
 {
 	typedef source1 type;
 };
 
 template<typename source1>
-struct best_conv<source1,long int>
+struct best_conv_impl<source1,long int>
 {
 	typedef source1 type;
 };
 
 template<typename source1>
-struct best_conv<source1,unsigned int>
+struct best_conv_impl<source1,unsigned int>
 {
 	typedef source1 type;
 };
 
 template<typename source1>
-struct best_conv<source1,unsigned long int>
+struct best_conv_impl<source1,unsigned long int>
 {
 	typedef source1 type;
 };
 
+template<typename source1, typename source2>
+struct best_conv
+{
+	typedef typename best_conv_impl<typename std::remove_const<source1>::type,typename std::remove_const<source2>::type>::type type;
+};
+
 ///////////////////////////////////////////////////////////////
 
 constexpr unsigned int max_expr(unsigned int dim1, unsigned int dim2)
@@ -241,6 +276,8 @@ public:
 	//! this operation produce a vector as result of size dims
 	static const unsigned int nvals = 1;
 
+	typedef T coord_type;
+
 	/*! \brief constructor from a value
 	 *
 	 * \param d value
@@ -266,7 +303,19 @@ public:
 	 * \return It just return the velue set in the constructor
 	 *
 	 */
-	__device__ __host__  inline T value(const size_t k) const
+	__device__ __host__  inline const T & value(const int k) const
+	{
+		return d;
+	}
+
+	/*! \brief Evaluate the expression
+	 *
+	 * \param k coordinate to evaluate
+	 *
+	 * \return It just return the velue set in the constructor
+	 *
+	 */
+	__device__ __host__  inline T & value(const int k)
 	{
 		return d;
 	}
@@ -305,6 +354,9 @@ class point_expression_op<orig,exp1,exp2,POINT_SUM>
 
 public:
 
+    //! The type of the internal vector
+    typedef typename first_or_second_pt<has_coordtype<exp1>::value,exp1,exp2>::coord_type coord_type;
+
 	//! original type of the point expression
 	typedef orig orig_type;
 
@@ -365,6 +417,20 @@ public:
 		init();
 		return o1.value(0) + o2.value(0);
 	}
+
+    /*! \brief Get the component i
+     *
+     * \param i component
+     *
+     * \return the i-component
+     *
+     */
+    template<typename r_type=typename best_conv<typename std::remove_reference<decltype(o1.value(0))>::type,
+            typename std::remove_reference<decltype(o2.value(0))>::type>::type >
+    __device__ __host__  inline r_type operator[](size_t i)
+    {
+        return o1.value(i) + o2.value(i);
+    }
 };
 
 /*! \brief Subtraction operation
@@ -383,6 +449,9 @@ class point_expression_op<orig, exp1,exp2,POINT_SUB>
 
 public:
 
+    //! The type of the internal vector
+    typedef typename first_or_second_pt<has_coordtype<exp1>::value,exp1,exp2>::coord_type coord_type;
+
 	//! Original type
 	typedef orig orig_type;
 
@@ -443,6 +512,8 @@ public:
 		init();
 		return o1.value(0) - o2.value(0);
 	}
+
+
 };
 
 /*! \brief expression that subtract two points
@@ -478,6 +549,9 @@ public:
 	//! result dimensionality of this expression
 	static const unsigned int nvals = exp1::nvals;
 
+    //! The type of the internal vector
+    typedef typename first_or_second_pt<has_coordtype<exp1>::value,exp1,exp2>::coord_type coord_type;
+
 	/*! constructor from expression
 	 *
 	 * \param o1 expression1
@@ -537,6 +611,9 @@ class point_expression_op<orig,exp1,exp2,POINT_MUL_POINT>
 
 public:
 
+    //! The type of the internal vector
+    typedef typename first_or_second_pt<has_coordtype<exp1>::value,exp1,exp2>::coord_type coord_type;
+
 	//! base type of the expression
 	typedef orig orig_type;
 
@@ -617,6 +694,9 @@ class point_expression_op<orig,exp1,exp2,POINT_MUL>
 
 public:
 
+    //! The type of the internal vector
+    typedef typename first_or_second_pt<has_coordtype<exp1>::value,exp1,exp2>::coord_type coord_type;
+
 	//! origin type
 	typedef orig orig_type;
 
@@ -694,6 +774,10 @@ class point_expression_op<orig,exp1,exp2,POINT_DIV>
 
 public:
 
+
+    //! The type of the internal vector
+    typedef typename first_or_second_pt<has_coordtype<exp1>::value,exp1,exp2>::coord_type coord_type;
+
 	//! original type
 	typedef orig orig_type;
 
@@ -1769,6 +1853,9 @@ public:
 	//! this operation produce a vector as result of size dims
 	static const unsigned int nvals = dim;
 
+    //! The type of the internal vector
+    typedef T coord_type;
+
 	/*! \brief constructor from an array
 	 *
 	 * \param d array of dimension dim
@@ -1820,6 +1907,22 @@ public:
 		return *this;
 	}
 
+	/*! \brief Array operator
+	 *
+	 */
+	__device__ __host__ T & operator[](int n)
+	{
+		return d[n];
+	}
+
+	/*! \brief Array operator
+	 *
+	 */
+	__device__ __host__ const T & operator[](int n) const
+	{
+		return d[n];
+	}
+
 	/*! \brief This function must be called before value
 	 *
 	 * it calculate the scalar product before return the values
@@ -1868,6 +1971,9 @@ public:
 	//! this operation produce a vector as result of size dims
 	static const unsigned int nvals = dim;
 
+    //! The type of the internal vector
+    typedef T coord_type;
+
 	/*! \brief construct from an array of dimension dim
 	 *
 	 * \param d array
@@ -1887,6 +1993,14 @@ public:
 	{
 	}
 
+	/*! \brief Array operator
+	 *
+	 */
+	T operator[](int n) const
+	{
+		return d[n];
+	}
+
 	/*! \brief Evaluate the expression at coordinate k
 	 *
 	 * It just return the value set in the constructor
@@ -1902,7 +2016,7 @@ public:
 	}
 };
 
-/*! \brief Specialization for a const array of dimension dim
+/*! \brief Specialization for views
  *
  * \tparam T type of the array
  * \tparam dim dimensionality of the array
@@ -1912,7 +2026,7 @@ template<typename T, typename vmpl>
 class point_expression<openfpm::detail::multi_array::sub_array_openfpm<T,1,vmpl>>
 {
 	//! array view of dimension dim
-	const openfpm::detail::multi_array::const_sub_array_openfpm<T,1,vmpl,const T *> d;
+	mutable openfpm::detail::multi_array::sub_array_openfpm<T,1,vmpl> d;
 
 public:
 
@@ -1925,13 +2039,40 @@ public:
 	//! this operation produce a vector as result of size dims
 	static const unsigned int nvals = subar_dim<vmpl>::type::value;
 
+    //! The type of the internal vector
+    typedef T coord_type;
+
+	/*! \brief Operator= for point expression
+	 *
+	 * \tparam orig origin type
+	 * \tparam exp1 expression 1
+	 * \tparam exp2 expression 2
+	 * \tparam op operation
+	 *
+	 * \param point expression
+	 *
+	 * \return a point expression
+	 *
+	 */
+	template<typename orig, typename exp1, typename exp2, unsigned int op>
+	__device__ __host__  point_expression<openfpm::detail::multi_array::sub_array_openfpm<T,1,vmpl>> & 
+	operator=(const point_expression_op<orig,exp1,exp2,op> & p_exp)
+	{
+		p_exp.init();
+
+		for (size_t i = 0; i < nvals ; i++)
+		{d[i] = p_exp.value(i);}
+
+		return *this;
+	}
+
 	/*! \brief construct from an array of dimension dim
 	 *
 	 * \param d array
 	 *
 	 */
 	__device__ __host__ inline point_expression(const openfpm::detail::multi_array::sub_array_openfpm<T,1,vmpl> & d)
-	:d(d.origin(),d.strides())
+	:d(d.origin_mutable(),d.strides())
 	{
 	}
 
@@ -1944,6 +2085,26 @@ public:
 	{
 	}
 
+	/*! \brief Same as value()
+	 *
+	 * \param k coordinate
+	 * 
+	 */
+	__device__ __host__ inline T & operator[](int k)
+	{
+		return value(k);
+	}
+
+	/*! \brief Same as value()
+	 *
+	 * \param k coordinate
+	 * 
+	 */
+	__device__ __host__ inline T & operator[](int k) const
+	{
+		return value(k);
+	}
+
 	/*! \brief Evaluate the expression at coordinate k
 	 *
 	 * It just return the value set in the constructor
@@ -1953,7 +2114,21 @@ public:
 	 * \return the value
 	 *
 	 */
-	__device__ __host__ inline T value(const size_t k) const
+	__device__ __host__ inline T & value(const int k)
+	{
+		return d[k];
+	}
+
+	/*! \brief Evaluate the expression at coordinate k
+	 *
+	 * It just return the value set in the constructor
+	 *
+	 * \param k coordinate
+	 *
+	 * \return the value
+	 *
+	 */
+	__device__ __host__ inline T & value(const int k) const
 	{
 		return d[k];
 	}
diff --git a/src/Space/Shape/Point_operators_functions.hpp b/src/Space/Shape/Point_operators_functions.hpp
index 8b95323abbff38558194b7ad16a3c9e4716d27ae..5b84af14c1c92d97a73b57960b9a27e20abb3537 100644
--- a/src/Space/Shape/Point_operators_functions.hpp
+++ b/src/Space/Shape/Point_operators_functions.hpp
@@ -28,6 +28,9 @@ public:\
 	typedef typename orig::coord_type return_type;\
 \
 	static const unsigned int nvals = exp1::nvals;\
+	\
+	typedef typename first_or_second_pt<has_coordtype<exp1>::value,exp1,exp2>::coord_type coord_type;\
+	\
 \
 	__device__ __host__ inline explicit point_expression_op(const exp1 & o1)\
 	:o1(o1),scal(0)\
@@ -77,6 +80,84 @@ fun_name(const Point<dim,T> & va)\
 	return exp_sum;\
 }
 
+/*! \brief Point norm Infinity operation
+ *
+ * \tparam orig original type
+ * \tparam exp1 expression1
+ * \tparam exp2 expression2
+ * \tparam op operation
+ *
+ */
+template <typename orig, typename exp1, typename exp2>
+class point_expression_op<orig,exp1,exp2,POINT_NORM_INF>
+{
+    const exp1 o1;
+
+    //! Scalar value
+    mutable typename std::remove_const<typename orig::coord_type>::type scal;
+
+public:
+
+    //! Origin type
+    typedef orig orig_type;
+
+    //! indicate that this class encapsulate an expression
+    typedef int is_expression;
+
+    //! indicate that init must be called before value
+    typedef int has_init;
+
+    //! return type of the expression
+    typedef typename orig::coord_type return_type;
+
+    //! this operation produce a scalar as result
+    static const unsigned int nvals = 1;
+
+    typedef typename exp1::coord_type coord_type;
+
+    //! Constructor from expression
+    __device__ __host__ inline explicit point_expression_op(const exp1 & o1)
+            :o1(o1),scal(0.0)
+    {}
+
+    /*! \brief This function must be called before value
+     *
+     * it calculate the scalar product before return the values
+     *
+     */
+    __device__ __host__ inline void init() const
+    {
+        scal = 0.0;
+
+        for (size_t i = 0 ; i < orig::dims ; i++) {
+            if (fabs(o1.value(i)) > scal) {
+                scal=fabs(o1.value(i));
+            }
+        }
+            //scal += o1.value(i) * o1.value(i);
+
+        //scal = sqrt(scal);
+
+        o1.init();
+    }
+
+    /*! \brief Evaluate the expression
+     *
+     * \param key where to evaluate the expression
+     *
+     */
+    template<typename r_type=typename std::remove_reference<decltype(o1.value(0))>::type > __device__ __host__ inline r_type value(size_t k) const
+    {
+        return scal;
+    }
+
+    template <typename T>__device__ __host__  operator T() const
+    {
+        init();
+        return scal;
+    }
+};
+
 
 /*! \brief Point norm operation
  *
@@ -111,6 +192,8 @@ public:
 	//! this operation produce a scalar as result
 	static const unsigned int nvals = 1;
 
+    typedef typename exp1::coord_type coord_type;
+
 	//! Constructor from expression
 	__device__ __host__ inline explicit point_expression_op(const exp1 & o1)
 	:o1(o1),scal(0.0)
@@ -183,6 +266,8 @@ public:
 	//! this operation produce a vector as result of size dims
 	static const unsigned int nvals = 1;
 
+	typedef typename exp1::coord_type coord_type;
+
 	//! constructor from an expression
 	__device__ __host__  inline explicit point_expression_op(const exp1 & o1)
 	:o1(o1),scal(0.0)
@@ -421,6 +506,84 @@ template <typename T, typename P> __device__ __host__  auto distance(T exp1, P e
 }
 
 
+
+//////////////////////////////////////// normINF operation //////////////////
+
+/* \brief Calculate the infinity norm of a point expression
+ *
+ * \param va point expression one
+ *
+ * \return an object that encapsulates the expression
+ *
+ */
+template<typename orig,typename exp1, typename exp2, unsigned int op1>
+__device__ __host__ inline point_expression_op<orig,point_expression_op<orig,exp1,exp2,op1>,void,POINT_NORM_INF>
+norm_inf(const point_expression_op<orig,exp1,exp2,op1> & va)
+{
+    point_expression_op<orig,point_expression_op<orig,exp1,exp2,op1>,void,POINT_NORM_INF> exp_sum(va);
+
+    return exp_sum;
+}
+
+
+/* \brief Infinity norm of a scalar
+ *
+ * \param d scalar double or float
+ *
+ * \return the absolute value of d
+ *
+ */
+template <typename T>__device__ __host__ T norm_inf(T d)
+{
+    return abs(d);
+}
+
+
+/* \brief Calculate the infinity norm of an array-backed point expression
+ *
+ * \param d point expression over an array T[dim]
+ *
+ * \return an object that encapsulates the expression
+ *
+ */
+template<unsigned int dim, typename T>
+__device__ __host__ inline point_expression_op<Point<dim,T>,point_expression<T[dim]>,void,POINT_NORM_INF>
+norm_inf(const point_expression<T[dim]> & d)
+{
+    point_expression_op<Point<dim,T>,point_expression<T[dim]>,void,POINT_NORM_INF> exp_sum( (point_expression<T[dim]>(d)) );
+
+    return exp_sum;
+}
+
+
+/* \brief Calculate the infinity norm of a point
+ *
+ * \param va the point
+ *
+ * \return an object that encapsulates the expression
+ *
+ *
+ */
+template<unsigned int dim, typename T>
+__device__ __host__ inline point_expression_op<Point<dim,T>,Point<dim,T>,void,POINT_NORM_INF>
+norm_inf(const Point<dim,T> & va)
+{
+    point_expression_op<Point<dim,T>,Point<dim,T>,void,POINT_NORM_INF> exp_sum(va);
+
+    return exp_sum;
+}
+
+
+/*! \brief General infinity-norm distance formula
+ *
+ * \return norm_inf(exp1 - exp2)
+ */
+template <typename T, typename P> __device__ __host__  auto distance_inf(T exp1, P exp2) -> decltype(norm_inf(exp1 - exp2))
+{
+    return norm_inf(exp1 - exp2);
+}
+
+
 ///// Here we define point operator functions
 
 CREATE_ARG_FUNC(std::abs,abs,POINT_ABS)
diff --git a/src/Space/Shape/Point_unit_test.hpp b/src/Space/Shape/Point_unit_test.hpp
index d903a5354e0cd53963ac2a189f3e1ea56a17c38f..5e61e279a614c22d721eae81405a2b90c9533696 100644
--- a/src/Space/Shape/Point_unit_test.hpp
+++ b/src/Space/Shape/Point_unit_test.hpp
@@ -554,7 +554,7 @@ BOOST_AUTO_TEST_CASE( Point_expression_usage_with_conversion )
 	double tmp = 5 + (p1 * p2);
 	double check = 5 + p1.get(0)*p2.get(0) + p1.get(1)*p2.get(1) + p1.get(2)*p2.get(2);
 	BOOST_REQUIRE_EQUAL(tmp,check);
-	p3 = 5 + (p1 * p2);
+	p3 = 5.0 + (p1 * p2);
 	for (size_t i = 0 ; i < 3 ; i++)	{BOOST_REQUIRE_EQUAL(p3.get(i),check) ;}
 	tmp = 5 - (p1 * p2);
 	check = 5 - p1.get(0)*p2.get(0) - p1.get(1)*p2.get(1) - p1.get(2)*p2.get(2);
diff --git a/src/Space/SpaceBox.hpp b/src/Space/SpaceBox.hpp
index 0b4fdb8842c2f18009bb6f18697d13320d96edca..b595b3e26bda5f1c78d8ddd1a40b8700dbefebf1 100644
--- a/src/Space/SpaceBox.hpp
+++ b/src/Space/SpaceBox.hpp
@@ -211,7 +211,7 @@ class SpaceBox : public Box<dim,T>
 		Point<dim,T> p;
 
 		for (size_t i = 0 ; i < dim ; i++)
-			p.get(i) = ((T)rand())/RAND_MAX * (this->getHigh(i) - this->getLow(i)) + this->getLow(i);
+			p.get(i) = ((T)rand())/(T)RAND_MAX * (this->getHigh(i) - this->getLow(i)) + this->getLow(i);
 
 		return p;
 	}
diff --git a/src/SparseGrid/SparseGrid.hpp b/src/SparseGrid/SparseGrid.hpp
index 6bbcb4028d38a4f2b8a4d4c29fb21160c331c35b..f2d0beaf62816e406c73b9ef88e936f2ae0f2cf0 100644
--- a/src/SparseGrid/SparseGrid.hpp
+++ b/src/SparseGrid/SparseGrid.hpp
@@ -28,83 +28,8 @@
 #include "VTKWriter/VTKWriter.hpp"
 #endif
 
-template<typename chunk_def>
-struct sparse_grid_bck_value
-{
-	chunk_def bck;
-
-	sparse_grid_bck_value(chunk_def bck)
-	:bck(bck)
-	{}
-
-	template<unsigned int p>
-	auto get() -> decltype(bck.template get<p>()[0])
-	{
-		return bck.template get<p>()[0];
-	}
-
-	template<unsigned int p>
-	auto get() const -> decltype(bck.template get<p>()[0])
-	{
-		return bck.template get<p>()[0];
-	}
-};
-
-template<typename ArrTypeView>
-struct std_array_vector_view
-{
-	int pos;
-	ArrTypeView arr;
-
-	std_array_vector_view(int pos,ArrTypeView arr)
-	:pos(pos),arr(arr)
-	{}
-
-	decltype(arr[0][0]) operator[](int comp)
-	{
-		return arr[comp][pos];
-	}
-
-	decltype(std::declval<const ArrTypeView>()[0][0]) operator[](int comp) const
-	{
-		return arr[comp][pos];
-	}
-};
-
 
 
-template<typename T>
-struct get_selector
-{
-	template<unsigned int p, typename chunks_vector_type>
-	static T & get(chunks_vector_type & chunks, size_t active_cnk, int ele_id)
-	{
-		return chunks.template get<p>(active_cnk)[ele_id];
-	}
-
-	template<unsigned int p, typename chunks_vector_type>
-	static const T & get_const(chunks_vector_type & chunks, size_t active_cnk, int ele_id)
-	{
-		return chunks.template get<p>(active_cnk)[ele_id];
-	}
-};
-
-template<typename T, unsigned int N1>
-struct get_selector<T[N1]>
-{
-	template<unsigned int p, typename chunks_vector_type>
-	static std_array_vector_view<decltype(std::declval<chunks_vector_type>().template get<p>(0))> get(chunks_vector_type & chunks, size_t active_cnk, int ele_id)
-	{
-		return std_array_vector_view<decltype(chunks.template get<p>(active_cnk))>(ele_id,chunks.template get<p>(active_cnk));
-	}
-
-	template<unsigned int p, typename chunks_vector_type>
-	static const std_array_vector_view<decltype(std::declval<chunks_vector_type>().template get<p>(0))> get_const(chunks_vector_type & chunks, size_t active_cnk, int ele_id)
-	{
-		return std_array_vector_view<decltype(chunks.template get<p>(active_cnk))>(ele_id,chunks.template get<p>(active_cnk));
-	}
-};
-
 
 template<typename Tsrc,typename Tdst>
 class copy_bck
@@ -974,7 +899,11 @@ public:
 	void setBackgroundValue(const typename boost::mpl::at<typename T::type,boost::mpl::int_<p>>::type & val)
 	{
 		for (int i = 0 ; i < chunking::size::value ; i++)
-		{meta_copy<typename boost::mpl::at<typename T::type,boost::mpl::int_<p>>::type>::meta_copy_(val,chunks.get(0).template get<p>()[i]);}
+		{
+			meta_copy<typename boost::mpl::at<typename T::type,boost::mpl::int_<p>>::type>::
+			          meta_copy_(val,
+			                     get_selector<typename boost::mpl::at<typename T::type,boost::mpl::int_<p>>::type>::template get<p>(chunks,0,i));
+		}
 	}
 
 	/*! \brief Get the background value
@@ -2018,9 +1947,13 @@ public:
 
 			size_t pos_src_id = key_src_s.getPos();
 			size_t pos_dst_id;
-			auto block_src = grid_src.getBlock(key_src_s);
+
+			///////// WARNING: insert_o MUST be called before getBlock; otherwise, if grid_src == *this and insert_o causes a reallocation,
+			///////// block_src would be invalidated  //////
 			auto block_dst = this->insert_o(key_dst,pos_dst_id);
 
+			auto block_src = grid_src.getBlock(key_src_s);
+
 			copy_sparse_to_sparse_bb<dim,decltype(block_src),decltype(block_dst),T> caps(block_src,block_dst,pos_src_id,pos_dst_id);
 			boost::mpl::for_each_ref< boost::mpl::range_c<int,0,T::max_prop> >(caps);
 
diff --git a/src/SparseGrid/SparseGridUtil.hpp b/src/SparseGrid/SparseGridUtil.hpp
index 98841c1e2872485ed2b65e6a62af9b6b565c1a8c..f5da68ef3b7033d227f26b2947becee0076c36fc 100644
--- a/src/SparseGrid/SparseGridUtil.hpp
+++ b/src/SparseGrid/SparseGridUtil.hpp
@@ -608,4 +608,59 @@ struct sublin<8,chunk>
 };
 
 
+template<typename chunk_def>
+struct sparse_grid_bck_value
+{
+	chunk_def bck;
+
+	sparse_grid_bck_value(chunk_def bck)
+	:bck(bck)
+	{}
+
+	template<unsigned int p>
+	auto get() -> decltype(bck.template get<p>()[0])
+	{
+		return bck.template get<p>()[0];
+	}
+
+	template<unsigned int p>
+	auto get() const -> decltype(bck.template get<p>()[0])
+	{
+		return bck.template get<p>()[0];
+	}
+};
+
+
+template<typename T>
+struct get_selector
+{
+	template<unsigned int p, typename chunks_vector_type>
+	static T & get(chunks_vector_type & chunks, size_t active_cnk, int ele_id)
+	{
+		return chunks.template get<p>(active_cnk)[ele_id];
+	}
+
+	template<unsigned int p, typename chunks_vector_type>
+	static const T & get_const(chunks_vector_type & chunks, size_t active_cnk, int ele_id)
+	{
+		return chunks.template get<p>(active_cnk)[ele_id];
+	}
+};
+
+template<typename T, unsigned int N1>
+struct get_selector<T[N1]>
+{
+	template<unsigned int p, typename chunks_vector_type>
+	static std_array_vector_view<decltype(std::declval<chunks_vector_type>().template get<p>(0))> get(chunks_vector_type & chunks, size_t active_cnk, int ele_id)
+	{
+		return std_array_vector_view<decltype(chunks.template get<p>(active_cnk))>(ele_id,chunks.template get<p>(active_cnk));
+	}
+
+	template<unsigned int p, typename chunks_vector_type>
+	static const std_array_vector_view<decltype(std::declval<chunks_vector_type>().template get<p>(0))> get_const(chunks_vector_type & chunks, size_t active_cnk, int ele_id)
+	{
+		return std_array_vector_view<decltype(chunks.template get<p>(active_cnk))>(ele_id,chunks.template get<p>(active_cnk));
+	}
+};
+
 #endif /* OPENFPM_DATA_SRC_SPARSEGRID_SPARSEGRIDUTIL_HPP_ */
diff --git a/src/SparseGrid/SparseGrid_chunk_copy.hpp b/src/SparseGrid/SparseGrid_chunk_copy.hpp
index 41b8ff729dc227a245ec9dd00d0b0682d06007eb..8fe4d603a0af628f87275bc144c9995b0a8a9ad3 100644
--- a/src/SparseGrid/SparseGrid_chunk_copy.hpp
+++ b/src/SparseGrid/SparseGrid_chunk_copy.hpp
@@ -8,9 +8,11 @@
 #ifndef SPARSEGRID_CHUNK_COPY_HPP_
 #define SPARSEGRID_CHUNK_COPY_HPP_
 
-#if !defined(__NVCC__) || defined(CUDA_ON_CPU)
+#if !defined(__NVCC__) || defined(CUDA_ON_CPU) || defined(__HIP__)
+// nvcc cannot parse the Vc headers, so include them only for non-nvcc, CUDA_ON_CPU, or HIP builds
 #include <Vc/Vc>
 #endif
+
 #include "util/mathutil.hpp"
 
 
diff --git a/src/SparseGrid/SparseGrid_conv_opt.hpp b/src/SparseGrid/SparseGrid_conv_opt.hpp
index 7d50d97fb3d80c420ac00613a4bbcee064ebd7e8..c8e702950612359161e0b9e83207d283c723c8d8 100644
--- a/src/SparseGrid/SparseGrid_conv_opt.hpp
+++ b/src/SparseGrid/SparseGrid_conv_opt.hpp
@@ -110,7 +110,7 @@ struct conv_impl
 	}
 };
 
-#if !defined(__NVCC__) || defined(CUDA_ON_CPU)
+#if !defined(__NVCC__) || defined(CUDA_ON_CPU) || defined(__HIP__)
 
 
 template<unsigned int dir,int p, unsigned int prop_src1,typename chunk_type, typename vect_type, typename ids_type>
@@ -199,14 +199,16 @@ void load_crs_v(vect_type & cs1, chunk_type & chunk,  ids_type & ids)
 	}
 }
 
+
+template<typename prop_type>
 struct cross_stencil_v
 {
-	Vc::double_v xm;
-	Vc::double_v xp;
-	Vc::double_v ym;
-	Vc::double_v yp;
-	Vc::double_v zm;
-	Vc::double_v zp;
+	Vc::Vector<prop_type> xm;
+	Vc::Vector<prop_type> xp;
+	Vc::Vector<prop_type> ym;
+	Vc::Vector<prop_type> yp;
+	Vc::Vector<prop_type> zm;
+	Vc::Vector<prop_type> zp;
 };
 
 template<>
@@ -421,7 +423,7 @@ struct conv_impl<3>
 					for (int k = 0 ; k < sx::value ; k += Vc::Vector<prop_type>::Size)
 					{
 						// we do only id exist the point
-						if (*(int *)&mask.mask[s2] == 0) {s2 += Vc::Vector<prop_type>::Size; continue;}
+						if (*(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[s2] == 0) {s2 += Vc::Vector<prop_type>::Size; continue;}
 
 						data_il<Vc::Vector<prop_type>::Size> mxm;
 						data_il<Vc::Vector<prop_type>::Size> mxp;
@@ -430,7 +432,7 @@ struct conv_impl<3>
 						data_il<Vc::Vector<prop_type>::Size> mzm;
 						data_il<Vc::Vector<prop_type>::Size> mzp;
 
-						cross_stencil_v cs;
+						cross_stencil_v<prop_type> cs;
 
 						Vc::Vector<prop_type> cmd(&chunk.template get<prop_src>()[s2]);
 
@@ -451,37 +453,21 @@ struct conv_impl<3>
 						long int sumzp = (v == sz::value-1)?offset_jump[5] - (sz::value - 1)*sx::value*sy::value:sx::value*sy::value;
 						sumzp += s2;
 
-						if (Vc::Vector<prop_type>::Size == 2)
+						if (Vc::Vector<prop_type>::Size == 2 || Vc::Vector<prop_type>::Size == 4 || Vc::Vector<prop_type>::Size == 8)
 						{
-							mxm.i = *(short int *)&mask.mask[s2];
+							mxm.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[s2];
 							mxm.i = mxm.i << 8;
-							mxm.i |= (short int)mask.mask[sumxm];
+							mxm.i |= (typename data_il<Vc::Vector<prop_type>::Size>::type)mask.mask[sumxm];
 
-							mxp.i = *(short int *)&mask.mask[s2];
+							mxp.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[s2];
 							mxp.i = mxp.i >> 8;
-							mxp.i |= ((short int)mask.mask[sumxp]) << (Vc::Vector<prop_type>::Size - 1)*8;
+							mxp.i |= ((typename data_il<Vc::Vector<prop_type>::Size>::type)mask.mask[sumxp]) << (Vc::Vector<prop_type>::Size - 1)*8;
 
-							mym.i = *(short int *)&mask.mask[sumym];
-							myp.i = *(short int *)&mask.mask[sumyp];
+							mym.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[sumym];
+							myp.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[sumyp];
 
-							mzm.i = *(short int *)&mask.mask[sumzm];
-							mzp.i = *(short int *)&mask.mask[sumzp];
-						}
-						else if (Vc::Vector<prop_type>::Size == 4)
-						{
-							mxm.i = *(int *)&mask.mask[s2];
-							mxm.i = mxm.i << 8;
-							mxm.i |= (int)mask.mask[sumxm];
-
-							mxp.i = *(int *)&mask.mask[s2];
-							mxp.i = mxp.i >> 8;
-							mxp.i |= ((int)mask.mask[sumxp]) << (Vc::Vector<prop_type>::Size - 1)*8;
-
-							mym.i = *(int *)&mask.mask[sumym];
-							myp.i = *(int *)&mask.mask[sumyp];
-
-							mzm.i = *(int *)&mask.mask[sumzm];
-							mzp.i = *(int *)&mask.mask[sumzp];
+							mzm.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[sumzm];
+							mzp.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[sumzp];
 						}
 						else
 						{
@@ -737,7 +723,7 @@ struct conv_impl<3>
 					for (int k = 0 ; k < sx::value ; k += Vc::Vector<prop_type>::Size)
 					{
 						// we do only id exist the point
-						if (*(int *)&mask.mask[s2] == 0) {s2 += Vc::Vector<prop_type>::Size; continue;}
+						if (*(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[s2] == 0) {s2 += Vc::Vector<prop_type>::Size; continue;}
 
 						data_il<4> mxm;
 						data_il<4> mxp;
@@ -746,8 +732,8 @@ struct conv_impl<3>
 						data_il<4> mzm;
 						data_il<4> mzp;
 
-						cross_stencil_v cs1;
-						cross_stencil_v cs2;
+						cross_stencil_v<prop_type> cs1;
+						cross_stencil_v<prop_type> cs2;
 
 						Vc::Vector<prop_type> cmd1(&chunk.template get<prop_src1>()[s2]);
 						Vc::Vector<prop_type> cmd2(&chunk.template get<prop_src2>()[s2]);
@@ -769,41 +755,21 @@ struct conv_impl<3>
 						long int sumzp = (v == sz::value-1)?offset_jump[5] - (sz::value - 1)*sx::value*sy::value:sx::value*sy::value;
 						sumzp += s2;
 
-						if (Vc::Vector<prop_type>::Size == 2)
+						if (Vc::Vector<prop_type>::Size == 1 || Vc::Vector<prop_type>::Size == 2 || Vc::Vector<prop_type>::Size == 4 || Vc::Vector<prop_type>::Size == 8)
 						{
-							mxm.i = *(short int *)&mask.mask[s2];
+							mxm.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[s2];
 							mxm.i = mxm.i << 8;
-							mxm.i |= (short int)mask.mask[sumxm];
+							mxm.i |= (typename data_il<Vc::Vector<prop_type>::Size>::type)mask.mask[sumxm];
 
-							mxp.i = *(short int *)&mask.mask[s2];
+							mxp.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[s2];
 							mxp.i = mxp.i >> 8;
-							mxp.i |= ((short int)mask.mask[sumxp]) << (Vc::Vector<prop_type>::Size - 1)*8;
+							mxp.i |= ((typename data_il<Vc::Vector<prop_type>::Size>::type)mask.mask[sumxp]) << (Vc::Vector<prop_type>::Size - 1)*8;
 
-							mym.i = *(short int *)&mask.mask[sumym];
-							myp.i = *(short int *)&mask.mask[sumyp];
+							mym.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[sumym];
+							myp.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[sumyp];
 
-							mzm.i = *(short int *)&mask.mask[sumzm];
-							mzp.i = *(short int *)&mask.mask[sumzp];
-						}
-						else if (Vc::Vector<prop_type>::Size == 4)
-						{
-							mxm.i = *(int *)&mask.mask[s2];
-							mxm.i = mxm.i << 8;
-							mxm.i |= (int)mask.mask[sumxm];
-
-							mxp.i = *(int *)&mask.mask[s2];
-							mxp.i = mxp.i >> 8;
-							mxp.i |= ((int)mask.mask[sumxp]) << (Vc::Vector<prop_type>::Size - 1)*8;
-
-							mym.i = *(int *)&mask.mask[sumym];
-							myp.i = *(int *)&mask.mask[sumyp];
-
-							mzm.i = *(int *)&mask.mask[sumzm];
-							mzp.i = *(int *)&mask.mask[sumzp];
-						}
-						else
-						{
-							std::cout << __FILE__ << ":" << __LINE__ << " UNSUPPORTED" << std::endl;
+							mzm.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[sumzm];
+							mzp.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[sumzp];
 						}
 
 						cs1.xm = cmd1;
diff --git a/src/SparseGrid/SparseGrid_iterator_block.hpp b/src/SparseGrid/SparseGrid_iterator_block.hpp
index 639215a7f644670d7b2397be9a2c136e00ae42b2..9a37ee99ce68645e76102346f7d167a8541a7a93 100644
--- a/src/SparseGrid/SparseGrid_iterator_block.hpp
+++ b/src/SparseGrid/SparseGrid_iterator_block.hpp
@@ -8,6 +8,19 @@
 #ifndef SPARSEGRID_ITERATOR_BLOCK_HPP_
 #define SPARSEGRID_ITERATOR_BLOCK_HPP_
 
+#if !defined(__NVCC__) || defined(CUDA_ON_CPU) || defined(__HIP__)
+// nvcc cannot parse the Vc headers, so include them only for non-nvcc, CUDA_ON_CPU, or HIP builds
+#include <Vc/Vc>
+#define DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR (Vc::float_v::Size *sizeof(float) >= sizeof(T))
+
+#else
+
+#define DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR 1
+
+#endif
+
+
+
 #include "Grid/iterators/grid_skin_iterator.hpp"
 #include "SparseGrid_chunk_copy.hpp"
 
@@ -270,7 +283,7 @@ struct loadBlock_impl<prop,stencil_size,3,vector_blocks_exts,vector_ext>
 
 		auto & chunk = data.template get<prop>(chunk_id);
 
-		copy_xyz<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename vector_blocks_exts::type,false>::template copy<N1>(arr,mask,h,chunk);
+		copy_xyz<is_layout_inte<typename SparseGridType::memory_traits >::type::value && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR  ,prop,stencil_size,typename vector_blocks_exts::type,false>::template copy<N1>(arr,mask,h,chunk);
 	}
 
 
@@ -285,7 +298,7 @@ struct loadBlock_impl<prop,stencil_size,3,vector_blocks_exts,vector_ext>
 
 		auto & chunk = data.template get<prop>(chunk_id);
 
-		copy_xyz<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename vector_blocks_exts::type,false>::template store<N1>(arr,chunk);
+		copy_xyz<is_layout_inte<typename SparseGridType::memory_traits >::type::value  && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR ,prop,stencil_size,typename vector_blocks_exts::type,false>::template store<N1>(arr,chunk);
 
 	}
 
@@ -340,11 +353,11 @@ struct loadBlock_impl<prop,stencil_size,3,vector_blocks_exts,vector_ext>
 		if (exist == true)
 		{
 			auto & h = header_mask.get(r);
-			copy_xy_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template copy<0,stencil_size+sz2::value,N1>(arr,mask,h,data.get(r));
+			copy_xy_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value  && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR ,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template copy<0,stencil_size+sz2::value,N1>(arr,mask,h,data.get(r));
 		}
 		else
 		{
-			copy_xy_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template mask_null<stencil_size+sz2::value,N1>(mask);
+			copy_xy_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value  && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR ,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template mask_null<stencil_size+sz2::value,N1>(mask);
 		}
 		if (findNN == false)
 		{
@@ -360,11 +373,11 @@ struct loadBlock_impl<prop,stencil_size,3,vector_blocks_exts,vector_ext>
 		if (exist == true)
 		{
 			auto & h = header_mask.get(r);
-			copy_xy_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template copy<sz2::value - stencil_size,0,N1>(arr,mask,h,data.get(r));
+			copy_xy_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value  && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR ,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template copy<sz2::value - stencil_size,0,N1>(arr,mask,h,data.get(r));
 		}
 		else
 		{
-			copy_xy_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template mask_null<0,N1>(mask);
+			copy_xy_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value  && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR ,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template mask_null<0,N1>(mask);
 		}
 
 		if (findNN == false)
@@ -381,11 +394,11 @@ struct loadBlock_impl<prop,stencil_size,3,vector_blocks_exts,vector_ext>
 		if (exist == true)
 		{
 			auto & h = header_mask.get(r);
-			copy_xz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template copy<0,stencil_size+sz1::value,N1>(arr,mask,h,data.get(r));
+			copy_xz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value  && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR ,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template copy<0,stencil_size+sz1::value,N1>(arr,mask,h,data.get(r));
 		}
 		else
 		{
-			copy_xz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template mask_null<stencil_size+sz1::value,N1>(mask);
+			copy_xz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value  && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR ,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template mask_null<stencil_size+sz1::value,N1>(mask);
 		}
 		if (findNN == false)
 		{
@@ -401,11 +414,11 @@ struct loadBlock_impl<prop,stencil_size,3,vector_blocks_exts,vector_ext>
 		if (exist == true)
 		{
 			auto & h = header_mask.get(r);
-			copy_xz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template copy<sz1::value-stencil_size,0,N1>(arr,mask,h,data.get(r));
+			copy_xz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value  && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR ,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template copy<sz1::value-stencil_size,0,N1>(arr,mask,h,data.get(r));
 		}
 		else
 		{
-			copy_xz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template mask_null<0,N1>(mask);
+			copy_xz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value  && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR ,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template mask_null<0,N1>(mask);
 		}
 
 		if (findNN == false)
@@ -422,11 +435,11 @@ struct loadBlock_impl<prop,stencil_size,3,vector_blocks_exts,vector_ext>
 		if (exist == true)
 		{
 			auto & h = header_mask.get(r);
-			copy_yz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template copy<0,sz0::value+stencil_size,N1>(arr,mask,h,data.get(r));
+			copy_yz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value  && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR ,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template copy<0,sz0::value+stencil_size,N1>(arr,mask,h,data.get(r));
 		}
 		else
 		{
-			copy_yz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template mask_null<sz0::value+stencil_size,N1>(mask);
+			copy_yz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value  && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR ,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template mask_null<sz0::value+stencil_size,N1>(mask);
 		}
 		if (findNN == false)
 		{
@@ -442,11 +455,11 @@ struct loadBlock_impl<prop,stencil_size,3,vector_blocks_exts,vector_ext>
 		if (exist == true)
 		{
 			auto & h = header_mask.get(r);
-			copy_yz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template copy<sz0::value-stencil_size,0,N1>(arr,mask,h,data.get(r));
+			copy_yz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value  && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR ,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template copy<sz0::value-stencil_size,0,N1>(arr,mask,h,data.get(r));
 		}
 		else
 		{
-			copy_yz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template mask_null<0,N1>(mask);
+			copy_yz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value  && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR ,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template mask_null<0,N1>(mask);
 		}
 	}
 };
diff --git a/src/SparseGrid/SparseGrid_unit_tests.cpp b/src/SparseGrid/SparseGrid_unit_tests.cpp
index ba7026a91e7fc53cf7c3802cd3cb655a8c8efccd..428de8c109e2aee8cc0aec585348d2cccff3d925 100644
--- a/src/SparseGrid/SparseGrid_unit_tests.cpp
+++ b/src/SparseGrid/SparseGrid_unit_tests.cpp
@@ -1378,7 +1378,7 @@ BOOST_AUTO_TEST_CASE( sparse_grid_fast_stencil_vectorized_cross_simplified)
 
 	for (int i = 0 ; i < 1 ; i++)
 	{
-		grid.conv_cross<0,1,1>(start,stop,[]( Vc::double_v & cmd, cross_stencil_v & s,
+		grid.conv_cross<0,1,1>(start,stop,[]( Vc::double_v & cmd, cross_stencil_v<double> & s,
 				                              unsigned char * mask_sum){
 
 																	Vc::double_v Lap = s.xm + s.xp +
@@ -1456,6 +1456,108 @@ BOOST_AUTO_TEST_CASE( sparse_grid_fast_stencil_vectorized_cross_simplified)
 //	print_grid("debug_out",grid);
 }
 
+BOOST_AUTO_TEST_CASE( sparse_grid_fast_stencil_vectorized_cross_simplified_float)
+{
+	size_t sz[3] = {501,501,501};
+	size_t sz_cell[3] = {500,500,500};
+
+	sgrid_soa<3,aggregate<float,float,int>,HeapMemory> grid(sz);
+
+	grid.getBackgroundValue().template get<0>() = 0.0;
+
+	CellDecomposer_sm<3, float, shift<3,float>> cdsm;
+
+	Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
+
+	cdsm.setDimensions(domain, sz_cell, 0);
+
+	fill_sphere_quad(grid,cdsm);
+
+	//grid.reorder();
+
+	grid_key_dx<3> start({1,1,1});
+	grid_key_dx<3> stop({499,499,499});
+
+	for (int i = 0 ; i < 1 ; i++)
+	{
+		grid.conv_cross<0,1,1>(start,stop,[]( Vc::float_v & cmd, cross_stencil_v<float> & s,
+				                              unsigned char * mask_sum){
+
+																	Vc::float_v Lap = s.xm + s.xp +
+																					   s.ym + s.yp +
+																					   s.zm + s.zp - 6.0f*cmd;
+
+																	Vc::Mask<float> surround;
+
+																	for (int i = 0 ; i < Vc::float_v::Size ; i++)
+																	{surround[i] = (mask_sum[i] == 6);}
+
+																	Lap = Vc::iif(surround,Lap,Vc::float_v(1.0f));
+
+																	return Lap;
+		                                                         });
+	}
+
+	int tot_six = 0;
+	int tot_one = 0;
+
+	bool check = true;
+	auto it2 = grid.getIterator(start,stop);
+	while (it2.isNext())
+	{
+		auto p = it2.get();
+
+		check &= (grid.template get<1>(p) == 6 || grid.template get<1>(p) == 1);
+
+		// Verify that any point whose value is 6 really has all six neighbors present
+		auto xp = p.move(0,1);
+		auto xm = p.move(0,-1);
+
+		auto yp = p.move(1,1);
+		auto ym = p.move(1,-1);
+
+		auto zp = p.move(2,1);
+		auto zm = p.move(2,-1);
+
+		bool is_six;
+		if (grid.existPoint(xp) && grid.existPoint(xm) &&
+			grid.existPoint(yp) && grid.existPoint(ym) &&
+			grid.existPoint(zp) && grid.existPoint(zm))
+		{
+			is_six = true;
+		}
+		else
+		{
+			is_six = false;
+		}
+
+		if (is_six == true && grid.template get<1>(p) != 6.0)
+		{
+			check = false;
+			break;
+		}
+
+		if (grid.template get<1>(p) == 1)
+		{
+			tot_one++;
+		}
+
+		if (grid.template get<1>(p) == 6)
+		{
+			tot_six++;
+		}
+
+		++it2;
+	}
+
+	BOOST_REQUIRE_EQUAL(check,true);
+	BOOST_REQUIRE_EQUAL(tot_six,15857813);
+	BOOST_REQUIRE_EQUAL(tot_one,2977262);
+	// Check correctness
+
+//	print_grid("debug_out",grid);
+}
+
 constexpr int x = 0;
 constexpr int y = 1;
 constexpr int z = 2;
diff --git a/src/SparseGridGpu/BlockMapGpu.hpp b/src/SparseGridGpu/BlockMapGpu.hpp
index cc336f58d191177428256fe331d5535c10c053dd..92a7f651c7cba96c9631fadc954b56d14e0c7aac 100644
--- a/src/SparseGridGpu/BlockMapGpu.hpp
+++ b/src/SparseGridGpu/BlockMapGpu.hpp
@@ -7,17 +7,6 @@
 #include "DataBlock.cuh"
 #include <set>
 
-template<typename BlockT, typename T>
-struct AggregateAppend
-{
-};
-
-template<typename BlockT, typename ... list>
-struct AggregateAppend<BlockT, aggregate<list ...>>
-{
-    typedef aggregate<list..., BlockT> type;
-};
-
 template<typename AggregateT, unsigned int p>
 using BlockTypeOf = typename std::remove_reference<typename boost::fusion::result_of::at_c<typename AggregateT::type, p>::type>::type;
 
@@ -29,6 +18,8 @@ class BlockMapGpu
 {
 private:
     typedef BlockTypeOf<AggregateBlockT, 0> BlockT0;
+    
+    bool is_new;
 
 #ifdef SE_CLASS1
 
@@ -119,13 +110,19 @@ public:
      *
      */
     template<unsigned int p>
-    auto insertBlockFlush(size_t blockId) -> decltype(blockMap.insertFlush(blockId).template get<p>())
+    auto insertBlockFlush(size_t blockId) -> decltype(blockMap.insertFlush(blockId,is_new).template get<p>())
     {
         typedef BlockTypeOf<AggregateBlockT, p> BlockT;
 
-        auto aggregate = blockMap.insertFlush(blockId);
-        auto &block = aggregate.template get<p>();
-
+        auto aggregate = blockMap.insertFlush(blockId,is_new);
+		auto &block = aggregate.template get<p>();
+     
+	    if (is_new == true)
+	    {
+		    for (int i = 0 ; i < BlockT::size ; i++)
+		    {aggregate.template get<pMask>()[i] = 0;}
+	    }
+        
         return block;
     }
 
@@ -136,9 +133,18 @@ public:
      * \return a reference to the block data
      *
      */
-    auto insertBlockFlush(size_t blockId) -> decltype(blockMap.insertFlush(blockId))
+    auto insertBlockFlush(size_t blockId) -> decltype(blockMap.insertFlush(blockId,is_new))
     {
-        return blockMap.insertFlush(blockId);
+	    typedef BlockTypeOf<AggregateBlockT, 0> BlockT;
+    	auto b = blockMap.insertFlush(blockId,is_new);
+    	
+    	if (is_new == true)
+	    {
+    		for (int i = 0 ; i < BlockT::size ; i++)
+		    {b.template get<pMask>()[i] = 0;}
+	    }
+    	
+        return b;
     }
 
     BlockMapGpu_ker<AggregateInternalT, indexT, layout_base> toKernel()
diff --git a/src/SparseGridGpu/BlockMapGpu_ker.cuh b/src/SparseGridGpu/BlockMapGpu_ker.cuh
index 1ef24069d83e0c97d27aa7c304cf36539552213c..b99de92fedc4c4386019d4db105624feb05e4ce2 100644
--- a/src/SparseGridGpu/BlockMapGpu_ker.cuh
+++ b/src/SparseGridGpu/BlockMapGpu_ker.cuh
@@ -103,18 +103,70 @@ public:
             : blockMap(blockMap) {};
 
     template<unsigned int p>
-    inline __device__ auto get(unsigned int linId) const -> ScalarTypeOf<AggregateBlockT, p>;
+    inline __device__ auto get(unsigned int linId) const -> ScalarTypeOf<AggregateBlockT, p>
+    {
+        #ifdef __NVCC__
+            typedef BlockTypeOf<AggregateBlockT, p> BlockT;
+            unsigned int blockId = linId / BlockT::size;
+            unsigned int offset = linId % BlockT::size;
+            return get<p>(blockId, offset);
+        #else // __NVCC__
+            std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
+        #endif // __NVCC__
+    }
+
 
     template<unsigned int p>
-    inline __device__ auto get(unsigned int blockId, unsigned int offset) const -> ScalarTypeOf<AggregateBlockT, p>;
+    inline __device__ auto get(unsigned int blockId, unsigned int offset) const -> ScalarTypeOf<AggregateBlockT, p>
+    {
+        #ifdef __NVCC__
+        
+            const auto sid = blockMap.get_sparse(blockId);
+            const auto & block = blockMap.template get_ele<p>(sid.id)[offset];
+            const auto mask = blockMap.template get_ele<pMask>(sid.id)[offset];
+            // Now check if the element actually exists
+            return exist(mask)
+                        ? block
+                        : blockMap.template getBackground<p>()[offset];
+        
+        #else // __NVCC__
+            std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
+        #endif // __NVCC__
+    }
 
-    inline __device__ auto getBlock(unsigned int blockId) -> decltype(blockMap.get(0));
+    inline __device__ auto getBlock(unsigned int blockId) -> decltype(blockMap.get(0))
+    {
+        #ifdef __NVCC__
+            return blockMap.get(blockId);
+        #else // __NVCC__
+            std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
+        #endif // __NVCC__
+    }
 
     template<unsigned int p>
-    inline __device__ ScalarTypeOf<AggregateBlockT, p> & getReference(unsigned int linId);
+    inline __device__ ScalarTypeOf<AggregateBlockT, p> & getReference(unsigned int linId)
+    {
+        // Only call this if you are TOTALLY SURE the element exists! Otherwise KABOOOOOM! :D
+    #ifdef __NVCC__
+        typedef BlockTypeOf<AggregateBlockT, p> BlockT;
+        unsigned int blockId = linId / BlockT::size;
+        unsigned int offset = linId % BlockT::size;
+        return getReference<p>(blockId, offset);
+    #else // __NVCC__
+        std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
+    #endif // __NVCC__
+    }
 
     template<unsigned int p>
-    inline __device__ ScalarTypeOf<AggregateBlockT, p> & getReference(unsigned int blockId, unsigned int offset);
+    inline __device__ ScalarTypeOf<AggregateBlockT, p> & getReference(unsigned int blockId, unsigned int offset)
+    {
+        // Only call this if you are TOTALLY SURE the element exists! Otherwise KABOOOOOM! :D
+    #ifdef __NVCC__
+        return blockMap.template get<p>(blockId)[offset];
+    #else // __NVCC__
+        std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
+    #endif // __NVCC__
+    }
 
     template<unsigned int p>
     inline __device__ auto insert(unsigned int linId) -> ScalarTypeOf<AggregateBlockT, p>&
@@ -130,7 +182,19 @@ public:
     }
 
     template<unsigned int p>
-    inline __device__ auto insert(unsigned int blockId, unsigned int offset) -> ScalarTypeOf<AggregateBlockT, p>&;
+    inline __device__ auto insert(unsigned int blockId, unsigned int offset) -> ScalarTypeOf<AggregateBlockT, p>&
+    {
+        #ifdef __NVCC__
+            auto aggregate = blockMap.insert(blockId);
+            auto &block = aggregate.template get<p>();
+            auto &mask = aggregate.template get<pMask>();
+            setExist(mask[offset]);
+        
+            return block[offset];
+        #else // __NVCC__
+            std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
+        #endif // __NVCC__
+    }
 
     template<unsigned int nChunksPerBlocks = 1>
     inline __device__ auto insertBlock(indexT blockId, unsigned int stride = 8192) -> decltype(blockMap.insert(0))
@@ -313,108 +377,7 @@ public:
 
 };
 
-template<typename AggregateBlockT, typename indexT, template<typename> class layout_base>
-template<unsigned int p>
-inline __device__ auto BlockMapGpu_ker<AggregateBlockT, indexT, layout_base>
-::get(unsigned int linId) const -> ScalarTypeOf<AggregateBlockT, p>
-{
-#ifdef __NVCC__
-    typedef BlockTypeOf<AggregateBlockT, p> BlockT;
-    unsigned int blockId = linId / BlockT::size;
-    unsigned int offset = linId % BlockT::size;
-    return get<p>(blockId, offset);
-#else // __NVCC__
-    std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
-#endif // __NVCC__
-}
-
 
-template<typename AggregateBlockT, typename indexT, template<typename> class layout_base>
-template<unsigned int p>
-inline __device__ auto BlockMapGpu_ker<AggregateBlockT, indexT, layout_base>
-::get(unsigned int blockId, unsigned int offset) const -> ScalarTypeOf<AggregateBlockT, p>
-{
-#ifdef __NVCC__
-//    const auto & aggregate = blockMap.get(blockId);
-//    const auto & block = aggregate.template get<p>();
-//    const auto & mask = aggregate.template get<pMask>();
-//    // Now check if the element actually exists
-//    return exist(mask[offset])
-//                ? block[offset]
-//                : blockMap.template getBackground<p>()[offset];
-////    return blockMap.template get<p>(blockId)[offset];
-
-    const auto sid = blockMap.get_sparse(blockId);
-    const auto & block = blockMap.template get_ele<p>(sid.id)[offset];
-    const auto mask = blockMap.template get_ele<pMask>(sid.id)[offset];
-    // Now check if the element actually exists
-    return exist(mask)
-                ? block
-                : blockMap.template getBackground<p>()[offset];
-
-#else // __NVCC__
-    std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
-#endif // __NVCC__
-}
-
-
-template<typename AggregateBlockT, typename indexT, template<typename> class layout_base>
-inline __device__ auto BlockMapGpu_ker<AggregateBlockT, indexT, layout_base>
-::getBlock(unsigned int blockId) -> decltype(blockMap.get(0))
-{
-#ifdef __NVCC__
-    return blockMap.get(blockId);
-#else // __NVCC__
-    std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
-#endif // __NVCC__
-}
-
-template<typename AggregateBlockT, typename indexT, template<typename> class layout_base>
-template<unsigned int p>
-inline __device__ ScalarTypeOf<AggregateBlockT, p> & BlockMapGpu_ker<AggregateBlockT, indexT, layout_base>
-::getReference(const unsigned int linId)
-{
-    // Only call this if you are TOTALLY SURE the element exists! Otherwise KABOOOOOM! :D
-#ifdef __NVCC__
-    typedef BlockTypeOf<AggregateBlockT, p> BlockT;
-    unsigned int blockId = linId / BlockT::size;
-    unsigned int offset = linId % BlockT::size;
-    return getReference<p>(blockId, offset);
-#else // __NVCC__
-    std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
-#endif // __NVCC__
-}
-
-template<typename AggregateBlockT, typename indexT, template<typename> class layout_base>
-template<unsigned int p>
-inline __device__ ScalarTypeOf<AggregateBlockT, p> & BlockMapGpu_ker<AggregateBlockT, indexT, layout_base>
-::getReference(const unsigned int blockId, const unsigned int offset)
-{
-    // Only call this if you are TOTALLY SURE the element exists! Otherwise KABOOOOOM! :D
-#ifdef __NVCC__
-    return blockMap.template get<p>(blockId)[offset];
-#else // __NVCC__
-    std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
-#endif // __NVCC__
-}
-
-
-template<typename AggregateBlockT, typename indexT, template<typename> class layout_base>
-template<unsigned int p>
-inline __device__ auto BlockMapGpu_ker<AggregateBlockT, indexT, layout_base>
-::insert(unsigned int blockId, unsigned int offset) -> ScalarTypeOf<AggregateBlockT, p>&
-{
-#ifdef __NVCC__
-    auto aggregate = blockMap.insert(blockId);
-    auto &block = aggregate.template get<p>();
-    auto &mask = aggregate.template get<pMask>();
-    setExist(mask[offset]);
-
-    return block[offset];
-#else // __NVCC__
-    std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
-#endif // __NVCC__
-}
 
 
 #endif /* BLOCK_MAP_GPU_KER_CUH_ */
diff --git a/src/SparseGridGpu/SparseGridGpu.hpp b/src/SparseGridGpu/SparseGridGpu.hpp
index c7b69fd7cddd8ee0f084fd389271c03e0b1dfd5e..7d3199361e2efa94d60bbc657acc47ae81631055 100644
--- a/src/SparseGridGpu/SparseGridGpu.hpp
+++ b/src/SparseGridGpu/SparseGridGpu.hpp
@@ -2416,6 +2416,46 @@ public:
 		applyStencils< SparseGridGpuKernels::stencil_cross_func_conv<dim,nLoop,prop_src,prop_dst,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
 	}
 
+    /*! \brief Apply a free-form cross-stencil convolution (block-read variant): the lambda
+     *         receives the loaded data block and the in-block offset in addition to the
+     *         ghost-extended copy-block and the point coordinates
+     */
+	template<unsigned int prop_src, unsigned int prop_dst, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
+	void conv_cross_b(grid_key_dx<dim> start, grid_key_dx<dim> stop , lambda_f func, ArgsT ... args)
+	{
+		Box<dim,int> box;
+
+		for (int i = 0 ; i < dim ; i++)
+		{
+			box.setLow(i,start.get(i));
+			box.setHigh(i,stop.get(i));
+		}
+
+		constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value), (blockSize)>::value;
+
+		applyStencils< SparseGridGpuKernels::stencil_cross_func_conv_block_read<dim,nLoop,prop_src,prop_dst,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
+	}
+
+    /*! \brief Apply a free type convolution using blocks
+     *
+     *
+     */
+	template<unsigned int prop_src1, unsigned int prop_src2, unsigned int prop_dst1 , unsigned int prop_dst2, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
+	void conv2_b(grid_key_dx<dim> start, grid_key_dx<dim> stop , lambda_f func, ArgsT ... args)
+	{
+		Box<dim,int> box;
+
+		for (int i = 0 ; i < dim ; i++)
+		{
+			box.setLow(i,start.get(i));
+			box.setHigh(i,stop.get(i));
+		}
+
+        constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value), (blockSize)>::value;
+
+		applyStencils< SparseGridGpuKernels::stencil_func_conv2_b<dim,nLoop,prop_src1,prop_src2,prop_dst1,prop_dst2,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
+	}
+
     /*! \brief Apply a free type convolution using blocks
      *
      *
@@ -2520,7 +2560,33 @@ public:
     {
         return gridGeometry.BlockLinId(blockCoord);
     }
+	
+	/*! \brief Insert the point on host side and flush directly
+	 *
+	 * First you have to move everything on host with deviceToHost, insertFlush and then move to GPU again
+	 *
+	 * \param coord grid point where to insert
+	 *
+	 * \return a reference to the data to fill
+	 *
+	 *
+	 */
+	template<unsigned int p>
+	auto insertFlush(const sparse_grid_gpu_index<self> &coord) -> ScalarTypeOf<AggregateBlockT, p> &
+	{
+		auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
+
+		indexT block_id = indexBuffer.template get<0>(coord.get_cnk_pos_id());
+		indexT local_id = coord.get_data_id();
+
+		typedef BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base> BMG;
 
+		auto block_data = this->insertBlockFlush(block_id);
+		block_data.template get<BMG::pMask>()[local_id] = 1;
+
+		return block_data.template get<p>()[local_id];
+	}
+    
     /*! \brief Insert the point on host side and flush directly
      *
      * First you have to move everything on host with deviceToHost, insertFlush and than move to GPU again
diff --git a/src/SparseGridGpu/SparseGridGpu_kernels.cuh b/src/SparseGridGpu/SparseGridGpu_kernels.cuh
index 33edebbf6987a2a555eda1053903086e3ced7739..0a44dc4d813f58ebe8046048d1baceb2b763176e 100644
--- a/src/SparseGridGpu/SparseGridGpu_kernels.cuh
+++ b/src/SparseGridGpu/SparseGridGpu_kernels.cuh
@@ -117,6 +117,17 @@ namespace SparseGridGpuKernels
 	template<>
 	struct stencil_conv_func_impl<3>
 	{
+		template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
+		__device__ static inline void stencil_block(ScalarT & res, coordType & coord ,
+				            CpBlockType & cpb,
+							DataBlockWrapperT & DataBlockLoad,
+							int offset,
+				            lambda_func f,
+				            ArgsT ... args)
+		{
+			res = f(cpb,DataBlockLoad,offset,coord[0],coord[1],coord[2]);
+		}
+
 		template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... ArgsT>
 		__device__ static inline void stencil(ScalarT & res, coordType & coord ,
 				            CpBlockType & cpb,
@@ -135,11 +146,34 @@ namespace SparseGridGpuKernels
 		{
 			f(res1,res2,cpb1,cpb2,coord[0],coord[1],coord[2]);
 		}
+
+		template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
+		__device__ static inline void stencil2_block(ScalarT & res1, ScalarT & res2, coordType & coord ,
+				            CpBlockType & cpb1,
+				            CpBlockType & cpb2,
+							DataBlockWrapperT & DataBlockLoad,
+							int offset,
+				            lambda_func f,
+				            ArgsT ... args)
+		{
+			f(res1,res2,cpb1,cpb2,DataBlockLoad,offset,coord[0],coord[1],coord[2]);
+		}
 	};
 
 	template<>
 	struct stencil_conv_func_impl<2>
 	{
+		template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
+		__device__ static inline void stencil_block(ScalarT & res, coordType & coord,
+				            CpBlockType & cpb,
+							DataBlockWrapperT & DataBlockLoad,
+							int offset,
+				            lambda_func f,
+				            ArgsT ... args)
+		{
+			res = f(cpb,DataBlockLoad,offset,coord[0],coord[1]);
+		}
+
 		template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... ArgsT>
 		__device__ static inline void stencil(ScalarT & res, coordType & coord ,
 				            CpBlockType & cpb,
@@ -158,6 +192,18 @@ namespace SparseGridGpuKernels
 		{
 			f(res1,res2,cpb1,cpb2,coord[0],coord[1]);
 		}
+
+		template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
+		__device__ static inline void stencil2_block(ScalarT & res1, ScalarT & res2, coordType & coord ,
+				            CpBlockType & cpb1,
+				            CpBlockType & cpb2,
+							DataBlockWrapperT & DataBlockLoad,
+							int offset,
+				            lambda_func f,
+				            ArgsT ... args)
+		{
+			f(res1,res2,cpb1,cpb2,DataBlockLoad,offset,coord[0],coord[1]);
+		}
 	};
 
 	template<unsigned int dim, unsigned int n_loop, unsigned int p_src, unsigned int p_dst, unsigned int stencil_size>
@@ -233,6 +279,151 @@ namespace SparseGridGpuKernels
 	    }
 	};
 
+
+	template<unsigned int dim, unsigned int n_loop, unsigned int p_src, unsigned int p_dst, unsigned int stencil_size>
+	struct stencil_cross_func_conv_block_read
+	{
+		typedef NNStar<dim> stencil_type;
+
+		static constexpr unsigned int supportRadius = stencil_size;
+
+		template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT>
+		static inline __device__ void stencil(
+				SparseGridT & sparseGrid,
+				const unsigned int dataBlockId,
+				openfpm::sparse_index<unsigned int> dataBlockIdPos,
+				unsigned int offset,
+				grid_key_dx<dim, int> & pointCoord,
+				DataBlockWrapperT & dataBlockLoad,
+				DataBlockWrapperT & dataBlockStore,
+				unsigned char curMask,
+				lambda_func f,
+				ArgT ... args)
+		{
+	        typedef typename SparseGridT::AggregateBlockType AggregateT;
+	        typedef ScalarTypeOf<AggregateT, p_src> ScalarT;
+
+	        constexpr unsigned int enlargedBlockSize = IntPow<
+	                SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
+
+	        __shared__ ScalarT enlargedBlock[enlargedBlockSize];
+
+	        for (int i = 0; i < n_loop ; i++)
+	        {
+	        	if (i*IntPow<SparseGridT::getBlockEdgeSize(), dim>::value + threadIdx.x < enlargedBlockSize)
+	        	{
+	        		enlargedBlock[i*IntPow<SparseGridT::getBlockEdgeSize(), dim>::value + threadIdx.x] = sparseGrid.getblockMap().template getBackground<p_src>()[0];
+	        	}
+	        }
+
+	        __syncthreads();
+
+	        typedef typename vmpl_create_constant<dim,SparseGridT::blockEdgeSize_>::type block_sizes;
+	        typedef typename vmpl_sum_constant<2*stencil_size,block_sizes>::type vmpl_sizes;
+
+	        cp_block<ScalarT,stencil_size,vmpl_sizes,dim> cpb(enlargedBlock);
+
+	        sparseGrid.template loadGhostBlock<p_src>(dataBlockLoad, dataBlockIdPos, enlargedBlock);
+
+	        __syncthreads();
+
+	        ScalarT res = 0;
+
+	        if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
+	        {
+	        	int coord[dim];
+
+				unsigned int linIdTmp = offset;
+				for (unsigned int d = 0; d < dim; ++d)
+				{
+					coord[d] = linIdTmp % SparseGridT::blockEdgeSize_;
+					linIdTmp /= SparseGridT::blockEdgeSize_;
+				}
+
+	            stencil_conv_func_impl<dim>::stencil_block(res,coord,cpb,dataBlockLoad,offset,f,args...);
+
+	            dataBlockStore.template get<p_dst>()[offset] = res;
+	        }
+		}
+
+	    template <typename SparseGridT, typename CtxT>
+	    static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
+	    {
+	        // No flush
+	    }
+	};
+
+	template<unsigned int dim, unsigned int n_loop, unsigned int p_src1, unsigned int p_src2, unsigned int p_dst1, unsigned int p_dst2, unsigned int stencil_size>
+	struct stencil_func_conv2_b
+	{
+		typedef NNStar<dim> stencil_type;
+
+		static constexpr unsigned int supportRadius = stencil_size;
+
+		template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT>
+		static inline __device__ void stencil(
+				SparseGridT & sparseGrid,
+				const unsigned int dataBlockId,
+				openfpm::sparse_index<unsigned int> dataBlockIdPos,
+				unsigned int offset,
+				grid_key_dx<dim, int> & pointCoord,
+				DataBlockWrapperT & dataBlockLoad,
+				DataBlockWrapperT & dataBlockStore,
+				unsigned char curMask,
+				lambda_func f,
+				ArgT ... args)
+		{
+	        typedef typename SparseGridT::AggregateBlockType AggregateT;
+	        typedef ScalarTypeOf<AggregateT, p_src1> ScalarT1;
+	        typedef ScalarTypeOf<AggregateT, p_src2> ScalarT2;
+
+	        constexpr unsigned int enlargedBlockSize = IntPow<
+	                SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
+
+	        __shared__ ScalarT1 enlargedBlock1[enlargedBlockSize];
+	        __shared__ ScalarT2 enlargedBlock2[enlargedBlockSize];
+
+	        // NOTE(review): no background fill here, unlike stencil_cross_func_conv_block_read — confirm loadGhostBlock covers all ghost cells
+
+	        typedef typename vmpl_create_constant<dim,SparseGridT::blockEdgeSize_>::type block_sizes;
+	        typedef typename vmpl_sum_constant<2*stencil_size,block_sizes>::type vmpl_sizes;
+
+	        cp_block<ScalarT1,stencil_size,vmpl_sizes,dim> cpb1(enlargedBlock1);
+	        cp_block<ScalarT2,stencil_size,vmpl_sizes,dim> cpb2(enlargedBlock2);
+
+	        sparseGrid.template loadGhostBlock<p_src1>(dataBlockLoad, dataBlockIdPos, enlargedBlock1);
+	        sparseGrid.template loadGhostBlock<p_src2>(dataBlockLoad, dataBlockIdPos, enlargedBlock2);
+
+	        __syncthreads();
+
+	        ScalarT1 res1 = 0;
+	        ScalarT2 res2 = 0;
+
+	        if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
+	        {
+	        	int coord[dim];
+
+				unsigned int linIdTmp = offset;
+				for (unsigned int d = 0; d < dim; ++d)
+				{
+					coord[d] = linIdTmp % SparseGridT::blockEdgeSize_;
+					linIdTmp /= SparseGridT::blockEdgeSize_;
+				}
+
+	            stencil_conv_func_impl<dim>::stencil2_block(res1,res2,coord,cpb1,cpb2,dataBlockLoad,offset,f,args...);
+
+	            dataBlockStore.template get<p_dst1>()[offset] = res1;
+	            dataBlockStore.template get<p_dst2>()[offset] = res2;
+	        }
+		}
+
+	    template <typename SparseGridT, typename CtxT>
+	    static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
+	    {
+	        // No flush
+	    }
+	};
+
 	template<unsigned int dim, unsigned int n_loop, unsigned int p_src1, unsigned int p_src2, unsigned int p_dst1, unsigned int p_dst2, unsigned int stencil_size>
 	struct stencil_func_conv2
 	{
@@ -1076,7 +1267,7 @@ namespace SparseGridGpuKernels
 #ifdef SE_CLASS1
 
         if (numCnt > blockDim.x)
-        {printf("Error calc_exist_points_with_boxes assertion failed numCnt >= blockDim.x  %d %d \n",numCnt,blockDim.x);}
+        {printf("Error calc_exist_points_with_boxes assertion failed numCnt >= blockDim.x  %d %d \n",numCnt,(int)blockDim.x);}
 
 #endif
 
@@ -1164,7 +1355,7 @@ namespace SparseGridGpuKernels
 #ifdef SE_CLASS1
 
         if (numCnt > blockDim.x)
-        {printf("Error get_exist_points_with_boxes assertion failed numCnt >= blockDim.x  %d %d \n",numCnt,blockDim.x);}
+        {printf("Error get_exist_points_with_boxes assertion failed numCnt >= blockDim.x  %d %d \n",numCnt,(int)blockDim.x);}
 
 #endif
 
diff --git a/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_sparse.cu b/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_sparse.cu
index 14cc99869eda16d3e0a8cfe51acd931d50d214cb..abec6ef5b6b96a058e61d47b8d2e285065d9381b 100644
--- a/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_sparse.cu
+++ b/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_sparse.cu
@@ -82,7 +82,6 @@ void testStencilHeatSparse_perf(unsigned int i, std::string base, float fillMult
     sparseGrid.template applyStencils<BoundaryStencilSetXRescaled<dim,0,0>>(sparseGrid.getBox(),STENCIL_MODE_INPLACE,
             centerPoint, centerPoint + 2*blockEdgeSize*gridEdgeSize,
             0.0, 10.0);
-    cudaDeviceSynchronize();
 
     iterations /= 2;
     for (unsigned int iter=0; iter<iterations; ++iter)
diff --git a/src/SparseGridGpu/performance/performancePlots.cpp b/src/SparseGridGpu/performance/performancePlots.cpp
index 6fd461b4c50edba13c92f02b801f439c1a43b78f..01a7092441de8e3d784704bc12319c10266594d3 100644
--- a/src/SparseGridGpu/performance/performancePlots.cpp
+++ b/src/SparseGridGpu/performance/performancePlots.cpp
@@ -59,7 +59,7 @@ void write_test_report(report_sparse_grid_tests &report_sparsegrid_funcs, std::s
 
     StandardXMLPerformanceGraph(file_xml_results, file_xml_ref, cg, 1);
 
-    addUpdtateTime(cg,1);
+    addUpdateTime(cg,1,"data","SparseGridGpu_performance");
     cg.write("SparseGridGpu_performance.html");
 }
 
diff --git a/src/Vector/cuda/map_vector_cuda_funcs_tests.cu b/src/Vector/cuda/map_vector_cuda_funcs_tests.cu
index 907d29c60c382f669f12492eeef9398d57d9cb5b..01aec66f7b07f25262493ab9614b500315106e9a 100644
--- a/src/Vector/cuda/map_vector_cuda_funcs_tests.cu
+++ b/src/Vector/cuda/map_vector_cuda_funcs_tests.cu
@@ -8,7 +8,7 @@
 
 #define BOOST_GPU_ENABLED __host__ __device__
 
-#include "util/cudify/cudify.hpp"
+#include "util/cuda_launch.hpp"
 
 #include "config.h"
 #define BOOST_TEST_DYN_LINK
diff --git a/src/Vector/cuda/map_vector_cuda_ker.cuh b/src/Vector/cuda/map_vector_cuda_ker.cuh
index d6ebfd8d68ff2fe1ba444388954c91add026ce61..f1545afe093f6b805a76c2040be12524d54b87b5 100644
--- a/src/Vector/cuda/map_vector_cuda_ker.cuh
+++ b/src/Vector/cuda/map_vector_cuda_ker.cuh
@@ -257,7 +257,7 @@ namespace openfpm
 		 *
 		 */
 
-		inline __device__ __host__ auto get_o(unsigned int id) const -> decltype(base.get_o(id))
+		inline __device__ __host__ auto get_o(unsigned int id) const -> decltype(base.get_o(grid_key_dx<1>(id)))
 		{
 #ifdef SE_CLASS1
 			if (check_bound(id) == false)
@@ -281,7 +281,7 @@ namespace openfpm
 		 *
 		 */
 
-		inline __device__ __host__ auto get_o(unsigned int id) -> decltype(base.get_o(id))
+		inline __device__ __host__ auto get_o(unsigned int id) -> decltype(base.get_o(grid_key_dx<1>(id)))
 		{
 #ifdef SE_CLASS1
 			if (check_bound(id) == false)
@@ -298,7 +298,7 @@ namespace openfpm
 		 * \return the last element (encapsulated)
 		 *
 		 */
-		inline auto last() const -> decltype(base.get_o(0))
+		inline auto last() const -> decltype(base.get_o(grid_key_dx<1>(0)))
 		{
 			grid_key_dx<1> key(size()-1);
 
@@ -333,7 +333,7 @@ namespace openfpm
 		 * \return the element (encapsulated)
 		 *
 		 */
-		inline auto last() -> decltype(base.get_o(0))
+		inline auto last() -> decltype(base.get_o(grid_key_dx<1>(0)))
 		{
 			grid_key_dx<1> key(size()-1);
 
@@ -453,7 +453,7 @@ namespace openfpm
 		 *
 		 *
 		 */
-		__host__ ite_gpu<1> getGPUIterator(size_t n_thr = 1024) const
+		__host__ ite_gpu<1> getGPUIterator(size_t n_thr = default_kernel_wg_threads_) const
 		{
 			grid_key_dx<1> start(0);
 			grid_key_dx<1> stop(size()-1);
@@ -465,7 +465,7 @@ namespace openfpm
 		 *
 		 *
 		 */
-		ite_gpu<1> getGPUIteratorTo(size_t stop, size_t n_thr = 1024) const
+		ite_gpu<1> getGPUIteratorTo(size_t stop, size_t n_thr = default_kernel_wg_threads_) const
 		{
 			grid_key_dx<1> start(0);
 			grid_key_dx<1> stop_(stop);
@@ -478,7 +478,7 @@ namespace openfpm
 		 * \param object to copy
 		 *
 		 */
-		vector_gpu_ker<T,layout_base> & operator=(const vector_gpu_ker<T,layout_base> & v)
+		__host__ vector_gpu_ker<T,layout_base> & operator=(const vector_gpu_ker<T,layout_base> & v)
 		{
 			v_size = v.v_size;
 			base = v.base;
diff --git a/src/Vector/cuda/map_vector_sparse_cuda_ker.cuh b/src/Vector/cuda/map_vector_sparse_cuda_ker.cuh
index b3b7fceae6b09f6ff46f4f8f84e3c18fded376b0..b33a093fbad5460860cdc6fc160dedacb339a50b 100644
--- a/src/Vector/cuda/map_vector_sparse_cuda_ker.cuh
+++ b/src/Vector/cuda/map_vector_sparse_cuda_ker.cuh
@@ -317,7 +317,9 @@ namespace openfpm
 			vct_add_index.template get<0>(slot_base*nslot_add+pos) = ele;
 			return vct_add_data.template get<p>(slot_base*nslot_add+pos);
 #else
-			std::cout << __FILE__ << ":" << __LINE__ << " Error, this function in order to work is supposed to be compiled with nvcc" << std::endl;
+
+			printf("vector_sparse_gpu_ker.insert[1]: Error, this function in order to work is supposed to be compiled with nvcc\n");
+
 #endif
 		}
 
@@ -339,7 +341,7 @@ namespace openfpm
 			vct_rem_index.template get<0>(slot_base*nslot_rem+pos) = ele;
 
 #else
-			std::cout << __FILE__ << ":" << __LINE__ << " Error, this function in order to work is supposed to be compiled with nvcc" << std::endl;
+			printf("vector_sparse_gpu_ker.remove: Error, this function in order to work is supposed to be compiled with nvcc\n");
 #endif
 		}
 
@@ -362,7 +364,7 @@ namespace openfpm
 
 			return vct_add_data.get(slot_base*nslot_add+pos);
 #else
-			std::cout << __FILE__ << ":" << __LINE__ << " Error, this function in order to work is supposed to be compiled with nvcc" << std::endl;
+			printf("vector_sparse_gpu_ker.insert[2]: Error, this function in order to work is supposed to be compiled with nvcc\n");
 #endif
 		}
 
@@ -378,7 +380,7 @@ namespace openfpm
 			vct_rem_index.template get<0>(slot_base*nslot_rem+pos) = ele;
 
 #else
-			std::cout << __FILE__ << ":" << __LINE__ << " Error, this function in order to work is supposed to be compiled with nvcc" << std::endl;
+			printf("vector_sparse_gpu_ker.remove_b: Error, this function in order to work is supposed to be compiled with nvcc\n");
 #endif
 		}
 
@@ -395,7 +397,7 @@ namespace openfpm
 			vct_add_index.template get<0>(slot_base*nslot_add+pos) = ele;
 			return vct_add_data.template get<p>(slot_base*nslot_add+pos);
 #else
-			std::cout << __FILE__ << ":" << __LINE__ << " Error, this function in order to work is supposed to be compiled with nvcc" << std::endl;
+			printf("vector_sparse_gpu_ker.insert_b: Error, this function in order to work is supposed to be compiled with nvcc\n");
 #endif
 		}
 
@@ -411,7 +413,7 @@ namespace openfpm
 			vct_add_index.template get<0>(slot_base*nslot_add+pos) = ele;
 			return vct_add_data.get(slot_base*nslot_add+pos);
 #else
-			std::cout << __FILE__ << ":" << __LINE__ << " Error, this function in order to work is supposed to be compiled with nvcc" << std::endl;
+			printf("vector_sparse_gpu_ker.insert_b: Error, this function in order to work is supposed to be compiled with nvcc\n");
 #endif
 		}
 
@@ -432,7 +434,7 @@ namespace openfpm
 			}
 
 #else
-			std::cout << __FILE__ << ":" << __LINE__ << " Error, this function in order to work is supposed to be compiled with nvcc" << std::endl;
+			printf("vector_sparse_gpu_ker.flush_block_insert: Error, this function in order to work is supposed to be compiled with nvcc\n");
 #endif
 		}
 
@@ -453,7 +455,7 @@ namespace openfpm
 			}
 
 #else
-			std::cout << __FILE__ << ":" << __LINE__ << " Error, this function in order to work is supposed to be compiled with nvcc" << std::endl;
+			printf("vector_sparse_gpu_ker.flush_block_remove: Error, this function in order to work is supposed to be compiled with nvcc\n");
 #endif
 		}
 
@@ -477,7 +479,7 @@ namespace openfpm
 
 
 #else
-			std::cout << __FILE__ << ":" << __LINE__ << " Error, this function in order to work is supposed to be compiled with nvcc" << std::endl;
+			printf("vector_sparse_gpu_ker.flush_block_insert: Error, this function in order to work is supposed to be compiled with nvcc\n");
 #endif
 		}
 
@@ -500,7 +502,7 @@ namespace openfpm
 			{vct_nrem_index.template get<0>(b) = vct_atomic_rem;}
 
 #else
-			std::cout << __FILE__ << ":" << __LINE__ << " Error, this function in order to work is supposed to be compiled with nvcc" << std::endl;
+			printf("vector_sparse_gpu_ker.flush_block_remove: Error, this function in order to work is supposed to be compiled with nvcc\n");
 #endif
 		}
 
diff --git a/src/Vector/cuda/map_vector_sparse_cuda_ker_unit_tests.cu b/src/Vector/cuda/map_vector_sparse_cuda_ker_unit_tests.cu
index c3697d4208260b8eb283e7ce607b0272e09d7284..6a9a1537154b66558328d4973402883f45e8807d 100644
--- a/src/Vector/cuda/map_vector_sparse_cuda_ker_unit_tests.cu
+++ b/src/Vector/cuda/map_vector_sparse_cuda_ker_unit_tests.cu
@@ -299,6 +299,12 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu_incremental_add )
 		match &= vs.template get<0>(9000 - 2*i) == 3*(2*i + 3000);
 		match &= vs.template get<1>(9000 - 2*i) == 2*i + 13000;
 		match &= vs.template get<2>(9000 - 2*i) == 2*i + 23000;
+
+		if (match == false)
+		{
+			std::cout << i << "     " <<  vs.template get<0>(9000 - 2*i) << "!=" << 3*(2*i + 3000) << "     " << vs.template get<1>(9000 - 2*i) << "!=" << 2*i + 13000 << "    " << vs.template get<2>(9000 - 2*i) << "!=" << 2*i + 23000 << std::endl;
+			break;
+		}
 	}
 
 	for (size_t i = 0 ; i < 500 ; i++)
@@ -306,6 +312,12 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu_incremental_add )
 		match &= vs.template get<0>(9000 - 2*i) == 3*(2*i + 3000) + 2*i + 1100;
 		match &= vs.template get<1>(9000 - 2*i) == 2*i + 11100;
 		match &= vs.template get<2>(9000 - 2*i) == 2*i + 23000;
+
+		if (match == false)
+		{
+			std::cout << i << "     " <<  vs.template get<0>(9000 - 2*i) << "!=" << 3*(2*i + 3000) + 2*i + 1100 << "     " << vs.template get<1>(9000 - 2*i) << "!=" << 2*i + 11100 << "    " << vs.template get<2>(9000 - 2*i) << "!=" << 2*i + 23000 << std::endl;
+			break;
+		}
 	}
 
 	for (size_t i = 0 ; i < 500 ; i++)
@@ -313,6 +325,12 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu_incremental_add )
 		match &= vs.template get<0>(10000 - 2*i) == 2*i + 100;
 		match &= vs.template get<1>(10000 - 2*i) == 2*i + 10100;
 		match &= vs.template get<2>(10000 - 2*i) == 2*i + 20100;
+
+		if (match == false)
+		{
+			std::cout << i << "     " <<  vs.template get<0>(10000 - 2*i) << "!=" << 2*i + 100 << "     " << vs.template get<1>(10000 - 2*i) << "!=" << 2*i + 10100 << "    " << vs.template get<2>(10000 - 2*i) << "!=" << 2*i + 20100 << std::endl;
+			break;
+		}
 	}
 
 	BOOST_REQUIRE_EQUAL(match,true);
diff --git a/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh b/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh
index 28d3963ca7d116ecbb65c4a165335a1266f2cba0..33b0a68a390c974565afe1e3fbe4d5dcd2b07c1f 100644
--- a/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh
+++ b/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh
@@ -10,14 +10,15 @@
 
 #ifdef __NVCC__
 
+#include "config.h"
+
 #if CUDART_VERSION < 11000
 #include "util/cuda/cub_old/util_type.cuh"
 #include "util/cuda/cub_old/block/block_scan.cuh"
 #include "util/cuda/moderngpu/operators.hxx"
+#include "util/cuda_launch.hpp"
 #else
-	#if !defined(CUDA_ON_CPU)
-	#include "cub/util_type.cuh"
-	#include "cub/block/block_scan.cuh"
+	#if !defined(CUDA_ON_CPU)	
 	#include "util/cuda/moderngpu/operators.hxx"
 	#endif
 #endif
diff --git a/src/Vector/cuda/map_vector_sparse_cuda_kernels_unit_tests.cu b/src/Vector/cuda/map_vector_sparse_cuda_kernels_unit_tests.cu
index 6370fbc752a0f49c62660d2290d218f4c2d2ffcb..0dd883bcf5f52f943b588a893afa9a75412f3cc6 100644
--- a/src/Vector/cuda/map_vector_sparse_cuda_kernels_unit_tests.cu
+++ b/src/Vector/cuda/map_vector_sparse_cuda_kernels_unit_tests.cu
@@ -3,9 +3,7 @@
 #include <Vector/map_vector.hpp>
 #include <Vector/cuda/map_vector_sparse_cuda_kernels.cuh>
 #include "util/cuda/scan_ofp.cuh"
-#ifndef CUDA_ON_CPU
-#include "util/cuda/moderngpu/kernel_merge.hxx"
-#endif
+#include "util/cuda/merge_ofp.cuh"
 
 BOOST_AUTO_TEST_SUITE( vector_sparse_cuda_kernels )
 
@@ -176,7 +174,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_merge_use )
 	vct_index_old.template hostToDevice<0,1>();
 	vct_add_index.template hostToDevice<0,1>();
 
-	mgpu::merge((int *)vct_index_old.template getDeviceBuffer<0>(),(int *)vct_index_old.template getDeviceBuffer<1>(),vct_index_old.size(),
+	openfpm::merge((int *)vct_index_old.template getDeviceBuffer<0>(),(int *)vct_index_old.template getDeviceBuffer<1>(),vct_index_old.size(),
 			    (int *)vct_add_index.template getDeviceBuffer<0>(),(int *)vct_add_index.template getDeviceBuffer<1>(),vct_add_index.size(),
 			    (int *)vct_index.template getDeviceBuffer<0>(),(int *)vct_index.template getDeviceBuffer<1>(),mgpu::less_t<int>(),ctx);
 
@@ -272,7 +270,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_solve_conflicts_use )
 	vct_data_old.template hostToDevice<0,1,2>();
 	vct_add_data.template hostToDevice<0,1,2>();
 
-	mgpu::merge((int *)vct_index_old.template getDeviceBuffer<0>(),(int *)vct_index_old.template getDeviceBuffer<1>(),vct_index_old.size(),
+	openfpm::merge((int *)vct_index_old.template getDeviceBuffer<0>(),(int *)vct_index_old.template getDeviceBuffer<1>(),vct_index_old.size(),
 			    (int *)vct_add_index.template getDeviceBuffer<0>(),(int *)vct_add_index.template getDeviceBuffer<1>(),vct_add_index.size(),
 			    (int *)vct_index.template getDeviceBuffer<0>(),(int *)merge_indexes.template getDeviceBuffer<0>(),mgpu::less_t<int>(),ctx);
 
diff --git a/src/Vector/map_vector.hpp b/src/Vector/map_vector.hpp
index 1118c76ebdd3cc9537ab5af87c4c4d72efb2dfea..cf0c6ef0ef00dad84aac7be7868d52a4992a4f8a 100644
--- a/src/Vector/map_vector.hpp
+++ b/src/Vector/map_vector.hpp
@@ -44,6 +44,50 @@
 namespace openfpm
 {
 
+	template<bool active>
+	struct copy_two_vectors_activate_impl
+	{
+		template<typename vector_type1, typename vector_type2>
+		static void copy(vector_type1 & v1, vector_type2 & v2)
+		{
+
+		}
+
+		template<typename vector_type1, typename vector_type2>
+		static void copy2(vector_type1 & v1, vector_type2 & v2)
+		{
+
+		}
+	};
+
+	template<>
+	struct copy_two_vectors_activate_impl<true>
+	{
+		template<typename vector_type1, typename vector_type2>
+		static void copy(vector_type1 & v1, vector_type2 & v2)
+		{
+#ifdef __NVCC__
+			if (v1.size() != 0)
+			{
+				auto it = v1.getGPUIterator();
+				CUDA_LAUNCH(copy_two_vectors,it,v1.toKernel(),v2.toKernel());
+			}
+#endif
+		}
+
+		template<typename vector_type1, typename vector_type2>
+		static void copy2(vector_type1 & v1, vector_type2 & v2)
+		{
+#ifdef __NVCC__
+			if (v2.size() != 0)
+			{
+				auto it = v1.getGPUIterator();
+				CUDA_LAUNCH(copy_two_vectors,it,v1.toKernel(),v2.toKernel());
+			}
+#endif
+		}
+	};
+
 	template<bool is_ok_cuda,typename T, typename Memory,
 			 template<typename> class layout_base,
 			 typename grow_p>
@@ -288,6 +332,16 @@ namespace openfpm
 			return v_size;
 		}
 
+        /*! \brief Return the size of the vector
+        *
+        * \return the size
+        *
+        */
+        size_t size_local() const
+        {
+            return v_size;
+        }
+
 		/*! \brief return the maximum capacity of the vector before reallocation
 		 *
 		 * \return the capacity of the vector
@@ -1095,6 +1149,48 @@ namespace openfpm
 			v_size -= keys.size() - start;
 		}
 
+		/*! \brief Remove several entries from the vector
+		 *
+		 * \warning the keys in the vector MUST be sorted
+		 *
+		 * \param keys objects id to remove
+		 * \param start key starting point
+		 *
+		 */
+		void remove(openfpm::vector<aggregate<int>> & keys, size_t start = 0)
+		{
+			// Nothing to remove return
+			if (keys.size() <= start )
+				return;
+
+			size_t a_key = start;
+			size_t d_k = keys.template get<0>(a_key);
+			size_t s_k = keys.template get<0>(a_key) + 1;
+
+			// keys
+			while (s_k < size())
+			{
+				// s_k should always point to a key that is not going to be deleted
+				while (a_key+1 < keys.size() && s_k == keys.template get<0>(a_key+1))
+				{
+					a_key++;
+					s_k = keys.template get<0>(a_key) + 1;
+				}
+
+				// In case of overflow
+				if (s_k >= size())
+					break;
+
+				set(d_k,get(s_k));
+				d_k++;
+				s_k++;
+			}
+
+			// re-calculate the vector size
+
+			v_size -= keys.size() - start;
+		}
+
 		/*! \brief Get an element of the vector
 		 *
 		 * Get an element of the vector
@@ -1118,23 +1214,14 @@ namespace openfpm
 			return base.template get<p>(key);
 		}
 
-		/*! \brief Get an element of the vector
+		/*! \brief Indicate that this class is not a subset
 		 *
-		 * Get an element of the vector
-		 *
-		 * \param id Element to get
-		 *
-		 * \return the element (encapsulated)
+		 * \return false
 		 *
 		 */
-		inline auto get(size_t id) -> decltype(base.get_o(grid_key_dx<1>(id)))
+		bool isSubset() const
 		{
-#if defined(SE_CLASS1) && !defined(__NVCC__)
-			check_overflow(id);
-#endif
-			grid_key_dx<1> key(id);
-
-			return base.get_o(key);
+			return false;
 		}
 
 		/*! \brief Get an element of the vector
@@ -1146,7 +1233,7 @@ namespace openfpm
 		 * \return the element (encapsulated)
 		 *
 		 */
-		inline auto get(size_t id) const -> const decltype(base.get_o(grid_key_dx<1>(id)))
+		inline auto get(size_t id) -> decltype(base.get_o(grid_key_dx<1>(id)))
 		{
 #if defined(SE_CLASS1) && !defined(__NVCC__)
 			check_overflow(id);
@@ -1225,6 +1312,39 @@ namespace openfpm
 			return base.get_o(key);
 		}
 
+		/*! \brief Get an element of the vector
+		 *
+		 * Get an element of the vector
+		 *
+		 * \tparam p Property to get
+		 * \param id Element to get
+		 *
+		 * \return the element value requested
+		 *
+		 */
+
+		template <unsigned int p,typename KeyType>
+		inline auto getProp(const KeyType & id) -> decltype(base.template get<p>(grid_key_dx<1>(0)))
+		{
+			return this->template get<p>(id.getKey());
+		}
+
+		/*! \brief Get an element of the vector
+		 *
+		 * Get an element of the vector
+		 *
+		 * \tparam p Property to get
+		 * \param id Element to get
+		 *
+		 * \return the element value requested
+		 *
+		 */
+		template <unsigned int p, typename keyType>
+		inline auto getProp(const keyType & id) const -> decltype(base.template get<p>(grid_key_dx<1>(0)))
+		{
+			return this->template get<p>(id.getKey());
+		}
+
 		/*! \brief Get an element of the vector
 		 *
 		 * Get an element of the vector
@@ -1247,6 +1367,28 @@ namespace openfpm
 			return base.template get<p>(key);
 		}
 
+		/*! \brief Get an element of the vector
+		 *
+		 * Get an element of the vector
+		 *
+		 * \param id Element to get
+		 *
+		 * \return the element (encapsulated)
+		 *
+		 */
+		inline auto get(size_t id) const -> const decltype(base.get_o(grid_key_dx<1>(id)))
+		{
+#ifdef SE_CLASS2
+			check_valid(this,8);
+#endif
+#if defined(SE_CLASS1) && !defined(__NVCC__)
+			check_overflow(id);
+#endif
+			grid_key_dx<1> key(id);
+
+			return base.get_o(key);
+		}
+
 		/*! \brief Get the last element of the vector
 		 *
 		 * \return the element (encapsulated)
@@ -1278,18 +1420,7 @@ namespace openfpm
 			dup.v_size = v_size;
 			dup.base.swap(base.duplicate());
 
-			// copy the device part
-			// and device
-			if (Memory::isDeviceHostSame() == false)
-			{
-#ifdef __NVCC__
-				if (dup.size() != 0)
-				{
-					auto it = dup.getGPUIterator();
-					CUDA_LAUNCH(copy_two_vectors,it,dup.toKernel(),toKernel());
-				}
-#endif
-			}
+			copy_two_vectors_activate_impl<Memory::isDeviceHostSame() == false>::copy(dup,*this);
 
 			return dup;
 		}
@@ -1396,6 +1527,12 @@ namespace openfpm
 			base.set(id,v.base,src);
 		}
 
+		template<typename key_type>
+		key_type getOriginKey(key_type vec_key)
+		{
+			return vec_key;
+		}
+
 
 		/*! \brief Assignment operator
 		 *
@@ -1436,18 +1573,7 @@ namespace openfpm
 				base.set(key,mv.base,key);
 			}
 
-			// and device
-			if (Memory::isDeviceHostSame() == false)
-			{
-#ifdef __NVCC__
-				if (mv.size() != 0)
-				{
-					auto it = mv.getGPUIterator();
-					CUDA_LAUNCH(copy_two_vectors,it,toKernel(),mv.toKernel());
-				}
-#endif
-			}
-
+			copy_two_vectors_activate_impl<Memory::isDeviceHostSame() == false>::copy2(*this,mv);
 
 			return *this;
 		}
@@ -1491,17 +1617,7 @@ namespace openfpm
 				base.set(key,mv.getInternal_base(),key);
 			}
 
-			// and device
-			if (Memory::isDeviceHostSame() == false && Mem::isDeviceHostSame() == false)
-			{
-#ifdef __NVCC__
-				if (mv.size() != 0)
-				{
-					auto it = mv.getGPUIterator();
-					CUDA_LAUNCH(copy_two_vectors,it,toKernel(),mv.toKernel());
-				}
-#endif
-			}
+			copy_two_vectors_activate_impl<Memory::isDeviceHostSame() == false && Mem::isDeviceHostSame() == false>::copy2(*this,mv);
 
 			return *this;
 		}
@@ -1550,17 +1666,7 @@ namespace openfpm
 				base.set_general(key,mv.getInternal_base(),key);
 			}
 
-			// and device
-			if (Memory::isDeviceHostSame() == false && Mem::isDeviceHostSame() == false)
-			{
-#ifdef __NVCC__
-				if (mv.size() != 0)
-				{
-					auto it = mv.getGPUIterator();
-					CUDA_LAUNCH(copy_two_vectors,it,toKernel(),mv.toKernel());
-				}
-#endif
-			}
+			copy_two_vectors_activate_impl<Memory::isDeviceHostSame() == false && Mem::isDeviceHostSame() == false>::copy2(*this,mv);
 
 			return *this;
 		}
@@ -1680,7 +1786,7 @@ namespace openfpm
 		 *
 		 *
 		 */
-		ite_gpu<1> getGPUIteratorTo(long int stop, size_t n_thr = 1024) const
+		ite_gpu<1> getGPUIteratorTo(long int stop, size_t n_thr = default_kernel_wg_threads_) const
 		{
 			grid_key_dx<1,long int> start(0);
 			grid_key_dx<1,long int> stop_(stop);
@@ -1690,6 +1796,21 @@ namespace openfpm
 
 #endif
 
+		/*! \brief Get the vector elements iterator
+		 *
+		 *
+		 * \return an iterator to iterate through all the elements of the vector
+		 *
+		 */
+
+		vector_key_iterator getDomainIterator() const
+		{
+#ifdef SE_CLASS2
+			check_valid(this,8);
+#endif
+			return getIterator();
+		}
+
 		/*! \brief Get the vector elements iterator
 		 *
 		 *
@@ -1702,13 +1823,28 @@ namespace openfpm
 			return vector_key_iterator(v_size);
 		}
 
+        /*! \brief Get the vector elements iterator
+ *
+ *
+ * \return an iterator to iterate through all the elements of the vector
+ *
+ */
+        template<unsigned int p>
+        vector_key_iterator_ele<p,self_type> getIteratorElements() const
+        {
+#ifdef SE_CLASS2
+            check_valid(this,8);
+#endif
+            return vector_key_iterator_ele<p,self_type>(*this,size());
+        }
+
 #ifdef CUDA_GPU
 
 		/*! \brief Get an iterator for the GPU
 		 *
 		 *
 		 */
-		ite_gpu<1> getGPUIterator(size_t n_thr = 1024) const
+		ite_gpu<1> getGPUIterator(size_t n_thr = default_kernel_wg_threads_) const
 		{
 			grid_key_dx<1> start(0);
 			grid_key_dx<1> stop(size()-1);
@@ -1987,6 +2123,7 @@ namespace openfpm
 
 	template <typename T> using vector_std = vector<T, HeapMemory, memory_traits_lin, openfpm::grow_policy_double, STD_VECTOR>;
 	template<typename T> using vector_gpu = openfpm::vector<T,CudaMemory,memory_traits_inte>;
+	template<typename T> using vector_gpu_lin = openfpm::vector<T,CudaMemory,memory_traits_lin>;
 	template<typename T> using vector_gpu_single = openfpm::vector<T,CudaMemory,memory_traits_inte,openfpm::grow_policy_identity>;
 	template<typename T> using vector_custd = vector<T, CudaMemory, memory_traits_inte, openfpm::grow_policy_double, STD_VECTOR>;
 }
diff --git a/src/Vector/map_vector_sparse.hpp b/src/Vector/map_vector_sparse.hpp
index e34e8ea14b00ae7761f8e8d51fba4c7780badc28..95c667d93f179396b816a777d74bccf98a792b1d 100644
--- a/src/Vector/map_vector_sparse.hpp
+++ b/src/Vector/map_vector_sparse.hpp
@@ -17,7 +17,7 @@
 #include <limits>
 
 #if defined(__NVCC__)
-  #if !defined(CUDA_ON_CPU)
+  #if !defined(CUDA_ON_CPU) && !defined(__HIP__)
 	#include "util/cuda/moderngpu/kernel_segreduce.hxx"
 	#include "util/cuda/moderngpu/kernel_merge.hxx"
   #endif
@@ -26,6 +26,8 @@
 
 #include "util/cuda/scan_ofp.cuh"
 #include "util/cuda/sort_ofp.cuh"
+#include "util/cuda/segreduce_ofp.cuh"
+#include "util/cuda/merge_ofp.cuh"
 
 enum flush_type
 {
@@ -151,7 +153,7 @@ namespace openfpm
 
             assert((std::is_same<seg_type,int>::value == true));
 
-            mgpu::segreduce(
+            openfpm::segreduce(
                     (red_type *)vector_data.template getDeviceBuffer<reduction_type::prop::value>(), vector_data.size(),
                     (int *)segment_offset.template getDeviceBuffer<1>(), segment_offset.size(),
                     (red_type *)vector_data_red.template getDeviceBuffer<reduction_type::prop::value>(),
@@ -1108,7 +1110,7 @@ namespace openfpm
 			// we merge with vct_index with vct_add_index_unique in vct_merge_index, vct_merge_index contain the merged indexes
 			//
 
-			mgpu::merge((Ti *)vct_index.template getDeviceBuffer<0>(),(Ti *)vct_m_index.template getDeviceBuffer<0>(),vct_index.size(),
+			openfpm::merge((Ti *)vct_index.template getDeviceBuffer<0>(),(Ti *)vct_m_index.template getDeviceBuffer<0>(),vct_index.size(),
 						(Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),(Ti *)vct_index_tmp4.template getDeviceBuffer<0>(),vct_add_index_unique.size(),
 						(Ti *)vct_merge_index.template getDeviceBuffer<0>(),(Ti *)vct_merge_index_map.template getDeviceBuffer<0>(),mgpu::less_t<Ti>(),context);
 
@@ -1325,7 +1327,7 @@ namespace openfpm
 
 			// we merge with vct_index with vct_add_index_unique in vct_index_tmp, vct_intex_tmp2 contain the merging index
 			//
-			mgpu::merge((Ti *)vct_index.template getDeviceBuffer<0>(),(Ti *)vct_m_index.template getDeviceBuffer<0>(),vct_index.size(),
+			openfpm::merge((Ti *)vct_index.template getDeviceBuffer<0>(),(Ti *)vct_m_index.template getDeviceBuffer<0>(),vct_index.size(),
 						(Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),(Ti *)vct_add_index_unique.template getDeviceBuffer<1>(),vct_add_index_unique.size(),
 						(Ti *)vct_index_tmp.template getDeviceBuffer<0>(),(Ti *)vct_index_tmp2.template getDeviceBuffer<0>(),mgpu::less_t<Ti>(),context);
 
@@ -1672,8 +1674,9 @@ namespace openfpm
 		 *
 		 */
 		template <unsigned int p>
-		auto insertFlush(Ti ele) -> decltype(vct_data.template get<p>(0))
+		auto insertFlush(Ti ele, bool & is_new) -> decltype(vct_data.template get<p>(0))
 		{
+			is_new = false;
 			size_t di;
 
 			// first we have to search if the block exist
@@ -1684,7 +1687,8 @@ namespace openfpm
 				// block exist
 				return vct_data.template get<p>(di);
 			}
-
+			is_new = true;
+			
 			// It does not exist, we create it di contain the index where we have to create the new block
 			vct_index.insert(di);
+			vct_data.insert(di);
@@ -1697,8 +1701,9 @@ namespace openfpm
 		 * \param ele element id
 		 *
 		 */
-		auto insertFlush(Ti ele) -> decltype(vct_data.get(0))
+		auto insertFlush(Ti ele, bool & is_new) -> decltype(vct_data.get(0))
 		{
+			is_new = false;
 			Ti di;
 
 			// first we have to search if the block exist
@@ -1713,6 +1718,7 @@ namespace openfpm
 			// It does not exist, we create it di contain the index where we have to create the new block
 			vct_index.insert(di);
 			vct_data.insert(di);
+			is_new = true;
 
 			vct_index.template get<0>(di) = ele;
 
diff --git a/src/Vector/performance/vector_performance_test.cu b/src/Vector/performance/vector_performance_test.cu
new file mode 100644
index 0000000000000000000000000000000000000000..26803ccac979981524be937b40c08ce60ef10e33
--- /dev/null
+++ b/src/Vector/performance/vector_performance_test.cu
@@ -0,0 +1,592 @@
+#define BOOST_TEST_DYN_LINK
+#include <boost/test/unit_test.hpp>
+#include "Plot/GoogleChart.hpp"
+#include "timer.hpp"
+#include <boost/property_tree/ptree.hpp>
+#include <boost/property_tree/xml_parser.hpp>
+#include "util/performance/performance_util.hpp"
+#include "Point_test.hpp"
+#include "util/stat/common_statistics.hpp"
+
+extern const char * test_dir;
+
+typedef Point_test<float> P;
+
+constexpr int N_STAT = 32;
+
+BOOST_AUTO_TEST_SUITE( performance )
+
+#define NADD 128*128*128
+#define NADD_GPU 256*256*256
+
+// Property tree
+struct report_vector_func_tests
+{
+	boost::property_tree::ptree graphs;
+};
+
+report_vector_func_tests report_vector_funcs;
+
+BOOST_AUTO_TEST_SUITE( vector_performance )
+
+BOOST_AUTO_TEST_CASE(vector_performance)
+{
+	report_vector_funcs.graphs.put("performance.vector(0).funcs.nele",NADD);
+	report_vector_funcs.graphs.put("performance.vector(0).funcs.name","add");
+
+	report_vector_funcs.graphs.put("performance.vector(1).funcs.nele",NADD);
+	report_vector_funcs.graphs.put("performance.vector(1).funcs.name","get");
+
+	std::vector<double> times(N_STAT + 1);
+	std::vector<double> times_g(N_STAT + 1);
+
+	// get test
+	double tot_accu = 0.0;
+
+	for (size_t i = 0 ; i < N_STAT+1 ; i++)
+	{
+		timer t;
+		t.start();
+
+		// create a vector
+		openfpm::vector<Point_test<float>> v1;
+
+		// Point
+		Point_test<float> p;
+		p.setx(1.0);
+		p.sety(2.0);
+		p.setz(3.0);
+		p.sets(4.0);
+
+		p.get<P::v>()[0] = 1.0;
+		p.get<P::v>()[1] = 2.0;
+		p.get<P::v>()[2] = 7.0;
+
+		p.get<P::t>()[0][0] = 10.0;
+		p.get<P::t>()[0][1] = 13.0;
+		p.get<P::t>()[0][2] = 8.0;
+		p.get<P::t>()[1][0] = 19.0;
+		p.get<P::t>()[1][1] = 23.0;
+		p.get<P::t>()[1][2] = 5.0;
+		p.get<P::t>()[2][0] = 4.0;
+		p.get<P::t>()[2][1] = 3.0;
+		p.get<P::t>()[2][2] = 11.0;
+
+		// Add test
+
+		for (size_t j = 0 ; j < NADD ; j++)
+		{
+			v1.add(p);
+		}
+
+		t.stop();
+		times[i] = t.getwct();
+
+		timer tg;
+		tg.start();
+
+		for (size_t j = 0 ; j < NADD ; j++)
+		{
+			double accu1 = v1.template get<P::x>(j);
+			double accu2 = v1.template get<P::y>(j);
+			double accu3 = v1.template get<P::z>(j);
+			double accu4 = v1.template get<P::s>(j);
+
+			double accu5 = v1.template get<P::v>(j)[0];
+			double accu6 = v1.template get<P::v>(j)[1];
+			double accu7 = v1.template get<P::v>(j)[2];
+
+			double accu8 = v1.template get<P::t>(j)[0][0];
+			double accu9 = v1.template get<P::t>(j)[0][1];
+			double accu10 = v1.template get<P::t>(j)[0][2];
+			double accu11 = v1.template get<P::t>(j)[1][0];
+			double accu12 = v1.template get<P::t>(j)[1][1];
+			double accu13 = v1.template get<P::t>(j)[1][2];
+			double accu14 = v1.template get<P::t>(j)[2][0];
+			double accu15 = v1.template get<P::t>(j)[2][1];
+			double accu16 = v1.template get<P::t>(j)[2][2];
+
+			tot_accu += accu1 + accu2 + accu3 + accu4 + accu5 + accu6 + accu7 + accu8 + accu9 + accu10 + accu11 + accu12 +
+					   accu13 + accu14 + accu15 + accu16;
+		}
+
+		tg.stop();
+
+		times_g[i] = tg.getwct();
+	}
+
+	double mean;
+	double dev;
+	standard_deviation(times,mean,dev);
+
+	report_vector_funcs.graphs.put("performance.vector(0).y.data.mean",mean);
+	report_vector_funcs.graphs.put("performance.vector(0).y.data.dev",dev);
+
+	standard_deviation(times_g,mean,dev);
+
+	report_vector_funcs.graphs.put("performance.vector(1).y.data.mean",mean);
+	report_vector_funcs.graphs.put("performance.vector(1).y.data.dev",dev);
+}
+
+template<typename vector_prop_type, typename vector_pos_type>
+__device__ __host__ void read_write(vector_prop_type & vd_prop, vector_pos_type & vd_pos, unsigned int p)
+{
+	vd_prop.template get<0>(p) = vd_pos.template get<0>(p)[0] + vd_pos.template get<0>(p)[1];
+
+	vd_prop.template get<1>(p)[0] = vd_pos.template get<0>(p)[0];
+	vd_prop.template get<1>(p)[1] = vd_pos.template get<0>(p)[1];
+
+	vd_prop.template get<2>(p)[0][0] = vd_pos.template get<0>(p)[0];
+	vd_prop.template get<2>(p)[0][1] = vd_pos.template get<0>(p)[1];
+	vd_prop.template get<2>(p)[1][0] = vd_pos.template get<0>(p)[0] + 
+                                      	   vd_pos.template get<0>(p)[1];
+	vd_prop.template get<2>(p)[1][1] = vd_pos.template get<0>(p)[1] - 
+                                      	   vd_pos.template get<0>(p)[0];
+
+	vd_pos.template get<0>(p)[0] += 0.01f;
+	vd_pos.template get<0>(p)[1] += 0.01f;
+}
+
+template<typename vector_type1, typename vector_type2>
+__global__ void  read_write_ker(vector_type1 v1, vector_type2 v2)
+{
+    unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
+
+    read_write(v1,v2,p);
+}
+
+
+struct ele
+{
+	double s;
+	double v[2];
+	double t[2][2];
+};
+
+__device__ __host__ void read_write_lin(double * pos, ele * prp, unsigned int p)
+{
+    prp[p].s = pos[2*p] + pos[2*p+1];
+
+    prp[p].v[0] = pos[2*p];
+    prp[p].v[1] = pos[2*p+1];
+
+    prp[p].t[0][0] = pos[2*p];
+    prp[p].t[0][1] = pos[2*p+1];
+    prp[p].t[1][0] = pos[2*p] + pos[2*p+1];
+    prp[p].t[1][1] = pos[2*p+1] - pos[2*p];
+
+    pos[2*p] += 0.01f;
+    pos[2*p+1] += 0.01f;
+}
+
+
+__global__ void  read_write_lin_ker(double * pos, ele * prp)
+{
+    unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
+
+    read_write_lin(pos,prp,p);
+}
+
+__device__ __host__ void read_write_inte(double * pos, double * prp0, double * prp1, double * prp2, unsigned int p, unsigned int n_pos)
+{
+    prp0[0*n_pos + p] = pos[0*n_pos + p] + pos[1*n_pos+p];
+
+    prp1[0*n_pos + p] = pos[0*n_pos + p];
+    prp1[1*n_pos + p] = pos[1*n_pos + p];
+
+    prp2[0*n_pos*2+0*n_pos + p] = pos[0*n_pos + p];
+    prp2[0*n_pos*2+1*n_pos + p] = pos[1*n_pos + p];
+    prp2[1*n_pos*2+0*n_pos + p] = pos[0*n_pos + p] + 
+                                  pos[1*n_pos + p];
+    prp2[1*n_pos*2+1*n_pos + p] = pos[1*n_pos + p] - 
+                                  pos[0*n_pos + p];
+
+    pos[0*n_pos + p] += 0.01f;
+    pos[1*n_pos + p] += 0.01f;
+}
+
+__global__ void  read_write_inte_ker(double * pos, double * prp0, double * prp1, double * prp2, unsigned int n_pos)
+{
+    unsigned int p = blockIdx.x * blockDim.x + threadIdx.x;
+
+    read_write_inte(pos,prp0,prp1,prp2,p,n_pos);
+}
+
+BOOST_AUTO_TEST_CASE(vector_performance_layout_vs_plain_array)
+{
+    std::vector<double> times(N_STAT + 1);
+    std::vector<double> times_g(N_STAT + 1);
+
+    std::vector<double> times2(N_STAT + 1);
+    std::vector<double> times2_g(N_STAT + 1);
+
+    report_vector_funcs.graphs.put("performance.vector_layout(0).funcs.nele",NADD);
+    report_vector_funcs.graphs.put("performance.vector_layout(0).funcs.name","read_write_lin");
+
+    for (size_t i = 0 ; i < N_STAT+1 ; i++)
+    {
+        // create a vector
+        openfpm::vector<aggregate<double,double[2],double[2][2]>> v1;
+        openfpm::vector<aggregate<double[2]>> v2;
+
+        // Point
+        aggregate<double[2]> p;
+        p.get<0>()[0] = 1.0;
+        p.get<0>()[1] = 2.0;
+
+        aggregate<double,double[2],double[2][2]> pa;
+        pa.get<0>() = 1.0;
+
+        pa.get<1>()[0] = 1.0;
+        pa.get<1>()[1] = 1.0;
+
+        pa.get<2>()[0][0] = 1.0;
+        pa.get<2>()[0][1] = 1.0;
+        pa.get<2>()[1][0] = 1.0;
+        pa.get<2>()[1][1] = 1.0;
+
+        // Add test
+
+        for (size_t j = 0 ; j < NADD ; j++)
+        {
+            v1.add(pa);
+            v2.add(p);
+        }
+
+        timer tg;
+        tg.start();
+
+        for (size_t j = 0 ; j < NADD ; j++)
+        {
+            read_write(v1,v2,j);
+        }
+
+        tg.stop();
+
+        times_g[i] = tg.getwct();
+
+        timer tga;
+        tga.start();
+
+        double * prp = (double *)v1.getPointer<0>();
+        double * pos = (double *)v2.getPointer<0>();
+
+        for (size_t j = 0 ; j < NADD ; j++)
+        {
+            read_write_lin(pos,(struct ele *)prp,j);
+        }
+
+        tga.stop();
+
+        times[i] = tga.getwct();
+    }
+
+    double mean;
+    double dev;
+    standard_deviation(times_g,mean,dev);
+
+    double mean_;
+    double dev_;
+    standard_deviation(times,mean_,dev_);
+
+    report_vector_funcs.graphs.put("performance.vector_layout(0).y.data.mean",mean_/mean);
+
+    // Deviation of x/y = x/y^2 dy + 1/y dx
+
+    report_vector_funcs.graphs.put("performance.vector_layout(0).y.data.dev",mean_/(mean*mean)*dev + dev_ / mean );
+
+    report_vector_funcs.graphs.put("performance.vector_layout(1).funcs.nele",NADD);
+    report_vector_funcs.graphs.put("performance.vector_layout(1).funcs.name","read_write_inte");
+
+    for (size_t i = 0 ; i < N_STAT+1 ; i++)
+    {
+        // create a vector
+        openfpm::vector<aggregate<double,double[2],double[2][2]>,HeapMemory,memory_traits_inte> v1;
+        openfpm::vector<aggregate<double[2]>,HeapMemory,memory_traits_inte> v2;
+
+        // Point
+        aggregate<double[2]> p;
+        p.get<0>()[0] = 1.0;
+        p.get<0>()[1] = 2.0;
+
+        aggregate<double,double[2],double[2][2]> pa;
+        pa.get<0>() = 1.0;
+
+        pa.get<1>()[0] = 1.0;
+        pa.get<1>()[1] = 1.0;
+
+        pa.get<2>()[0][0] = 1.0;
+        pa.get<2>()[0][1] = 1.0;
+        pa.get<2>()[1][0] = 1.0;
+        pa.get<2>()[1][1] = 1.0;
+
+        // Add test
+
+        for (size_t j = 0 ; j < NADD ; j++)
+        {
+            v1.add(pa);
+            v2.add(p);
+        }
+
+        timer tg;
+        tg.start();
+
+        for (size_t j = 0 ; j < NADD ; j++)
+        {
+            read_write(v1,v2,j);
+        }
+
+        tg.stop();
+
+        times2_g[i] = tg.getwct();
+        int sz = v1.size();
+
+        timer tga;
+        tga.start();
+
+        double * prp0 = (double *)v1.getPointer<0>();
+        double * prp1 = (double *)v1.getPointer<1>();
+        double * prp2 = (double *)v1.getPointer<2>();
+
+        double * pos = (double *)v2.getPointer<0>();
+
+        for (size_t j = 0 ; j < NADD ; j++)
+        {
+            read_write_inte(pos,prp0,prp1,prp2,j,sz);
+        }
+
+        tga.stop();
+
+        times2[i] = tga.getwct();
+    }
+
+    double mean2;
+    double dev2;
+    standard_deviation(times2_g,mean2,dev2);
+
+    double mean2_;
+    double dev2_;
+    standard_deviation(times2,mean2_,dev2_);
+
+    report_vector_funcs.graphs.put("performance.vector_layout(1).y.data.mean",mean2_/mean2);
+
+    // Deviation of x/y = x/y^2 dy + 1/y dx
+
+    report_vector_funcs.graphs.put("performance.vector_layout(1).y.data.dev",mean2_/(mean2*mean2)*dev2 + dev2_ / mean2 );
+}
+
+BOOST_AUTO_TEST_CASE(vector_performance_gpu_layout_vs_plain_array)
+{
+    std::vector<double> times(N_STAT + 1);
+    std::vector<double> times_g(N_STAT + 1);
+
+    std::vector<double> times2(N_STAT + 1);
+    std::vector<double> times2_g(N_STAT + 1);
+
+    // get test
+    double tot_accu = 0.0;
+
+    report_vector_funcs.graphs.put("performance.vector_layout_gpu(0).funcs.nele",NADD_GPU);
+    report_vector_funcs.graphs.put("performance.vector_layout_gpu(0).funcs.name","read_write_lin");
+
+    for (size_t i = 0 ; i < N_STAT+1 ; i++)
+    {
+        // create a vector
+        openfpm::vector<aggregate<double,double[2],double[2][2]>,CudaMemory> v1;
+        openfpm::vector<aggregate<double[2]>,CudaMemory> v2;
+
+        // Point
+        aggregate<double[2]> p;
+        p.get<0>()[0] = 1.0;
+        p.get<0>()[1] = 2.0;
+
+        aggregate<double,double[2],double[2][2]> pa;
+        pa.get<0>() = 1.0;
+
+        pa.get<1>()[0] = 1.0;
+        pa.get<1>()[1] = 1.0;
+
+        pa.get<2>()[0][0] = 1.0;
+        pa.get<2>()[0][1] = 1.0;
+        pa.get<2>()[1][0] = 1.0;
+        pa.get<2>()[1][1] = 1.0;
+
+        // Add test
+
+        for (size_t j = 0 ; j < NADD_GPU ; j++)
+        {
+            v1.add(pa);
+            v2.add(p);
+        }
+
+        auto ite = v1.getGPUIterator(1536);
+
+        {
+
+        timer tga;
+        tga.startGPU();
+        CUDA_LAUNCH(read_write_ker,ite,v1.toKernel(),v2.toKernel());
+
+        tga.stopGPU();
+        times_g[i] = tga.getwctGPU();
+        }
+
+        std::cout << "OpenFPM: " << times_g[i] << std::endl;
+
+        timer tga2;
+        tga2.startGPU();
+
+        double * prp = (double *)v1.toKernel().getPointer<0>();
+        double * pos = (double *)v2.toKernel().getPointer<0>();
+
+        CUDA_LAUNCH(read_write_lin_ker,ite,pos,(struct ele *)prp);
+        
+        tga2.stopGPU();
+
+        times[i] = tga2.getwctGPU();
+        std::cout << "Array: " << times[i] << std::endl;
+    }
+
+    double mean;
+    double dev;
+    standard_deviation(times_g,mean,dev);
+
+    double mean_;
+    double dev_;
+    standard_deviation(times,mean_,dev_);
+
+    report_vector_funcs.graphs.put("performance.vector_layout_gpu(0).y.data.mean",mean_/mean);
+
+    // Deviation of x/y = x/y^2 dy + 1/y dx
+
+    report_vector_funcs.graphs.put("performance.vector_layout_gpu(0).y.data.dev",mean_/(mean*mean)*dev + dev_ / mean );
+
+    report_vector_funcs.graphs.put("performance.vector_layout_gpu(1).funcs.nele",NADD_GPU);
+    report_vector_funcs.graphs.put("performance.vector_layout_gpu(1).funcs.name","read_write_inte");
+
+    for (size_t i = 0 ; i < N_STAT+1 ; i++)
+    {
+        // create a vector
+        openfpm::vector<aggregate<double,double[2],double[2][2]>,CudaMemory,memory_traits_inte> v1;
+        openfpm::vector<aggregate<double[2]>,CudaMemory,memory_traits_inte> v2;
+
+        // Point
+        aggregate<double[2]> p;
+        p.get<0>()[0] = 1.0;
+        p.get<0>()[1] = 2.0;
+
+        aggregate<double,double[2],double[2][2]> pa;
+        pa.get<0>() = 1.0;
+
+        pa.get<1>()[0] = 1.0;
+        pa.get<1>()[1] = 1.0;
+
+        pa.get<2>()[0][0] = 1.0;
+        pa.get<2>()[0][1] = 1.0;
+        pa.get<2>()[1][0] = 1.0;
+        pa.get<2>()[1][1] = 1.0;
+
+        // Add test
+
+        for (size_t j = 0 ; j < NADD_GPU ; j++)
+        {
+            v1.add(pa);
+            v2.add(p);
+        }
+
+        timer tg;
+        tg.startGPU();
+
+        auto ite = v1.getGPUIterator(1536);
+
+        CUDA_LAUNCH(read_write_ker,ite,v1.toKernel(),v2.toKernel());
+
+        tg.stopGPU();
+
+        times2_g[i] = tg.getwctGPU();
+        std::cout << "OpenFPM inte: " << times2_g[i] << std::endl;
+
+        int sz = v1.size();
+
+        timer tga;
+        tga.startGPU();
+
+        double * prp0 = (double *)v1.toKernel().getPointer<0>();
+        double * prp1 = (double *)v1.toKernel().getPointer<1>();
+        double * prp2 = (double *)v1.toKernel().getPointer<2>();
+
+        double * pos = (double *)v2.toKernel().getPointer<0>();
+
+        CUDA_LAUNCH(read_write_inte_ker,ite,pos,prp0,prp1,prp2,sz);
+
+        tga.stopGPU();
+
+        times2[i] = tga.getwctGPU();
+
+        std::cout << "Array inte: " << times2[i] << std::endl;
+    }
+
+    double mean2;
+    double dev2;
+    standard_deviation(times2_g,mean2,dev2);
+
+    double mean2_;
+    double dev2_;
+    standard_deviation(times2,mean2_,dev2_);
+
+    report_vector_funcs.graphs.put("performance.vector_layout_gpu(1).y.data.mean",mean2_/mean2);
+
+    // Deviation of x/y = x/y^2 dy + 1/y dx
+
+    report_vector_funcs.graphs.put("performance.vector_layout_gpu(1).y.data.dev",mean2_/(mean2*mean2)*dev2 + dev2_ / mean2 );
+}
+
+BOOST_AUTO_TEST_CASE(vector_performance_write_report)
+{
+	// Create a graphs
+
+	report_vector_funcs.graphs.put("graphs.graph(0).type","line");
+	report_vector_funcs.graphs.add("graphs.graph(0).title","Vector add and get");
+	report_vector_funcs.graphs.add("graphs.graph(0).x.title","Tests");
+	report_vector_funcs.graphs.add("graphs.graph(0).y.title","Time seconds");
+	report_vector_funcs.graphs.add("graphs.graph(0).y.data(0).source","performance.vector(#).y.data.mean");
+	report_vector_funcs.graphs.add("graphs.graph(0).x.data(0).source","performance.vector(#).funcs.name");
+	report_vector_funcs.graphs.add("graphs.graph(0).y.data(0).title","Actual");
+	report_vector_funcs.graphs.add("graphs.graph(0).interpolation","lines");
+
+	report_vector_funcs.graphs.put("graphs.graph(1).type","line");
+	report_vector_funcs.graphs.add("graphs.graph(1).title","Vector read write");
+	report_vector_funcs.graphs.add("graphs.graph(1).x.title","Layout");
+	report_vector_funcs.graphs.add("graphs.graph(1).y.title","Time seconds");
+	report_vector_funcs.graphs.add("graphs.graph(1).y.data(0).source","performance.vector_layout(#).y.data.mean");
+	report_vector_funcs.graphs.add("graphs.graph(1).x.data(0).source","performance.vector_layout(#).funcs.name");
+	report_vector_funcs.graphs.add("graphs.graph(1).y.data(0).title","Actual");
+	report_vector_funcs.graphs.add("graphs.graph(1).interpolation","lines");
+
+	report_vector_funcs.graphs.put("graphs.graph(2).type","line");
+	report_vector_funcs.graphs.add("graphs.graph(2).title","Vector GPU read write");
+	report_vector_funcs.graphs.add("graphs.graph(2).x.title","Layout");
+	report_vector_funcs.graphs.add("graphs.graph(2).y.title","Time seconds");
+	report_vector_funcs.graphs.add("graphs.graph(2).y.data(0).source","performance.vector_layout_gpu(#).y.data.mean");
+	report_vector_funcs.graphs.add("graphs.graph(2).x.data(0).source","performance.vector_layout_gpu(#).funcs.name");
+	report_vector_funcs.graphs.add("graphs.graph(2).y.data(0).title","Actual");
+	report_vector_funcs.graphs.add("graphs.graph(2).interpolation","lines");
+
+	boost::property_tree::xml_writer_settings<std::string> settings(' ', 4);
+	boost::property_tree::write_xml("vector_performance_funcs.xml", report_vector_funcs.graphs,std::locale(),settings);
+
+	GoogleChart cg;
+
+	std::string file_xml_ref(test_dir);
+	file_xml_ref += std::string("/openfpm_data/vector_performance_funcs_ref.xml");
+
+	StandardXMLPerformanceGraph("vector_performance_funcs.xml",file_xml_ref,cg);
+
+	addUpdateTime(cg,1,"data","vector_performance_funcs");
+
+	cg.write("vector_performance_funcs.html");
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/Vector/performance/vector_performance_test.hpp b/src/Vector/performance/vector_performance_test.hpp
index 1cf7ad3520e65f14ddfb213c2fb84cdc466c6016..d1dfe49198d278cdbbebc9eff420119aec985db2 100644
--- a/src/Vector/performance/vector_performance_test.hpp
+++ b/src/Vector/performance/vector_performance_test.hpp
@@ -392,7 +392,7 @@ BOOST_AUTO_TEST_CASE(vector_performance_write_report)
 
 	StandardXMLPerformanceGraph("vector_performance_funcs.xml",file_xml_ref,cg);
 
-	addUpdtateTime(cg,1);
+	addUpdtateTime(cg,1,"data","vector_performance_funcs");
 
 	cg.write("vector_performance_funcs.html");
 }
diff --git a/src/Vector/vector_map_iterator.hpp b/src/Vector/vector_map_iterator.hpp
index 65b419d91825bda5cd295306c7fdcdf82730d436..78bb4958fb207051c657f5c2e28cc06ef5d2ccee 100644
--- a/src/Vector/vector_map_iterator.hpp
+++ b/src/Vector/vector_map_iterator.hpp
@@ -11,6 +11,97 @@
 namespace openfpm
 {
 	/*! \brief Vector iterator
+ *
+ */
+	template<unsigned int p, typename vector_type>
+	class vector_key_iterator_ele
+	{
+		//! Linearized end element
+		const vector_type & vect;
+
+		size_t end;
+
+	protected:
+
+		//! Actual key
+		size_t gk;
+
+	public:
+
+		/*! \brief Constructor require the size of the vector
+		 *
+		 * \param end point
+		 * \param start starting point
+		 *
+		 */
+		vector_key_iterator_ele(const vector_type & vect, size_t end, size_t start = 0)
+				: vect(vect),end(end),gk(start)
+		{}
+
+
+		/*! \brief Get the next element
+		 *
+		 * Get the next element
+		 *
+		 * \return the next grid_key
+		 *
+		 */
+		vector_key_iterator_ele & operator++()
+		{
+			//! increment the first index
+
+			gk++;
+
+			return *this;
+		}
+
+		/*! \brief Set the dimension
+		 *
+		 * \param d is the dimension (IGNORED is by default 0)
+		 * \param sz set the counter to sz
+		 *
+		 */
+		void set(int d, size_t sz)
+		{
+			// set the counter dim to sz
+
+			gk = sz;
+		}
+
+		/*! \brief Check if there is the next element
+		 *
+		 * Check if there is the next element
+		 *
+		 * \return true if there is the next, false otherwise
+		 *
+		 */
+		bool isNext() const
+		{
+			if (gk < end)
+			{
+				//! we did not reach the end of the grid
+
+				return true;
+			}
+
+			//! we reach the end of the grid
+			return false;
+		}
+
+		/*! \brief Get the actual key
+		 *
+		 * Get the actual key
+		 *
+		 * \return the actual key
+		 *
+		 */
+		auto get() const -> decltype(vect.template get<p>(gk))
+		{
+			return vect.template get<p>(gk);
+		}
+	};
+
+	/*! \brief Vector iterator
 	 *
 	 */
 
diff --git a/src/Vector/vector_test_util.hpp b/src/Vector/vector_test_util.hpp
index 5a69757d1c2180ca377b7bc88d09f5cacbccf92e..6641954d64d9ad5c2324b39f71adb285d18b1308 100644
--- a/src/Vector/vector_test_util.hpp
+++ b/src/Vector/vector_test_util.hpp
@@ -27,7 +27,7 @@ typedef Point_test<float> P;
 
 #include "timer.hpp"
 
-std::vector<Point_orig<float>> allocate_stl()
+static std::vector<Point_orig<float>> allocate_stl()
 {
 	std::vector<Point_orig<float>> v_stl_test;
 
@@ -76,7 +76,7 @@ openfpm::vector<T> allocate_openfpm_primitive(size_t n, size_t fill)
 	return v;
 }
 
-openfpm::vector<Point_test<float>> allocate_openfpm_fill(size_t n, size_t fill)
+static openfpm::vector<Point_test<float>> allocate_openfpm_fill(size_t n, size_t fill)
 {
 	Point_test<float> pt;
 	openfpm::vector<Point_test<float>> v_send;
@@ -175,7 +175,7 @@ template<typename vector> vector allocate_openfpm(size_t n_ele)
 	return v_ofp_test;
 }
 
-openfpm::vector<Point_test_prp<float>> allocate_openfpm_prp(size_t n_ele)
+static openfpm::vector<Point_test_prp<float>> allocate_openfpm_prp(size_t n_ele)
 {
 	openfpm::vector<Point_test_prp<float>> v_ofp_test;
 
@@ -242,7 +242,7 @@ openfpm::vector<Point_test_prp<float>> allocate_openfpm_prp(size_t n_ele)
 }
 
 
-openfpm::vector< aggregate<float,float,float,float,float[3],float[3][3],openfpm::vector<int>> > allocate_openfpm_aggregate_with_complex(size_t n_ele)
+static openfpm::vector< aggregate<float,float,float,float,float[3],float[3][3],openfpm::vector<int>> > allocate_openfpm_aggregate_with_complex(size_t n_ele)
 {
 	//! [Create add and access]
 	openfpm::vector< aggregate<float,float,float,float,float[3],float[3][3],openfpm::vector<int>> > v_ofp_test;
diff --git a/src/Vector/vector_unit_tests.hpp b/src/Vector/vector_unit_tests.hpp
index d13f6ac7c0cc08b905baf9d363eae1333fce5ed7..27ba66962c58d8dc5af30007b5a35f94cb7c0860 100644
--- a/src/Vector/vector_unit_tests.hpp
+++ b/src/Vector/vector_unit_tests.hpp
@@ -156,6 +156,80 @@ template <typename vector> void test_vector_remove()
 	}
 }
 
+
+template <typename vector> void test_vector_remove_aggregate()
+{
+	typedef Point_test<float> p;
+
+	//! [Create push and multiple remove]
+
+	vector v1;
+
+	for (size_t i = 0 ; i < V_REM_PUSH ; i++)
+	{
+		// Point
+		Point_test<float> p;
+		p.setx(i);
+
+		v1.add(p);
+	}
+
+	{
+	openfpm::vector<aggregate<int>> rem;
+	rem.add();
+	rem.last().get<0>() = 0;
+	rem.add();
+	rem.last().get<0>() = 1;
+	rem.add();
+	rem.last().get<0>() = 2;
+	rem.add();
+	rem.last().get<0>() = 3;
+
+	v1.remove(rem);
+	}
+
+	//! [Create push and multiple remove]
+
+	BOOST_REQUIRE_EQUAL(v1.size(),1020ul);
+	BOOST_REQUIRE_EQUAL(v1.template get<p::x>(0),4);
+
+	{
+	openfpm::vector<aggregate<int>> rem;
+	rem.add();
+	rem.last().get<0>() = v1.size()-4;
+	rem.add();
+	rem.last().get<0>() = v1.size()-3;
+	rem.add();
+	rem.last().get<0>() = v1.size()-2;
+	rem.add();
+	rem.last().get<0>() = v1.size()-1;
+
+	v1.remove(rem);
+	}
+
+	BOOST_REQUIRE_EQUAL(v1.size(),1016ul);
+	BOOST_REQUIRE_EQUAL(v1.template get<p::x>(v1.size()-1),1019);
+
+	{
+	openfpm::vector<aggregate<int>> rem;
+	for (size_t i = 0 ; i < (V_REM_PUSH - 8) / 2 ; i++)
+	{
+		rem.add();
+		rem.last().get<0>() = i * 2;
+	}
+	// remove all the even number
+	v1.remove(rem);
+	}
+
+	BOOST_REQUIRE_EQUAL(v1.size(),508ul);
+
+	// Check only odd
+	for (size_t i = 0 ; i < v1.size() ; i++)
+	{
+		BOOST_REQUIRE_EQUAL((size_t)v1.template get<p::x>(i) % 2, 1ul);
+	}
+}
+
 template <typename vector> void test_vector_insert()
 {
 	typedef Point_test<float> p;
@@ -446,6 +520,12 @@ BOOST_AUTO_TEST_CASE(vector_remove )
 	test_vector_remove< openfpm::vector<Point_test<float>,HeapMemory, memory_traits_inte> >();
 }
 
+BOOST_AUTO_TEST_CASE(vector_remove_aggregate )
+{
+	test_vector_remove_aggregate<openfpm::vector<Point_test<float>>>();
+	test_vector_remove_aggregate< openfpm::vector<Point_test<float>,HeapMemory, memory_traits_inte> >();
+}
+
 BOOST_AUTO_TEST_CASE(vector_insert )
 {
 	test_vector_insert<openfpm::vector<Point_test<float>>>();
diff --git a/src/config/config_cmake.h.in b/src/config/config_cmake.h.in
index 01c196978f3feb7cb93fd2bc3d0aec46446a3162..b15e33f40c557f801cc251b997c2876e21f71e08 100644
--- a/src/config/config_cmake.h.in
+++ b/src/config/config_cmake.h.in
@@ -4,6 +4,18 @@ ${DEFINE_COVERTY_SCAN}
 /* GPU support */
 ${DEFINE_CUDA_GPU}
 
+/* Define CUDIFY backend */
+${DEFINE_CUDIFY_BACKEND}
+
+/* OpenMP support */
+${DEFINE_HAVE_OPENMP}
+
+/* HIP GPU support */
+${DEFINE_HIP_GPU}
+
+/* HIP Cudify GPU support */
+${DEFINE_CUDIFY_USE_HIP}
+
 /* Debug */
 ${DEFINE_DEBUG} /**/
 
@@ -174,6 +186,9 @@ ${DEFINE_STDC_HEADERS}
 /* If an error occur stop the program */
 ${DEFINE_STOP_ON_ERROR}
 
+/* Garbage injector*/
+${DEFINE_GARBAGE_INJECTOR}
+
 /* Test coverage mode */
 ${DEFINE_TEST_COVERAGE_MODE}
 
@@ -183,4 +198,3 @@ ${DEFINE_TEST_COVERAGE_MODE}
 /* Version number of package */
 #define VERSION "1.0.0"
 
-#define OPENFPM_PDATA
diff --git a/src/data_type/aggregate.hpp b/src/data_type/aggregate.hpp
index 3a80ee31b51a5760ff7217d989ab66aec0478560..e8b61fdb8708bf07db74625dea02028ab78379a2 100644
--- a/src/data_type/aggregate.hpp
+++ b/src/data_type/aggregate.hpp
@@ -304,4 +304,15 @@ namespace openfpm
 	}
 }
 
+template<typename BlockT, typename T>
+struct AggregateAppend
+{
+};
+
+template<typename BlockT, typename ... list>
+struct AggregateAppend<BlockT, aggregate<list ...>>
+{
+    typedef aggregate<list..., BlockT> type;
+};
+
 #endif /* OPENFPM_DATA_SRC_UTIL_AGGREGATE_HPP_ */
diff --git a/src/main.cpp b/src/main.cpp
index fd69236807133256957bcabaf2831de402ceb62f..893167fb884a3377a884127ce7e013bade27a617 100755
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -2,27 +2,13 @@
 #define FUSION_MAX_VECTOR_SIZE 20
 
 #include "config.h"
+
 #define BOOST_TEST_DYN_LINK
 #include <boost/test/unit_test.hpp>
 #include "util/math_util_complex.hpp"
 
 #define DISABLE_MPI_WRITTERS
 
-// initialization function:
-bool init_unit_test()
-{
-  return true;
-}
-
-std::vector<int> sieve_spf;
-
-// entry point:
-int main(int argc, char* argv[])
-{
-	openfpm::math::init_getFactorization();
-	return boost::unit_test::unit_test_main( &init_unit_test, argc, argv );
-}
-
 #include <boost/fusion/include/mpl.hpp>
 
 #include <iostream>
@@ -55,7 +41,32 @@ int main(int argc, char* argv[])
 #include "NN/VerletList/VerletList_test.hpp"
 #include "Grid/iterators/grid_iterators_unit_tests.cpp"
 #include "util/test/compute_optimal_device_grid_unit_tests.hpp"
+
 #ifdef PERFORMANCE_TEST
 #include "performance.hpp"
 #endif
+
+#ifndef NO_INIT_AND_MAIN
+
+// initialization function:
+bool init_unit_test()
+{
+  return true;
+}
+
+std::vector<int> sieve_spf;
+
+// entry point:
+int main(int argc, char* argv[])
+{
+	openfpm::math::init_getFactorization();
+	return boost::unit_test::unit_test_main( &init_unit_test, argc, argv );
+}
+
 #include "unit_test_init_cleanup.hpp"
+
+#endif
+
+
+
+
diff --git a/src/timer.hpp b/src/timer.hpp
index 787223a87a5d0b99988195179e317eab8c080a20..f87a0e1cff42bd94268d70791de7ea96d167d792 100644
--- a/src/timer.hpp
+++ b/src/timer.hpp
@@ -42,14 +42,22 @@ class timer
     clock_t cstop;
 
 #if defined(__NVCC__) && !defined(CUDA_ON_CPU)
-    cudaEvent_t start_g, stop_g;
+    #ifdef __HIP__
+        hipEvent_t start_g, stop_g;
+    #else
+        cudaEvent_t start_g, stop_g;
+    #endif
 #endif
 
     // Fill the stop point
     void check()
     {
-#if defined(SYNC_BEFORE_TAKE_TIME) && defined(__NVCC__) 
+#if defined(SYNC_BEFORE_TAKE_TIME) && defined(__NVCC__)
+        #ifdef __HIP__
+        hipDeviceSynchronize();
+        #else
         cudaDeviceSynchronize();
+        #endif
 #endif
 
 #ifdef __MACH__ // OS X does not have clock_gettime, use clock_get_time
@@ -153,21 +161,37 @@ public:
 
     void startGPU()
     {
+        #ifdef __HIP__
+        hipEventCreate(&start_g);
+        hipEventRecord(start_g,0);
+        #else
         cudaEventCreate(&start_g);
         cudaEventRecord(start_g,0);
+        #endif
     }
 
     void stopGPU()
     {
-       cudaEventCreate(&stop_g);
-       cudaEventRecord(stop_g,0);
-       cudaEventSynchronize(stop_g);
+        #ifdef __HIP__
+        hipEventCreate(&stop_g);
+        hipEventRecord(stop_g,0);
+        hipEventSynchronize(stop_g);
+        #else
+        cudaEventCreate(&stop_g);
+        cudaEventRecord(stop_g,0);
+        cudaEventSynchronize(stop_g);
+        #endif
     }
 
     double getwctGPU()
     {
         float elapsedTime;
+
+        #ifdef __HIP__
+        hipEventElapsedTime(&elapsedTime, start_g,stop_g);
+        #else
         cudaEventElapsedTime(&elapsedTime, start_g,stop_g);
+        #endif
 
         return elapsedTime;
     }
diff --git a/src/unit_test_init_cleanup.hpp b/src/unit_test_init_cleanup.hpp
index 5157a1345f0aa8afbd74514aedb10d3f97395989..64b0c580b8443817abd07e58ce99c44f7a50c232 100644
--- a/src/unit_test_init_cleanup.hpp
+++ b/src/unit_test_init_cleanup.hpp
@@ -8,6 +8,8 @@
 #ifndef UNIT_TEST_INIT_CLEANUP_HPP_
 #define UNIT_TEST_INIT_CLEANUP_HPP_
 
+#include "util/cudify/cudify.hpp"
+
 //! boost unit test fixation (start procedure to call before testing)
 struct ut_start
 {
diff --git a/src/util/common.hpp b/src/util/common.hpp
index 6b8a12a5d9d8a34d23ae9fc8ce956a8e2f24a479..cbcff1361e4eb13d9e829ef9175bdefc9c7cd0b7 100644
--- a/src/util/common.hpp
+++ b/src/util/common.hpp
@@ -13,6 +13,7 @@ constexpr int RUN_ON_DEVICE = 1024;
                                + __GNUC_MINOR__ * 100 \
                                + __GNUC_PATCHLEVEL__)
 
+
 template<unsigned int N,typename T>
 struct static_array
 {
@@ -330,17 +331,16 @@ struct is_openfpm_native<T,true> : std::true_type
 template<typename T, typename Sfinae = void>
 struct has_value_type_ofp: std::false_type {};
 
-/*! \brief has_value_type check if a type has defined a member value_type
+/*! \brief has_value_type_ofp check if a type has defined a member value_type
  *
  * ### Example
  *
- * \snippet util_test.hpp Check has_value_type
+ * \snippet util_test.hpp Check has_value_type_ofp
  *
  * return true if T::value_type is a valid type
  *
  */
 template<typename T>
-//struct has_value_type<T, typename Void<decltype( typename T::value_type )>::type> : std::true_type
 struct has_value_type_ofp<T, typename Void< typename T::value_type>::type> : std::true_type
 {};
 
diff --git a/src/util/copy_compare/copy_general.hpp b/src/util/copy_compare/copy_general.hpp
index 5b1c75c938198e2b3b08e2c038b6991476fc82f1..0a26a43452ab8a592606014277991fd095a54eb6 100644
--- a/src/util/copy_compare/copy_general.hpp
+++ b/src/util/copy_compare/copy_general.hpp
@@ -80,7 +80,8 @@ struct add_
 	}
 };
 
-#ifdef __clang__
+#if defined(__clang__) && !defined(CUDA_ON_CPU)
+
 template<typename Tsrc>
 __host__ Tsrc atomicAdd(Tsrc * ptr, Tsrc value)
 {
diff --git a/src/util/copy_compare/meta_copy.hpp b/src/util/copy_compare/meta_copy.hpp
index 394c209b146dde26dd54e2cdcc69b3b6c0a7e307..96e7a85a05c01a765f6179fb36893738959e908a 100644
--- a/src/util/copy_compare/meta_copy.hpp
+++ b/src/util/copy_compare/meta_copy.hpp
@@ -5,6 +5,28 @@
 #include "util/cuda_util.hpp"
 #include "util/multi_array_openfpm/multi_array_ref_openfpm.hpp"
 
+
+template<typename ArrTypeView>
+struct std_array_vector_view
+{
+	int pos;
+	ArrTypeView arr;
+
+	std_array_vector_view(int pos,ArrTypeView arr)
+	:pos(pos),arr(arr)
+	{}
+
+	decltype(arr[0][0]) operator[](int comp)
+	{
+		return arr[comp][pos];
+	}
+
+	decltype(std::declval<const ArrTypeView>()[0][0]) operator[](int comp) const
+	{
+		return arr[comp][pos];
+	}
+};
+
 /*! \brief This class copy general objects
  *
  * * primitives
@@ -102,6 +124,21 @@ struct meta_copy<T[N1]>
 		}
 	}
 
+	/*! \brief copy and object from src to dst
+	 *
+	 * \param src source object to copy
+	 * \param dst destination object
+	 *
+	 */
+	template<typename T2>
+	__device__ __host__ static inline void meta_copy_(const T src[N1], std_array_vector_view<T2> dst)
+	{
+		for (size_t i1 = 0 ; i1 < N1 ; i1++)
+		{
+			copy_general<T>(src[i1],dst[i1]);
+		}
+	}
+
 	/*! \brief copy and object from src to dst
 	 *
 	 * \param src source object to copy
diff --git a/src/util/cuda/merge_ofp.cuh b/src/util/cuda/merge_ofp.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..2c9250053ffe7d0b05b93cf788248703baebd36d
--- /dev/null
+++ b/src/util/cuda/merge_ofp.cuh
@@ -0,0 +1,112 @@
+/*
+ * merge_ofp.cuh
+ *
+ *  Created on: May 15, 2019
+ *      Author: i-bird
+ */
+
+ #ifndef MERGE_OFP_HPP_
+ #define MERGE_OFP_HPP_
+ 
+ #ifdef __NVCC__
+ 
+ #include "Vector/map_vector.hpp"
+ #include "util/cuda_launch.hpp"
+ 
+ #if CUDART_VERSION >= 11000
+     #ifndef CUDA_ON_CPU 
+     // Here we have for sure CUDA >= 11
+     #ifdef __HIP__
+        #undef __CUDACC__
+        #undef __CUDA__
+        #include <thrust/merge.h>
+        #include <thrust/execution_policy.h>
+        #define __CUDACC__
+        #define __CUDA__
+     #else
+        #include "util/cuda/moderngpu/kernel_merge.hxx"
+     #endif
+     #endif
+ #else
+    #include "util/cuda/moderngpu/kernel_merge.hxx"
+ #endif
+ #include "util/cuda/ofp_context.hxx"
+ 
+
+ namespace openfpm
+ {
+    template<typename a_keys_it, typename a_vals_it,
+             typename b_keys_it, typename b_vals_it,
+             typename c_keys_it, typename c_vals_it,
+             typename comp_t, typename context_t>
+    void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count,
+               b_keys_it b_keys, b_vals_it b_vals, int b_count,
+            c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t& context) 
+    {
+ #ifdef CUDA_ON_CPU
+ 
+        int a_it = 0;
+        int b_it = 0;
+        int c_it = 0;
+
+        while (a_it < a_count || b_it < b_count)
+        {
+            if (a_it < a_count)
+            {
+                if (b_it < b_count)
+                {
+                    if (comp(b_keys[b_it],a_keys[a_it]))
+                    {
+                        c_keys[c_it] = b_keys[b_it];
+                        c_vals[c_it] = b_vals[b_it];
+                        c_it++;
+                        b_it++;
+                    }
+                    else
+                    {
+                        c_keys[c_it] = a_keys[a_it];
+                        c_vals[c_it] = a_vals[a_it];
+                        c_it++;
+                        a_it++;
+                    }
+                }
+                else
+                {
+                    c_keys[c_it] = a_keys[a_it];
+                    c_vals[c_it] = a_vals[a_it];
+                    c_it++;
+                    a_it++;
+                }
+            }
+            else
+            {
+                c_keys[c_it] = b_keys[b_it];
+                c_vals[c_it] = b_vals[b_it];
+                c_it++;
+                b_it++;
+            }
+        }
+ 
+ #else
+
+        #ifdef __HIP__
+
+            thrust::merge_by_key(thrust::device, a_keys,a_keys + a_count, 
+                                                 b_keys,b_keys + b_count, 
+                                                 a_vals,b_vals,
+                                                 c_keys,c_vals,comp);
+
+        #else
+
+            mgpu::merge(a_keys,a_vals,a_count,b_keys,b_vals,b_count,c_keys,c_vals,comp,context);
+
+        #endif
+
+ #endif
+    }
+ }
+ 
+ #endif /* __NVCC__ */
+ 
 + #endif /* MERGE_OFP_HPP_ */
+ 
\ No newline at end of file
diff --git a/src/util/cuda/modern_gpu_tests.cu b/src/util/cuda/modern_gpu_tests.cu
index 167a7103c6afac34e675b9f40450375facb633df..383d46ab7bb6ea196a498d6bc1ab27421bfa599c 100644
--- a/src/util/cuda/modern_gpu_tests.cu
+++ b/src/util/cuda/modern_gpu_tests.cu
@@ -7,6 +7,7 @@
 
 #ifndef CUDA_ON_CPU
 
+#ifndef __HIP__
 #include "util/cuda/moderngpu/kernel_load_balance.hxx"
 #include "util/cuda/moderngpu/kernel_mergesort.hxx"
 #include "util/cuda/moderngpu/kernel_reduce.hxx"
@@ -15,7 +16,7 @@
 
 BOOST_AUTO_TEST_SUITE( modern_gpu_tests )
 
-BOOST_AUTO_TEST_CASE( modern_gpu_transform_lbs )
+BOOST_AUTO_TEST_CASE( modern_gpu_loadbalance_lbs )
 {
 	std::cout << "Test modern gpu test tansform_lbs" << "\n";
 
@@ -217,3 +218,5 @@ BOOST_AUTO_TEST_SUITE_END()
 
 #endif
 
+#endif
+
diff --git a/src/util/cuda/moderngpu/context.hxx b/src/util/cuda/moderngpu/context.hxx
index 6df90491e88280d7a309ed72820c052bbd6bab1c..93af53d9ac79ac8ee66ef08144d2e1bfc551fa1a 100644
--- a/src/util/cuda/moderngpu/context.hxx
+++ b/src/util/cuda/moderngpu/context.hxx
@@ -86,7 +86,7 @@ protected:
   template<int dummy_arg = 0>
   void init() {
     cudaFuncAttributes attr;
-    cudaError_t result = cudaFuncGetAttributes(&attr, dummy_k<0>);
+    cudaError_t result = cudaFuncGetAttributes(&attr, (void *)dummy_k<0>);
     if(cudaSuccess != result) throw cuda_exception_t(result);
     _ptx_version = attr.ptxVersion;
 
diff --git a/src/util/cuda/moderngpu/cta_load_balance.hxx b/src/util/cuda/moderngpu/cta_load_balance.hxx
index 4629b0e4cf7398d944ed8bd0623b200e9cfe2a8a..c397b7893d1c8435427477f647419f7c72f2e37a 100644
--- a/src/util/cuda/moderngpu/cta_load_balance.hxx
+++ b/src/util/cuda/moderngpu/cta_load_balance.hxx
@@ -135,7 +135,7 @@ struct cta_load_balance_t {
     int* a_shared = storage.indices - range.a_begin;
     int* b_shared = storage.indices + range.a_count();
 
-    lbs_placement_t placement = cta_load_balance_place<nt, vt>(tid, range, 
+    lbs_placement_t placement  = cta_load_balance_place<nt, vt>(tid, range, 
       count, segments, num_segments, b_shared);
 
     // Adjust the b pointer by the loaded b_begin. This lets us dereference it
diff --git a/src/util/cuda/moderngpu/cta_segscan.hxx b/src/util/cuda/moderngpu/cta_segscan.hxx
index d2b781ee09ef0a4eb76239f4a12fe05661c9cc10..e8738c5c3a8c4972537266665635c01fd2af611e 100644
--- a/src/util/cuda/moderngpu/cta_segscan.hxx
+++ b/src/util/cuda/moderngpu/cta_segscan.hxx
@@ -30,18 +30,31 @@ struct cta_segscan_t {
     int warp_mask = 0xffffffff>> (31 - lane);   // inclusive search.
     int cta_mask = 0x7fffffff>> (31 - lane);    // exclusive search.
 
+    #ifdef __HIP__
+    // Build a head flag bitfield and store it into shared memory.
+    long int warp_bits = ballot(has_head_flag);
+    storage.delta[warp] = (int)warp_bits;
+    #else
     // Build a head flag bitfield and store it into shared memory.
     int warp_bits = ballot(has_head_flag);
     storage.delta[warp] = warp_bits;
+    #endif
+
+
     __syncthreads();
 
     if(tid < num_warps) {
+      #ifdef __HIP__
+      int cta_bits = (int)ballot(0 != storage.delta[tid]);
+      #else
       unsigned mask = __activemask();
       int cta_bits = ballot(0 != storage.delta[tid], mask);
+      #endif
       int warp_segment = 31 - clz(cta_mask & cta_bits);
       int start = (-1 != warp_segment) ?
         (31 - clz(storage.delta[warp_segment]) + 32 * warp_segment) : 0;
       storage.delta[num_warps + tid] = start;
+
     }
     __syncthreads();
 
diff --git a/src/util/cuda/moderngpu/intrinsics.hxx b/src/util/cuda/moderngpu/intrinsics.hxx
index efe881a1d25e0709a9fd973d0a4cb596e0518cf7..39af8b6e6cf7702d6b81f373d49c9025f34da27f 100644
--- a/src/util/cuda/moderngpu/intrinsics.hxx
+++ b/src/util/cuda/moderngpu/intrinsics.hxx
@@ -2,7 +2,7 @@
 
 #include "operators.hxx"
 
-#ifndef __CUDACC__
+#if !defined(__CUDACC__) && !defined(__HIP__)
 #error "You must compile this file with nvcc. You must."
 #endif
 
diff --git a/src/util/cuda/moderngpu/meta.hxx b/src/util/cuda/moderngpu/meta.hxx
index 6fd8ccec007356fc1dd02759878fba46cc2f7124..369c303e4f79438b06ab8ff88a453aa66af64ffa 100644
--- a/src/util/cuda/moderngpu/meta.hxx
+++ b/src/util/cuda/moderngpu/meta.hxx
@@ -14,7 +14,7 @@
   #define MGPU_HOST_DEVICE __forceinline__ __device__ __host__
 #endif
 
-#ifndef MGPU_DEVICE 
+#ifndef MGPU_DEVICE
   #define MGPU_DEVICE __device__
 #endif
 
diff --git a/src/util/cuda/moderngpu/transform.hxx b/src/util/cuda/moderngpu/transform.hxx
index 617394d03af26eb10d5a10ecbd4d3c5858aa1519..99295a818ef7ff9b811402505a07e13bbd094358 100644
--- a/src/util/cuda/moderngpu/transform.hxx
+++ b/src/util/cuda/moderngpu/transform.hxx
@@ -1,6 +1,7 @@
 // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
 #pragma once
 
+
 #include <random>
 #include <algorithm>
 #include <cuda.h>
@@ -19,8 +20,10 @@ void cta_launch(func_t f, int num_ctas, context_t& context, args_t... args) {
     grid_dim = dim3(256, div_up(num_ctas, 256));
   
   if(num_ctas)
+  {
     launch_box_cta_k<launch_box, func_t>
-      <<<grid_dim, cta.nt, 0, context.stream()>>>(f, num_ctas, args...);
+      <<<grid_dim, cta.nt,0,context.stream()>>>(f, num_ctas, args...);
+  }
 }
 
 template<int nt, int vt = 1, typename func_t, typename... args_t>
diff --git a/src/util/cuda/ofp_context.hxx b/src/util/cuda/ofp_context.hxx
index 47d5327e11be0adc0067f49f04efb8a2482dff09..70c4ed9e48b09305680aaaf69ce88df3aa564d0b 100644
--- a/src/util/cuda/ofp_context.hxx
+++ b/src/util/cuda/ofp_context.hxx
@@ -113,9 +113,11 @@ namespace mgpu
 
 	#ifdef CUDA_GPU
 
-		#ifdef __NVCC__ 
-
+		#ifdef __NVCC__
 		#include "util/cuda/moderngpu/context.hxx"
+		#else
+		#include "util/cuda/moderngpu/context_reduced.hxx"
+		#endif
 
 		namespace mgpu
 		{
@@ -151,9 +153,14 @@ namespace mgpu
 					void init(int dev_num, gpu_context_opt opt)
 					{
 						cudaFuncAttributes attr;
-						cudaError_t result = cudaFuncGetAttributes(&attr, dummy_k<0>);
+						#ifdef __NVCC__
+						cudaError_t result = cudaFuncGetAttributes(&attr, (void *)dummy_k<0>);
 						if(cudaSuccess != result) throw cuda_exception_t(result);
 						_ptx_version = attr.ptxVersion;
+						#else
+						_ptx_version = 60;
+						//std::cout << __FILE__ << ":" << __LINE__ << " Warning initialization of GPU context has been done from a standard Cpp file, rather than a CUDA or HIP file" << std::endl;
+						#endif
 
 						int num_dev;
 						cudaGetDeviceCount(&num_dev);
@@ -286,163 +293,6 @@ namespace mgpu
 
 		}
 
-		#else
-
-		#include "util/cuda/moderngpu/context_reduced.hxx"
-
-		namespace mgpu
-		{
-			enum gpu_context_opt
-			{
-				no_print_props,//!< no_print_props
-				print_props,   //!< print_props
-				dummy          //!< dummy
-			};
-
-
-			////////////////////////////////////////////////////////////////////////////////
-			// standard_context_t is a trivial implementation of context_t. Users can
-			// derive this type to provide a custom allocator.
-
-			class ofp_context_t : public context_t
-			{
-				protected:
-					cudaDeviceProp _props;
-					int _ptx_version;
-					cudaStream_t _stream;
-
-					cudaEvent_t _timer[2];
-					cudaEvent_t _event;
-
-					openfpm::vector<aggregate<unsigned char>> tmem;
-
-					// Making this a template argument means we won't generate an instance
-					// of dummy_k for each translation unit.
-					template<int dummy_arg = 0>
-					void init(int dev_num, gpu_context_opt opt)
-					{
-						cudaFuncAttributes attr;
-
-						_ptx_version = 0;
-
-						int num_dev;
-						cudaGetDeviceCount(&num_dev);
-
-						if (num_dev == 0) {return;}
-
-						if (opt != gpu_context_opt::dummy)
-						{
-							cudaSetDevice(dev_num % num_dev);
-						}
-
-						int ord;
-						cudaGetDevice(&ord);
-						cudaGetDeviceProperties(&_props, ord);
-
-						cudaEventCreate(&_timer[0]);
-						cudaEventCreate(&_timer[1]);
-						cudaEventCreate(&_event);
-					}
-
-				public:
-
-					/*! \brief gpu context constructor
-					*
-					* \param opt options for this gpu context
-					*
-					*/
-					ofp_context_t(gpu_context_opt opt = gpu_context_opt::no_print_props , int dev_num = 0, cudaStream_t stream_ = 0)
-					:context_t(), _stream(stream_)
-					{
-						init(dev_num,opt);
-						if(opt == gpu_context_opt::print_props)
-						{
-							printf("%s\n", device_prop_string(_props).c_str());
-						}
-					}
-
-					~ofp_context_t()
-					{
-						cudaEventDestroy(_timer[0]);
-						cudaEventDestroy(_timer[1]);
-						cudaEventDestroy(_event);
-					}
-
-					virtual const cudaDeviceProp& props() const
-					{
-						return _props;
-					}
-
-					virtual int ptx_version() const
-					{
-						std::cout << __FILE__ << ":" << __LINE__ << " error to use this function you must compile the class ofp_context_t with NVCC" << std::endl;
-						return 0;
-					}
-
-					virtual cudaStream_t stream() { return _stream; }
-
-					// Alloc GPU memory.
-					virtual void* alloc(size_t size, memory_space_t space)
-					{
-						void* p = nullptr;
-						if(size)
-						{
-							cudaError_t result = (memory_space_device == space) ?cudaMalloc(&p, size) : cudaMallocHost(&p, size);
-							if(cudaSuccess != result) throw cuda_exception_t(result);
-						}
-						return p;
-					}
-
-					virtual void free(void* p, memory_space_t space)
-					{
-						if(p)
-						{
-							cudaError_t result = (memory_space_device == space) ? cudaFree(p) : cudaFreeHost(p);
-							if(cudaSuccess != result) throw cuda_exception_t(result);
-						}
-					}
-
-					virtual void synchronize()
-					{
-						cudaError_t result = _stream ?
-						cudaStreamSynchronize(_stream) :
-						cudaDeviceSynchronize();
-						if(cudaSuccess != result) throw cuda_exception_t(result);
-					}
-
-					virtual cudaEvent_t event()
-					{
-						return _event;
-					}
-
-					virtual void timer_begin()
-					{
-						cudaEventRecord(_timer[0], _stream);
-					}
-
-					virtual double timer_end()
-					{
-						cudaEventRecord(_timer[1], _stream);
-						cudaEventSynchronize(_timer[1]);
-						float ms;
-						cudaEventElapsedTime(&ms, _timer[0], _timer[1]);
-						return ms / 1.0e3;
-					}
-
-					virtual int getDevice()
-					{
-						int dev = 0;
-
-						cudaGetDevice(&dev);
-
-						return dev;
-					}
-			};
-
-		}
-
-		#endif
-
 	#else
 
 		namespace mgpu
diff --git a/src/util/cuda/reduce_ofp.cuh b/src/util/cuda/reduce_ofp.cuh
index 7c4ecb0f7fb4375f3836c06274bf24af41302229..a9f2cf2ef5d57154019ffb9ed7f9c0b2d96b45bd 100644
--- a/src/util/cuda/reduce_ofp.cuh
+++ b/src/util/cuda/reduce_ofp.cuh
@@ -15,7 +15,11 @@
 #if CUDART_VERSION >= 11000
 	#ifndef CUDA_ON_CPU 
 	// Here we have for sure CUDA >= 11
-	#include "cub/cub.cuh"
+	#ifdef __HIP__
+		#include "hipcub/hipcub.hpp"
+	#else
+		#include "cub/cub.cuh"
+	#endif
 	#ifndef REDUCE_WITH_CUB
 		#define REDUCE_WITH_CUB
 	#endif
@@ -44,9 +48,11 @@ namespace openfpm
 #else
 	#ifdef REDUCE_WITH_CUB
 
+		#ifdef __HIP__
+
 			void *d_temp_storage = NULL;
 			size_t temp_storage_bytes = 0;
-			cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,input,
+			hipcub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,input,
 																		output,
 																		count,
 																		op,
@@ -56,11 +62,32 @@ namespace openfpm
 			temporal.resize(temp_storage_bytes);
 
 			// Run
-			cub::DeviceReduce::Reduce(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input,
+			hipcub::DeviceReduce::Reduce(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input,
 					output,
 					count,
 					op,
 					false);
+		#else
+
+			void *d_temp_storage = NULL;
+			size_t temp_storage_bytes = 0;
+			cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,input,
+																	output,
+																	count,
+																	op,
+																	false);
+
+			auto & temporal = context.getTemporalCUB();
+			temporal.resize(temp_storage_bytes);
+
+			// Run
+			cub::DeviceReduce::Reduce(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input,
+				output,
+				count,
+				op,
+				false);
+
+		#endif
 
 	#else
 			mgpu::reduce(input,count,output,op,context);
diff --git a/src/util/cuda/scan_cuda.cuh b/src/util/cuda/scan_cuda.cuh
deleted file mode 100644
index f80ad87c0b3ebfd9656f8a1ea569d488672232cd..0000000000000000000000000000000000000000
--- a/src/util/cuda/scan_cuda.cuh
+++ /dev/null
@@ -1,532 +0,0 @@
-/*
- * scan_cuda.cuh
- *
- *  Created on: Jun 28, 2018
- *      Author: i-bird
- */
-
-#ifndef SCAN_CUDA_CUH_
-#define SCAN_CUDA_CUH_
-
-#include "Vector/map_vector.hpp"
-
-#ifdef __NVCC__
-
-////////////////// Ratio reduction ///////////////////////////////////////////////////////////////
-
-/*template<typename cnt_type, typename ids_type, unsigned int rt = sizeof(cnt_type)/sizeof(ids_type)>
-struct ratio_reduction
-{
-	static inline __device__ cnt_type reduction(cnt_type * rd)
-	{
-		return -1;
-	}
-};
-
-template<typename cnt_type, typename ids_type>
-struct ratio_reduction<cnt_type,ids_type,1>
-{
-	static inline __device__ unsigned int reduction(cnt_type * val)
-	{
-		return val[0] + val[1] + val[2] + val[3];
-	}
-};
-
-template<typename cnt_type, typename ids_type>
-struct ratio_reduction<cnt_type,ids_type,2>
-{
-	static inline __device__ unsigned int reduction(cnt_type * val)
-	{
-	    val[0] = ((val[0] & 0xFFFF0000) >> 16) + (val[0] & 0xFFFF);
-	    val[1] = ((val[1] & 0xFFFF0000) >> 16) + (val[1] & 0xFFFF);
-	    val[2] = ((val[2] & 0xFFFF0000) >> 16) + (val[2] & 0xFFFF);
-	    val[3] = ((val[3] & 0xFFFF0000) >> 16) + (val[3] & 0xFFFF);
-	    return val[0] + val[1] + val[2] + val[3];
-	}
-};
-
-template<typename cnt_type, typename ids_type>
-struct ratio_reduction<cnt_type,ids_type,4>
-{
-	static inline __device__ unsigned int reduction(cnt_type * val)
-	{
-	    val[0] = ((val[0] & 0xFF000000) >> 24) + ((val[0] & 0xFF0000) >> 16) + ((val[0] & 0xFF00) >> 8) + (val[0] & 0xFF);
-	    val[1] = ((val[1] & 0xFF000000) >> 24) + ((val[1] & 0xFF0000) >> 16) + ((val[1] & 0xFF00) >> 8) + (val[1] & 0xFF);
-	    val[2] = ((val[2] & 0xFF000000) >> 24) + ((val[2] & 0xFF0000) >> 16) + ((val[2] & 0xFF00) >> 8) + (val[2] & 0xFF);
-	    val[3] = ((val[3] & 0xFF000000) >> 24) + ((val[3] & 0xFF0000) >> 16) + ((val[3] & 0xFF00) >> 8) + (val[3] & 0xFF);
-	    return val[0] + val[1] + val[2] + val[3];
-	}
-};
-
-
-///////////////////////// Ratio extends ////////////////////////////////////////////////////////////////////
-
-
-template<typename T>
-struct to_four_type
-{
-	typedef T type;
-};
-
-template<>
-struct to_four_type<unsigned int>
-{
-	typedef uint4 type;
-};
-
-template<>
-struct to_four_type<int>
-{
-	typedef int4 type;
-};
-
-template<typename T>
-struct make4
-{
-	__device__ inline static T make()
-	{
-		return 0;
-	}
-};
-
-template<>
-struct make4<uint4>
-{
-	__device__ inline static uint4 make()
-	{
-		return make_uint4(0,0,0,0);
-	}
-};
-
-template<>
-struct make4<int4>
-{
-	__device__ inline static int4 make()
-	{
-		return make_int4(0,0,0,0);
-	}
-};
-
-template<typename cnt_type, typename ids_type, unsigned int rt = sizeof(cnt_type)/sizeof(ids_type)>
-struct ratio_extend
-{
-	static inline __device__ cnt_type reduction(ids_type * rd)
-	{
-		return -1;
-	}
-};
-
-template<typename cnt_type, typename ids_type>
-struct ratio_extend<cnt_type,ids_type,1>
-{
-	typedef boost::mpl::int_<1> size;
-
-	typedef cnt_type cnt_type_;
-	typedef typename to_four_type<cnt_type>::type cnt_type4;
-
-	static inline __device__ void extend(cnt_type4 * val)
-	{
-		unsigned int tmp = val[0].w;
-	    val[0].w = val[0].z;
-	    val[0].z = val[0].y;
-	    val[0].y = val[0].x;
-	    val[0].x = tmp;
-	}
-
-	static inline __device__ unsigned int reduce(cnt_type4 * tu4, cnt_type4 * val)
-	{
-		tu4->x = val[0].x + val[0].y + val[0].z + val[0].w;
-
-	    return tu4->x;
-	}
-
-	static inline __device__ void scan(unsigned int woff_w, cnt_type4 tu4, cnt_type4 * val)
-	{
-		unsigned int lps;
-	    lps = woff_w + tu4.w;
-
-	    val[0].x = lps;
-
-	    val[0].y += val[0].x;
-
-	    val[0].z += val[0].y;
-
-	    val[0].w += val[0].z;
-	}
-
-	static inline __device__ cnt_type4 zero()
-	{
-		return make4<cnt_type4>::make();
-	}
-};
-
-
-
-template<typename cnt_type, typename ids_type>
-struct ratio_extend<cnt_type,ids_type,2>
-{
-	typedef boost::mpl::int_<2> size;
-
-	typedef cnt_type cnt_type_;
-	typedef typename to_four_type<cnt_type>::type cnt_type4;
-
-	static inline __device__ void extend(cnt_type4 * val)
-	{
-	    val[1].w = (val[0].w & 0x0000FFFF);
-	    val[1].z = (val[0].z & 0xFFFF0000) >>  16;
-
-	    val[1].y = (val[0].z & 0x0000FFFF);
-	    val[1].x = (val[0].w & 0xFFFF0000) >>  16;
-
-	    val[0].w = (val[0].y & 0x0000FFFF);
-	    val[0].z = (val[0].x & 0xFFFF0000) >>  16;
-
-	    short int tmp = (val[0].y & 0xFFFF0000) >>  16;
-	    val[0].y = (val[0].x & 0x0000FFFF);
-	    val[0].x = tmp;
-	}
-
-	static inline __device__ unsigned int reduce(cnt_type4 * tu4, cnt_type4 * val)
-	{
-	    tu4->x = val[0].x + val[0].y + val[0].z + val[0].w;
-	    tu4->y = val[1].x + val[1].y + val[1].z + val[1].w;
-
-	    return tu4->x + tu4->y;
-	}
-
-	static inline __device__ void scan(unsigned int woff_w, cnt_type4 tu4, cnt_type4 * val)
-	{
-		uint2 lps;
-	    lps.x = woff_w + tu4.w;
-	    lps.y = lps.x + tu4.x;
-
-	    val[0].x = lps.x;
-	    val[1].x = lps.y;
-
-	    val[0].y += val[0].x;
-	    val[1].y += val[1].x;
-
-	    val[0].z += val[0].y;
-	    val[1].z += val[1].y;
-
-	    val[0].w += val[0].z;
-	    val[1].w += val[1].z;
-	}
-
-	static inline __device__ cnt_type4 zero()
-	{
-		return make4<cnt_type4>::make();
-	}
-};
-
-template<typename cnt_type, typename ids_type>
-struct ratio_extend<cnt_type,ids_type,4>
-{
-	typedef boost::mpl::int_<4> size;
-
-	typedef cnt_type cnt_type_;
-	typedef typename to_four_type<cnt_type>::type cnt_type4;
-
-	static inline __device__ void extend(cnt_type4 * val)
-	{
-	    val[3].y = (val[0].w & 0x000000FF);
-	    val[3].z = (val[0].w & 0x0000FF00) >>  8;
-	    val[3].w = (val[0].w & 0x00FF0000) >> 16;
-	    val[3].x = (val[0].w & 0xFF000000) >> 24;
-
-	    val[2].y = (val[0].z & 0x000000FF);
-	    val[2].z = (val[0].z & 0x0000FF00) >>  8;
-	    val[2].w = (val[0].z & 0x00FF0000) >> 16;
-	    val[2].x = (val[0].z & 0xFF000000) >> 24;
-
-	    val[1].y = (val[0].y & 0x000000FF);
-	    val[1].z = (val[0].y & 0x0000FF00) >>  8;
-	    val[1].w = (val[0].y & 0x00FF0000) >> 16;
-	    val[1].x = (val[0].y & 0xFF000000) >> 24;
-
-	    val[0].y = (val[0].x & 0x000000FF);
-	    val[0].z = (val[0].x & 0x0000FF00) >>  8;
-	    val[0].w = (val[0].x & 0x00FF0000) >> 16;
-	    val[0].x = (val[0].x & 0xFF000000) >> 24;
-	}
-
-	static inline __device__ unsigned int reduce(cnt_type4 * tu4, cnt_type4 * val)
-	{
-	    tu4->x = val[0].x + val[0].y + val[0].z + val[0].w;
-	    tu4->y = val[1].x + val[1].y + val[1].z + val[1].w;
-	    tu4->z = val[2].x + val[2].y + val[2].z + val[2].w;
-	    tu4->w = val[3].x + val[3].y + val[3].z + val[3].w;
-
-	    return tu4->x + tu4->y + tu4->z + tu4->w;
-	}
-
-	static inline __device__ void scan(unsigned int woff_w, cnt_type4 tu4, cnt_type4 * val)
-	{
-		uint4 lps;
-	    lps.x = woff_w + tu4.w;
-	    lps.y = lps.x + tu4.x;
-	    lps.z = lps.y + tu4.y;
-	    lps.w = lps.z + tu4.z;
-
-	    val[0].x = lps.x;
-	    val[1].x = lps.y;
-	    val[2].x = lps.z;
-	    val[3].x = lps.w;
-
-	    val[0].y += val[0].x;
-	    val[1].y += val[1].x;
-	    val[2].y += val[2].x;
-	    val[3].y += val[3].x;
-
-	    val[0].z += val[0].y;
-	    val[1].z += val[1].y;
-	    val[2].z += val[2].y;
-	    val[3].z += val[3].y;
-
-	    val[0].w += val[0].z;
-	    val[1].w += val[1].z;
-	    val[2].w += val[2].z;
-	    val[3].w += val[3].z;
-	}
-
-	static inline __device__ cnt_type4 zero()
-	{
-		return make4<cnt_type4>::make();
-	}
-};*/
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/*template<typename cnt_type, typename ids_type>
-__global__ void compress4(const cnt_type n_cell, const cnt_type *const count, ids_type *const compressed)
-{
-    const int gid = 4*(threadIdx.x + blockDim.x * blockIdx.x);
-
-    if (gid >= n_cell)
-    {return;}
-
-    compressed[gid] = count[gid];
-    compressed[gid+1] = count[gid+1];
-    compressed[gid+2] = count[gid+2];
-    compressed[gid+3] = count[gid+3];
-}
-
-template <int NWARP, typename cnt_type, typename ids_type, typename reduction>
-__global__ void breduce(int n, const cnt_type *vin, cnt_type *vout)
-{
-    const int wid = threadIdx.x/32;
-    const int lid = threadIdx.x%32;
-    const int tid = 4*(blockDim.x*blockIdx.x + threadIdx.x);
-    cnt_type val[4] = {0,0,0,0};
-
-    __shared__ unsigned int shtmp[NWARP];
-
-    if (tid < n)
-    {
-    	val[0] = vin[0+tid];
-    	val[1] = vin[1+tid];
-    	val[2] = vin[2+tid];
-    	val[3] = vin[3+tid];
-    }
-
-    val[0] = reduction::reduction(val);
-
-#pragma unroll
-    for(int i = 16; i > 0; i >>= 1)
-    {val[0] += __shfl_down_sync(0xFFFFFFFF,(int)val[0],i);}
-
-    if (0 == lid)
-    {shtmp[wid] = val[0];}
-
-    __syncthreads();
-    if (0 == wid)
-    {
-        val[0] = (lid < NWARP) ? shtmp[lid] : 0;
-
-#pragma unroll
-        for(int i = 16; i > 0; i >>= 1)
-        {val[0] += __shfl_down_sync(0xFFFFFFFF,(int)val[0], i);}
-    }
-
-    if (0 == threadIdx.x)
-    {
-    	vout[blockIdx.x] = val[0];
-    }
-}
-
-template <int BDIM, typename cnt_type>
-__global__ void bexscan(int n, cnt_type *v)
-{
-    extern __shared__ unsigned int shtmp[];
-
-    for(int i = threadIdx.x; i < n; i += BDIM)
-    {shtmp[i] = v[i];}
-
-    int i = threadIdx.x;
-    __syncthreads();
-    for(; n >= BDIM; i += BDIM, n -= BDIM)
-    {
-        __syncthreads();
-        if (i > 0 && 0 == threadIdx.x)
-        {shtmp[i] += shtmp[i-1];}
-
-        unsigned int a = 0;
-
-#pragma unroll
-        for(int j = 1; j < BDIM; j <<= 1)
-        {
-            a = 0;
-
-            __syncthreads();
-            if (threadIdx.x >= j) {a = shtmp[i] + shtmp[i-j];}
-
-            __syncthreads();
-            if (threadIdx.x >= j) {shtmp[i] = a;}
-        }
-        v[i] = shtmp[i];
-    }
-    if (threadIdx.x < n)
-    {
-        __syncthreads();
-        if (i > 0 && 0 == threadIdx.x)
-        {shtmp[i] += shtmp[i-1];}
-
-        unsigned int a = 0;
-        for(int j = 1; j < BDIM; j <<= 1)
-        {
-            a = 0;
-
-            __syncthreads();
-            if (threadIdx.x >= j) a = shtmp[i] + shtmp[i-j];
-
-            __syncthreads();
-            if (threadIdx.x >= j) shtmp[i] = a;
-        }
-        v[i] = shtmp[i];
-    }
-}
-
-template <int NWARP, typename extend>
-__global__ void gexscan(int n,
-		                const typename extend::cnt_type4 *vin,
-		                typename extend::cnt_type_ *offs,
-						typename extend::cnt_type4 *vout)
-{
-    const int wid = threadIdx.x/32;
-    const int lid = threadIdx.x%32;
-    const int tid = blockDim.x*blockIdx.x + threadIdx.x;
-    typename extend::cnt_type4 val[extend::size::value];
-
-    __shared__ unsigned int woff[NWARP];
-
-    if (tid < n) val[0] = vin[tid];
-    else         val[0] = extend::zero();
-
-    extend::extend(val);
-
-    typename extend::cnt_type4 tu4;
-    typename extend::cnt_type_ tmp = extend::reduce(&tu4,val);
-
-    tu4.w = tmp;
-#pragma unroll
-    for(int i = 1; i < 32; i <<= 1)
-    {tu4.w += (lid >= i)*__shfl_up_sync(0xFFFFFFFF,(int)tu4.w, i);}
-
-    if (lid == 31)
-    {
-        if (wid < NWARP-1) woff[wid+1] = tu4.w;
-        else               woff[0]     = (blockIdx.x > 0) ? offs[blockIdx.x-1] : 0;
-    }
-
-    tu4.w -= tmp;
-
-    __syncthreads();
-
-    if (0 == wid)
-    {
-        tmp = (lid < NWARP) ? woff[lid] : 0;
-#pragma unroll
-        for(int i = 1; i < NWARP; i <<= 1)
-        {tmp += (lid >= i)*__shfl_up_sync(0xFFFFFFFF,(int)tmp, i);}
-
-        if (lid < NWARP) woff[lid] = tmp;
-    }
-    __syncthreads();
-
-    if (tid >= n) return;
-
-    extend::scan(woff[wid],tu4,val);
-
-    vout += tid*extend::size::value;
-
-#pragma unroll
-    for(int i = 0; i < extend::size::value; i++)
-    {vout[i] = val[i];}
-}*/
-
-/*! \brief Scan is a class because it use internally temporary buffers that are heavy to reconstruct
- *
- *
- *
- */
-/*template<typename cnt_type, typename ids_type>
-class scan
-{
-	openfpm::vector<aggregate<cnt_type>,CudaMemory,typename memory_traits_inte<aggregate<cnt_type>>::type,memory_traits_inte> red;
-	openfpm::vector<aggregate<ids_type>,CudaMemory,typename memory_traits_inte<aggregate<ids_type>>::type,memory_traits_inte> compressed;
-
-public:
-
-	void scan_(openfpm::vector<aggregate<cnt_type>,CudaMemory,typename memory_traits_inte<aggregate<cnt_type>>::type,memory_traits_inte> & cl_n,
-			  openfpm::vector<aggregate<cnt_type>,CudaMemory,typename memory_traits_inte<aggregate<cnt_type>>::type,memory_traits_inte> & cl_n_scan)
-	{
-		constexpr int THREADS = 128;
-		constexpr int ratio = 4*sizeof(cnt_type)/sizeof(ids_type);
-
-		int nblocks = (cl_n.size() + THREADS * ratio - 1 ) / (THREADS * ratio);
-		red.resize(nblocks);
-
-		auto ite = cl_n.getGPUIterator();
-
-		compressed.resize(cl_n.size());
-		compress4<cnt_type,ids_type><<<ite.wthr,ite.thr>>>((cnt_type)cl_n.size(),
-														   static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()),
-														   static_cast<ids_type *>(compressed.template getDeviceBuffer<0>()));
-
-		breduce<THREADS/32,cnt_type,ids_type,ratio_reduction<cnt_type,ids_type>><<<nblocks, THREADS                >>>(cl_n.size() / ratio * 4,
-																  (cnt_type *)compressed.template getDeviceBuffer<0>(),
-																  static_cast<cnt_type *>(red.template getDeviceBuffer<0>()));
-
-
-		bexscan<THREADS,cnt_type><<<1, THREADS, nblocks*sizeof(uint)>>>(nblocks,
-															   static_cast<cnt_type *>(red.template getDeviceBuffer<0>()));
-
-		size_t raw_size = cl_n.size();
-
-		// resize to be multiple of 16
-
-		size_t ml = ((raw_size + ratio - 1) / ratio) *ratio;
-		cl_n_scan.resize(ml);
-
-		gexscan<THREADS/32,ratio_extend<cnt_type,ids_type>> <<< nblocks, THREADS >>>((cl_n.size() + ratio - 1 ) / ratio,
-																							  static_cast<typename ratio_extend<cnt_type,ids_type>::cnt_type4 *>(compressed.template getDeviceBuffer<0>()),
-																							  static_cast<cnt_type *>(red.template getDeviceBuffer<0>()),
-																							  static_cast<typename ratio_extend<cnt_type,ids_type>::cnt_type4 *>(cl_n_scan.template getDeviceBuffer<0>()));
-
-		cl_n_scan.resize(raw_size);
-	}
-
-};*/
-
-#else
-
-// In case we do not have NVCC we create a stub
-
-template<typename cnt_type, typename ids_type>
-class scan
-{
-};
-
-#endif
-
-#endif /* SCAN_CUDA_CUH_ */
diff --git a/src/util/cuda/scan_ofp.cuh b/src/util/cuda/scan_ofp.cuh
index 7d778b2a1d6e3415383622e3c5ce3e3149c02e7e..2ef79fdb024018ed2ea5809044fe65f0087590c0 100644
--- a/src/util/cuda/scan_ofp.cuh
+++ b/src/util/cuda/scan_ofp.cuh
@@ -15,7 +15,11 @@
 #if CUDART_VERSION >= 11000
 	#ifndef CUDA_ON_CPU 
 	// Here we have for sure CUDA >= 11
-	#include "cub/cub.cuh"
+	#ifdef __HIP__
+		#include "hipcub/hipcub.hpp"
+	#else
+		#include "cub/cub.cuh"
+	#endif
 	#ifndef SCAN_WITH_CUB
 		#define SCAN_WITH_CUB
 	#endif
@@ -25,6 +29,7 @@
 	#include "cub_old/cub.cuh"
 	#include "util/cuda/moderngpu/kernel_scan.hxx"
 #endif
+
 #include "util/cuda/ofp_context.hxx"
 
 namespace openfpm
@@ -36,32 +41,53 @@ namespace openfpm
 
 	if (count == 0)	{return;}
 
-    typename std::remove_reference<decltype(input[0])>::type scan_a = 0;
-	// No particular speed-up from omp amd simd
-//    #pragma omp parallel for simd reduction(inscan, +:scan_a)
-    for(int i = 0; i < count; i++)
+	auto prec = input[0];
+	output[0] = 0;
+	for (int i = 1 ; i < count ; i++)
 	{
-        output[i] = scan_a;
-//        #pragma omp scan exclusive(scan_a)
-        scan_a += input[i];
-    }
+		auto next = prec + output[i-1];
+		prec = input[i];
+		output[i] = next;
+	}
 
 #else
 	#ifdef SCAN_WITH_CUB
 
-			void *d_temp_storage = NULL;
-			size_t temp_storage_bytes = 0;
-			cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,input,
-																		output,
-																		count);
+			#ifdef __HIP__
+
+				if (count == 0)	{return;}
+
+				void *d_temp_storage = NULL;
+				size_t temp_storage_bytes = 0;
+				hipcub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,input,
+																			output,
+																			count);
+
+				auto & temporal = context.getTemporalCUB();
+				temporal.resize(temp_storage_bytes);
+
+				// Run
+				hipcub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input,
+						output,
+						count);
+
+			#else
+
+				void *d_temp_storage = NULL;
+				size_t temp_storage_bytes = 0;
+				cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,input,
+																			output,
+																			count);
+
+				auto & temporal = context.getTemporalCUB();
+				temporal.resize(temp_storage_bytes);
 
-			auto & temporal = context.getTemporalCUB();
-			temporal.resize(temp_storage_bytes);
+				// Run
+				cub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input,
+						output,
+						count);
 
-			// Run
-			cub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input,
-					output,
-					count);
+			#endif
 
 	#else
 			mgpu::scan(input,count,output,context);
diff --git a/src/util/cuda/scan_sort_cuda_unit_tests.cu b/src/util/cuda/scan_sort_cuda_unit_tests.cu
index 65c4c343a6afacb22a0de4b3efbdd3c641213e7c..7314e87485d9665532fe5ee1cb642aa0905fd922 100644
--- a/src/util/cuda/scan_sort_cuda_unit_tests.cu
+++ b/src/util/cuda/scan_sort_cuda_unit_tests.cu
@@ -7,12 +7,12 @@
 
 #include "util/cuda_util.hpp"
 #include "Vector/map_vector.hpp"
-#include "scan_cuda.cuh"
 
 #define SORT_WITH_CUB
 
 #include "sort_ofp.cuh"
 #include "scan_ofp.cuh"
+#include "segreduce_ofp.cuh"
 
 BOOST_AUTO_TEST_SUITE( scan_tests )
 
@@ -109,4 +109,81 @@ BOOST_AUTO_TEST_CASE( test_sort_cub_wrapper )
 	// Test the cell list
 }
 
+BOOST_AUTO_TEST_CASE( test_seg_reduce_wrapper )
+{
+	std::cout << "Test gpu segmented reduce" << "\n";
+
+	mgpu::ofp_context_t context;
+
+	int count = 130;
+
+	openfpm::vector_gpu<aggregate<int>> vgpu;
+	openfpm::vector_gpu<aggregate<int>> segment_offset;
+	openfpm::vector_gpu<aggregate<int>> output;
+	int init = 0;
+
+	vgpu.resize(count);
+
+	for (size_t i = 0 ; i < count ; i++)
+	{
+		vgpu.template get<0>(i) = ((float)rand() / (float)RAND_MAX) * 17;
+	}
+
+	segment_offset.add();
+	segment_offset.template get<0>(0) = 0;
+	size_t base = 0;
+	while (1)
+	{
+		int c = ((float)rand() / (float)RAND_MAX) * 17;
+
+		if (c + base >= count)
+		{break;}
+
+		segment_offset.add();
+		segment_offset.template get<0>(segment_offset.size() - 1) = c + segment_offset.template get<0>(segment_offset.size() - 2);
+
+		base += c;
+	}
+
+	vgpu.hostToDevice<0>();
+
+	segment_offset.hostToDevice<0>();
+	output.resize(segment_offset.size());
+
+	openfpm::segreduce((int *)vgpu.template getDeviceBuffer<0>(), vgpu.size(),
+					(int *)segment_offset.template getDeviceBuffer<0>(), segment_offset.size(),
+					(int *)output.template getDeviceBuffer<0>(),
+					mgpu::plus_t<int>(), init, context);
+
+
+	output.template deviceToHost<0>();
+
+	bool match = true;
+	size_t i = 0;
+	for ( ; i < segment_offset.size()-1 ; i++)
+	{
+		size_t red = 0;
+		for (size_t j = 0 ; j < segment_offset.template get<0>(i+1) - segment_offset.template get<0>(i)  ; j++)
+		{
+			red += vgpu.template get<0>(segment_offset.template get<0>(i) + j);
+		}
+		match &= red == output.template get<0>(i);
+	}
+
+	BOOST_REQUIRE_EQUAL(match,true);
+
+	size_t red2 = 0;
+	for (size_t j = 0 ; j < vgpu.size() - segment_offset.template get<0>(i)  ; j++)
+	{
+		red2 += vgpu.template get<0>(segment_offset.template get<0>(i) + j);
+	}
+	match &= red2 == output.template get<0>(i);
+
+	BOOST_REQUIRE_EQUAL(match,true);
+
+	std::cout << "End test modern gpu test reduce" << "\n";
+
+	// Test the cell list
+}
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/util/cuda/segreduce_block_cuda.cuh b/src/util/cuda/segreduce_block_cuda.cuh
deleted file mode 100644
index 9a1b29c585d933367fd01ae3b4a87f72434a48cf..0000000000000000000000000000000000000000
--- a/src/util/cuda/segreduce_block_cuda.cuh
+++ /dev/null
@@ -1,10 +0,0 @@
-//
-// Created by tommaso on 20/05/19.
-//
-
-#ifndef OPENFPM_PDATA_SEGREDUCE_BLOCK_CUDA_CUH
-#define OPENFPM_PDATA_SEGREDUCE_BLOCK_CUDA_CUH
-
-
-
-#endif //OPENFPM_PDATA_SEGREDUCE_BLOCK_CUDA_CUH
diff --git a/src/util/cuda/segreduce_ofp.cuh b/src/util/cuda/segreduce_ofp.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..9a0c3764108e2e0e009f3d2e2a62fb470b7a20df
--- /dev/null
+++ b/src/util/cuda/segreduce_ofp.cuh
@@ -0,0 +1,157 @@
+/*
+ * segreduce_ofp.hpp
+ *
+ *  Created on: May 15, 2019
+ *      Author: i-bird
+ */
+
+ #ifndef SEGREDUCE_OFP_HPP_
+ #define SEGREDUCE_OFP_HPP_
+ 
+ #ifdef __NVCC__
+ 
+ #include "Vector/map_vector.hpp"
+ #include "util/cuda_launch.hpp"
+ #include "util/cuda/segreduce_ofp.cuh"
+ 
+ #if CUDART_VERSION >= 11000
+     #ifndef CUDA_ON_CPU 
+     // Here we have for sure CUDA >= 11
+     #ifdef __HIP__
+        #undef __CUDACC__
+        #undef __CUDA__
+        #include <thrust/reduce.h>
+        #define __CUDACC__
+        #define __CUDA__
+     #else
+        #include "util/cuda/moderngpu/kernel_segreduce.hxx"
+     #endif
+     #endif
+ #else
+    #include "util/cuda/moderngpu/kernel_segreduce.hxx"
+ #endif
+ #include "util/cuda/ofp_context.hxx"
+ 
+template<typename segments_it, typename keys_type, typename output_it, typename seg_type, typename type_t>
+__global__ void seg_to_keys(segments_it segs, keys_type keys, seg_type seg_out ,output_it output, int n_count, int num_segments,type_t init)
+{
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (tid >= num_segments)    {return;}
+
+    int s = segs[tid];
+    int s_p1 = (tid == num_segments -1)?n_count:segs[tid+1];
+
+    int n_ele = s_p1 - s;
+
+    seg_out.template get<1>(tid) = (s != s_p1);
+    output[tid] = init;
+
+    for (int j = 0 ; j < n_ele ; j++)
+    {
+        keys.template get<0>(s + j) = tid;
+    }
+}
+
+template<typename output_it, typename out_tmp_type ,typename segs_type>
+__global__ void realign_output(output_it out, out_tmp_type out_tmp, segs_type segs, int num_segments)
+{
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (tid >= num_segments)    {return;}
+
+    int t = segs.template get<2>(tid);
+    int to_copy = segs.template get<1>(tid);
+
+    auto op = out_tmp.template get<0>(t);
+
+    if (to_copy == 1)
+    {out[tid] = op;}
+}
+
+ namespace openfpm
+ {
+    template<typename input_it,
+             typename segments_it, typename output_it, typename op_t, typename type_t>
+    void segreduce(input_it input, int count, segments_it segments,
+                    int num_segments, output_it output, op_t op, type_t init,
+                    mgpu::ofp_context_t & context)
+     {
+ #ifdef CUDA_ON_CPU
+ 
+        int i = 0;
+        for ( ; i < num_segments - 1; i++)
+        {
+            int j = segments[i];
+            output[i] = init;
+            if (j == segments[i+1]) {continue;}
+            output[i] = input[j];
+            ++j;
+            for ( ; j < segments[i+1] ; j++)
+            {
+                output[i] = op(output[i],input[j]);
+            }
+        }
+
+        // Last segment
+        int j = segments[i];
+        if (j != count)
+        {
+            output[i] = input[j];
+            ++j;
+            for ( ; j < count ; j++)
+            {
+                output[i] = op(output[i],input[j]);
+            }
+        }
+ 
+ #else
+
+        #ifdef __HIP__
+
+            typedef typename std::remove_pointer<segments_it>::type index_type;
+            typedef typename std::remove_pointer<output_it>::type out_type;
+
+            openfpm::vector_gpu<aggregate<index_type>> keys;
+            keys.resize(count);
+
+            openfpm::vector_gpu<aggregate<index_type,index_type,index_type>> segs_out;
+            segs_out.resize(num_segments);
+
+            openfpm::vector_gpu<aggregate<out_type>> out_tmp;
+            out_tmp.resize(num_segments);
+
+            grid_sm<1,void> g(num_segments);
+
+            auto it = g.getGPUIterator();
+
+            CUDA_LAUNCH(seg_to_keys,it,segments,keys.toKernel(),segs_out.toKernel(),output,count,num_segments,init);
+
+            openfpm::scan((index_type *)segs_out.template getDeviceBuffer<1>(),num_segments,(index_type *)segs_out.template getDeviceBuffer<2>(),context);
+
+            thrust::pair<index_type *,out_type *> new_end;
+            new_end = thrust::reduce_by_key(thrust::device, (segments_it)keys.template getDeviceBuffer<0>(),((segments_it)keys.template getDeviceBuffer<0>()) + count, 
+                                            input, 
+                                            (segments_it)segs_out.template getDeviceBuffer<0>(), 
+                                            (output_it)out_tmp.template getDeviceBuffer<0>(),
+                                            thrust::equal_to<int>(),
+                                            op);
+
+            // .. Not so easy to emulate a segmented reduce we have to track the zeros segments and realign the output
+
+            CUDA_LAUNCH(realign_output,it,output,out_tmp.toKernel(),segs_out.toKernel(),num_segments);
+
+        #else
+
+            mgpu::segreduce(input,count,segments,num_segments,output,op,init,context);
+
+        #endif
+
+ #endif
+     }
+ }
+ 
+ #endif /* __NVCC__ */
+ 
+ #endif /* SCAN_OFP_HPP_ */
+ 
\ No newline at end of file
diff --git a/src/util/cuda/sort_ofp.cuh b/src/util/cuda/sort_ofp.cuh
index 8a83b80a2eaf0b962472402414f0254c6d7ffe8c..e7939c60cf6f9d2372b728edda2708599e6fbc98 100644
--- a/src/util/cuda/sort_ofp.cuh
+++ b/src/util/cuda/sort_ofp.cuh
@@ -16,7 +16,11 @@
 #if CUDART_VERSION >= 11000
 	#ifndef CUDA_ON_CPU 
 	// Here we have for sure CUDA >= 11
-	#include "cub/cub.cuh"
+	#ifdef __HIP__
+		#include "hipcub/hipcub.hpp"
+	#else
+		#include "cub/cub.cuh"
+	#endif
 	#ifndef SORT_WITH_CUB
 		#define SORT_WITH_CUB
 	#endif
@@ -125,6 +129,14 @@ struct key_val_it
 	key_t * key;
 	val_t * val;
 
+
+	key_val_it & operator+=(int delta)
+	{
+		key += delta;
+		val += delta;
+		return *this;
+	}
+
 	bool operator==(const key_val_it & tmp)
 	{
 		return (key == tmp.key && val == tmp.val);
@@ -193,7 +205,17 @@ struct key_val_it
 		return key < tmp.key;
 	}
 
-	key_val_it<key_t,val_t> & operator=(const key_val_it<key_t,val_t> & tmp)
+	bool operator>(const key_val_it & tmp) const
+	{
+		return key > tmp.key;
+	}
+
+	bool operator>=(const key_val_it & tmp) const
+	{
+		return key >= tmp.key;
+	}
+
+	key_val_it<key_t,val_t> & operator=(key_val_it<key_t,val_t> & tmp)
 	{
 		key = tmp.key;
 		val = tmp.val;
@@ -255,66 +277,134 @@ namespace openfpm
 
 	#ifdef SORT_WITH_CUB
 
-			void *d_temp_storage = NULL;
-			size_t temp_storage_bytes = 0;
-
-			auto & temporal2 = context.getTemporalCUB2();
-			temporal2.resize(sizeof(key_t)*count);
-
-			auto & temporal3 = context.getTemporalCUB3();
-			temporal3.resize(sizeof(val_t)*count);
-
-			if (std::is_same<mgpu::template less_t<key_t>,comp_t>::value == true)
-			{
-				cub::DeviceRadixSort::SortPairs(d_temp_storage, 
-												temp_storage_bytes,
-												keys_input,
-												(key_t *)temporal2.template getDeviceBuffer<0>(),
-												vals_input,
-												(val_t *)temporal3.template getDeviceBuffer<0>(),
-												count);
-
-				auto & temporal = context.getTemporalCUB();
-				temporal.resize(temp_storage_bytes);
-
-				d_temp_storage = temporal.template getDeviceBuffer<0>();
-
-				// Run
-				cub::DeviceRadixSort::SortPairs(d_temp_storage, 
-												temp_storage_bytes,
-												keys_input,
-												(key_t *)temporal2.template getDeviceBuffer<0>(),
-												vals_input,
-												(val_t *)temporal3.template getDeviceBuffer<0>(),
-												count);
-			}
-			else if (std::is_same<mgpu::template greater_t<key_t>,comp_t>::value == true)
-			{
-				cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, 
-												temp_storage_bytes,
-												keys_input,
-												(key_t *)temporal2.template getDeviceBuffer<0>(),
-												vals_input,
-												(val_t *)temporal3.template getDeviceBuffer<0>(),
-												count);
-
-				auto & temporal = context.getTemporalCUB();
-				temporal.resize(temp_storage_bytes);
-
-				d_temp_storage = temporal.template getDeviceBuffer<0>();
-
-				// Run
-				cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, 
-												temp_storage_bytes,
-												keys_input,
-												(key_t *)temporal2.template getDeviceBuffer<0>(),
-												vals_input,
-												(val_t *)temporal3.template getDeviceBuffer<0>(),
-												count);
-			}
-
-			cudaMemcpy(keys_input,temporal2.getDeviceBuffer<0>(),sizeof(key_t)*count,cudaMemcpyDeviceToDevice);
-			cudaMemcpy(vals_input,temporal3.getDeviceBuffer<0>(),sizeof(val_t)*count,cudaMemcpyDeviceToDevice);
+			#ifdef __HIP__
+
+				void *d_temp_storage = NULL;
+				size_t temp_storage_bytes = 0;
+
+				auto & temporal2 = context.getTemporalCUB2();
+				temporal2.resize(sizeof(key_t)*count);
+
+				auto & temporal3 = context.getTemporalCUB3();
+				temporal3.resize(sizeof(val_t)*count);
+
+				if (std::is_same<mgpu::template less_t<key_t>,comp_t>::value == true)
+				{
+					hipcub::DeviceRadixSort::SortPairs(d_temp_storage, 
+													temp_storage_bytes,
+													keys_input,
+													(key_t *)temporal2.template getDeviceBuffer<0>(),
+													vals_input,
+													(val_t *)temporal3.template getDeviceBuffer<0>(),
+													count);
+
+					auto & temporal = context.getTemporalCUB();
+					temporal.resize(temp_storage_bytes);
+
+					d_temp_storage = temporal.template getDeviceBuffer<0>();
+
+					// Run
+					hipcub::DeviceRadixSort::SortPairs(d_temp_storage, 
+													temp_storage_bytes,
+													keys_input,
+													(key_t *)temporal2.template getDeviceBuffer<0>(),
+													vals_input,
+													(val_t *)temporal3.template getDeviceBuffer<0>(),
+													count);
+				}
+				else if (std::is_same<mgpu::template greater_t<key_t>,comp_t>::value == true)
+				{
+					hipcub::DeviceRadixSort::SortPairsDescending(d_temp_storage, 
+													temp_storage_bytes,
+													keys_input,
+													(key_t *)temporal2.template getDeviceBuffer<0>(),
+													vals_input,
+													(val_t *)temporal3.template getDeviceBuffer<0>(),
+													count);
+
+					auto & temporal = context.getTemporalCUB();
+					temporal.resize(temp_storage_bytes);
+
+					d_temp_storage = temporal.template getDeviceBuffer<0>();
+
+					// Run
+					hipcub::DeviceRadixSort::SortPairsDescending(d_temp_storage, 
+													temp_storage_bytes,
+													keys_input,
+													(key_t *)temporal2.template getDeviceBuffer<0>(),
+													vals_input,
+													(val_t *)temporal3.template getDeviceBuffer<0>(),
+													count);
+				}
+
+				cudaMemcpy(keys_input,temporal2.getDeviceBuffer<0>(),sizeof(key_t)*count,cudaMemcpyDeviceToDevice);
+				cudaMemcpy(vals_input,temporal3.getDeviceBuffer<0>(),sizeof(val_t)*count,cudaMemcpyDeviceToDevice);
+			
+
+			#else
+
+				void *d_temp_storage = NULL;
+				size_t temp_storage_bytes = 0;
+
+				auto & temporal2 = context.getTemporalCUB2();
+				temporal2.resize(sizeof(key_t)*count);
+
+				auto & temporal3 = context.getTemporalCUB3();
+				temporal3.resize(sizeof(val_t)*count);
+
+				if (std::is_same<mgpu::template less_t<key_t>,comp_t>::value == true)
+				{
+					cub::DeviceRadixSort::SortPairs(d_temp_storage, 
+													temp_storage_bytes,
+													keys_input,
+													(key_t *)temporal2.template getDeviceBuffer<0>(),
+													vals_input,
+													(val_t *)temporal3.template getDeviceBuffer<0>(),
+													count);
+
+					auto & temporal = context.getTemporalCUB();
+					temporal.resize(temp_storage_bytes);
+
+					d_temp_storage = temporal.template getDeviceBuffer<0>();
+
+					// Run
+					cub::DeviceRadixSort::SortPairs(d_temp_storage, 
+													temp_storage_bytes,
+													keys_input,
+													(key_t *)temporal2.template getDeviceBuffer<0>(),
+													vals_input,
+													(val_t *)temporal3.template getDeviceBuffer<0>(),
+													count);
+				}
+				else if (std::is_same<mgpu::template greater_t<key_t>,comp_t>::value == true)
+				{
+					cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, 
+													temp_storage_bytes,
+													keys_input,
+													(key_t *)temporal2.template getDeviceBuffer<0>(),
+													vals_input,
+													(val_t *)temporal3.template getDeviceBuffer<0>(),
+													count);
+
+					auto & temporal = context.getTemporalCUB();
+					temporal.resize(temp_storage_bytes);
+
+					d_temp_storage = temporal.template getDeviceBuffer<0>();
+
+					// Run
+					cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, 
+													temp_storage_bytes,
+													keys_input,
+													(key_t *)temporal2.template getDeviceBuffer<0>(),
+													vals_input,
+													(val_t *)temporal3.template getDeviceBuffer<0>(),
+													count);
+				}
+
+				cudaMemcpy(keys_input,temporal2.getDeviceBuffer<0>(),sizeof(key_t)*count,cudaMemcpyDeviceToDevice);
+				cudaMemcpy(vals_input,temporal3.getDeviceBuffer<0>(),sizeof(val_t)*count,cudaMemcpyDeviceToDevice);
+				
+			#endif
 
 	#else
 			mgpu::mergesort(keys_input,vals_input,count,comp,context);
diff --git a/src/util/mathutil.hpp b/src/util/mathutil.hpp
index f1adaa8877e4025fe6e3bccb62deb90d4116fc75..1b05785a90aeac2155b2bae58745656c1a23daf4 100644
--- a/src/util/mathutil.hpp
+++ b/src/util/mathutil.hpp
@@ -108,6 +108,24 @@ namespace openfpm
 			return (exponent == 0) ? 1 : (base * pow(base, exponent-1));
 		}
 
+        template<class T>
+        double intpowlog(const T x, unsigned const e)
+        {
+            if (e == 0) return 1.0;
+            if (e % 2 == 0)
+            {
+                double h = intpowlog(x, e / 2);
+                return h * h;
+            }
+            else
+            {
+                double h = intpowlog(x, e / 2);
+                return h * h * x;
+            }
+        }
+
+
+
 		/* \brief Return the positive modulo of a number
 		 *
 		 * # Example
diff --git a/src/util/multi_array_openfpm/multi_array_ref_subarray_openfpm.hpp b/src/util/multi_array_openfpm/multi_array_ref_subarray_openfpm.hpp
index 744efeacd19e7df2518ec103884e3d1552e6656c..2f412bb04aab1ddc448f47a1bfa90bcf97eb36cc 100644
--- a/src/util/multi_array_openfpm/multi_array_ref_subarray_openfpm.hpp
+++ b/src/util/multi_array_openfpm/multi_array_ref_subarray_openfpm.hpp
@@ -119,7 +119,7 @@ public:
   inline __host__ __device__ size_type size() const { return boost::mpl::at<vector,boost::mpl::int_<0>>::type::value; }
   size_type max_size() const { return num_elements(); }
   bool empty() const { return size() == 0; }
-  size_type num_dimensions() const { return NumDims; }
+  inline __device__ __host__ size_type num_dimensions() const { return NumDims; }
   inline __host__ __device__ const index* strides() const { return strides_; }
 
   size_type num_elements() const
@@ -214,6 +214,7 @@ public:
     return *this;
   }
 
+  __device__ __host__ T* origin_mutable() const { return const_cast<T*>(this->base_); }
   __device__ __host__ T* origin() { return this->base_; }
   __device__ __host__ const T* origin() const { return this->base_; }
 
diff --git a/src/util/object_creator.hpp b/src/util/object_creator.hpp
index aafc92b9f13cf37576be9421b0f4825cd52bdace..6ecad2b5270be83b341d31849937f6afe085c7f7 100644
--- a/src/util/object_creator.hpp
+++ b/src/util/object_creator.hpp
@@ -300,6 +300,26 @@ struct object_creator_chunking_impl<v,vc,p1,prp...>
 	typedef typename boost::mpl::push_front<vc_step, ele >::type type;
 };
 
+template <typename T>
+struct object_creator_chunking_encapsulator
+{
+	typedef T type;
+};
+
+template <typename type>
+struct get_base_type_for_object_creator_chunking
+{
+	typedef object_creator_chunking_encapsulator<typename type::value_type> ele;
+};
+
+
+template <unsigned int N1, std::size_t N2, typename T>
+struct get_base_type_for_object_creator_chunking< std::array<T, N2>[N1] >
+{
+	typedef object_creator_chunking_encapsulator<T[N1]> ele;
+};
+
+
 /*! \brief Implementation of object creator
  *
  * \tparam v original boost::fusion::vector
@@ -312,10 +332,10 @@ struct object_creator_chunking_impl<v,vc,prp>
 	typedef typename boost::remove_reference< typename boost::mpl::at< v,boost::mpl::int_<prp> >::type>::type ele_array;
 
 	// Element without array
-	typedef typename ele_array::value_type ele;
+	typedef typename get_base_type_for_object_creator_chunking<ele_array>::ele ele;
 
 	// push on the vector the element p1
-	typedef typename boost::mpl::push_front<vc, ele >::type type;
+	typedef typename boost::mpl::push_front<vc, typename ele::type >::type type;
 };
 
 
diff --git a/src/util/object_s_di.hpp b/src/util/object_s_di.hpp
index 905449e7a94d11f82ced620e71fec75b4db599c6..b26d391b1e0b331c0f551465f337eb16cd79e763 100644
--- a/src/util/object_s_di.hpp
+++ b/src/util/object_s_di.hpp
@@ -15,7 +15,28 @@
 #include <boost/mpl/range_c.hpp>
 #include <boost/fusion/include/size.hpp>
 
-template <typename> struct Debug;
+template<typename T>
+struct object_s_di_e_cnk_meta_copy_selector
+{
+	template<unsigned int T_value, typename v_prp, typename copy_rtype, typename Tsrc, typename Tdst>
+	static inline void copy(const Tsrc & src, Tdst & dst, int sub_id)
+	{
+		meta_copy<typename copy_rtype::value_type>::meta_copy_(src.template get<T_value>(),dst.template get<boost::mpl::at<v_prp,boost::mpl::int_<T_value>>::type::value>()[sub_id]);
+	}
+};
+
+template<unsigned int N1, typename T>
+struct object_s_di_e_cnk_meta_copy_selector<T[N1]>
+{
+	template<unsigned int T_value, typename v_prp, typename copy_rtype, typename Tsrc, typename Tdst>
+	static inline void copy(const Tsrc & src, Tdst & dst, int sub_id)
+	{
+		for (int i = 0 ; i < N1 ; i++)
+		{
+			meta_copy<typename T::value_type>::meta_copy_(src.template get<T_value>()[i],dst.template get<boost::mpl::at<v_prp,boost::mpl::int_<T_value>>::type::value>()[i][sub_id]);
+		}
+	}
+};
 
 /*! \brief this class is a functor for "for_each" algorithm
  *
@@ -60,9 +81,11 @@ struct object_s_di_e_cnk
     void operator()(T& t)
     {
 		// Remove the reference from the type to copy
-		typedef typename boost::remove_reference<decltype(dst.template get<boost::mpl::at<v_prp,boost::mpl::int_<T::value>>::type::value>())>::type::value_type copy_rtype;
+		typedef typename boost::remove_reference<decltype(dst.template get<boost::mpl::at<v_prp,boost::mpl::int_<T::value>>::type::value>())>::type copy_rtype;
+
+    	//meta_copy<copy_rtype>::meta_copy_(src.template get<T::value>(),dst.template get<boost::mpl::at<v_prp,boost::mpl::int_<T::value>>::type::value>()[sub_id]);
 
-    	meta_copy<copy_rtype>::meta_copy_(src.template get<T::value>(),dst.template get<boost::mpl::at<v_prp,boost::mpl::int_<T::value>>::type::value>()[sub_id]);
+		object_s_di_e_cnk_meta_copy_selector<copy_rtype>::template copy<T::value,v_prp,copy_rtype>(src,dst,sub_id);
     }
 };
 
diff --git a/src/util/object_si_d.hpp b/src/util/object_si_d.hpp
index 17a4034b903034f3d3c417d6edd223601b880d4a..78490647478a3c1832e4b9580a0ad7148e027213 100644
--- a/src/util/object_si_d.hpp
+++ b/src/util/object_si_d.hpp
@@ -59,6 +59,29 @@ struct object_si_d_e
     }
 };
 
+template<typename T>
+struct object_si_d_e_cnk_meta_copy_selector
+{
+	template<unsigned int T_value, typename v_prp, typename ctype, typename Tsrc, typename Tdst>
+	static inline void copy(const Tsrc & src, Tdst & dst, int sub_id)
+	{
+		meta_copy<ctype>::meta_copy_(src.template get<boost::mpl::at<v_prp,boost::mpl::int_<T_value>>::type::value>()[sub_id],dst.template get<T_value>());
+	}
+};
+
+template<unsigned int N1, typename T>
+struct object_si_d_e_cnk_meta_copy_selector<T[N1]>
+{
+	template<unsigned int T_value, typename v_prp, typename ctype, typename Tsrc, typename Tdst>
+	static inline void copy(const Tsrc & src, Tdst & dst, int sub_id)
+	{
+		for (int i = 0 ; i < N1 ; i++)
+		{
+			meta_copy<T>::meta_copy_(src.template get<boost::mpl::at<v_prp,boost::mpl::int_<T_value>>::type::value>()[i][sub_id],dst.template get<T_value>()[i]);
+		}
+	}
+};
+
 /*! \brief this class is a functor for "for_each" algorithm
  *
  * This class is a functor for "for_each" algorithm. For each
@@ -102,7 +125,7 @@ struct object_si_d_e_cnk
     {
     	typedef typename boost::mpl::at<typename v_dst::type,typename boost::mpl::int_<T::value>>::type ctype;
 
-    	meta_copy<ctype>::meta_copy_(src.template get<boost::mpl::at<v_prp,boost::mpl::int_<T::value>>::type::value>()[sub_id],dst.template get<T::value>());
+		object_si_d_e_cnk_meta_copy_selector<ctype>::template copy<T::value,v_prp,ctype>(src,dst,sub_id);
     }
 };
 
diff --git a/src/util/performance/performance_util.hpp b/src/util/performance/performance_util.hpp
index 4468ee45e7ad4208c169bfd2f44d4dfb7858f693..f90ec5c3d8f20f9218b0f5d32f760c24a37959b9 100644
--- a/src/util/performance/performance_util.hpp
+++ b/src/util/performance/performance_util.hpp
@@ -12,8 +12,9 @@
 #include <boost/filesystem.hpp>
 #include <boost/property_tree/ptree.hpp>
 #include <boost/property_tree/xml_parser.hpp>
+#include <fstream>
 
-static void addUpdtateTime(GoogleChart & cg, int np)
+static void addUpdateTime(GoogleChart & cg, int np, const std::string & base, const std::string & filename)
 {
     time_t t = time(0);   // get time now
     struct tm * now = localtime( & t );
@@ -21,12 +22,36 @@ static void addUpdtateTime(GoogleChart & cg, int np)
     std::stringstream str;
 
     std::string commit;
-    commit = exec("git rev-parse HEAD");
+
+    std::system("git rev-parse HEAD >test.txt"); // write the current commit hash into test.txt
+    std::ifstream("test.txt") >> commit;
+
+    std::string prev_commit;
+
+    std::system(std::string("cat commit_f_" + base + " > test.txt").c_str()); // copy the previously recorded commit hash for this benchmark into test.txt
+    std::ifstream("test.txt") >> prev_commit;
 
     str << "<h3>Updated: " << now->tm_mday << "/" << now->tm_mon + 1 << "/" << now->tm_year+1900 << "     " << now->tm_hour << ":" << now->tm_min << ":"
-    		               << now->tm_sec << "  commit: " << commit << "   run with: " << np << " processes" << std::endl;
+                               << now->tm_sec << "  commit: " << commit << "   run with: " << np << " processes<br>previous: <a href=\"" << filename << "_" << prev_commit << ".html\">here</a>" << "</h3>" << std::endl;
+
+    cg.addHTML(str.str());
+}
+
+static void createCommitFile(const std::string & tmp)
+{
+    time_t t = time(0);   // get time now
+    struct tm * now = localtime( & t );
+
+    std::stringstream str;
+
+    std::string commit;
+    commit = exec("git rev-parse HEAD");
+	std::cout << tmp << " " << commit << std::endl;
+
+	std::ofstream f("commit_f_" + tmp);
+	f << commit << std::endl;
 
-	cg.addHTML(str.str());
+	f.close();
 }
 
 static inline void warning_set(int & warning_level, double mean, double mean_ref, double sigma)
diff --git a/src/util/test/util_test.hpp b/src/util/test/util_test.hpp
index 5960f6c5d945d11e1f777789af264714c43f1a74..842d24f234af58fbece45168a4f70e0da160eea5 100644
--- a/src/util/test/util_test.hpp
+++ b/src/util/test/util_test.hpp
@@ -586,9 +586,9 @@ BOOST_AUTO_TEST_CASE( check_templates_util_function )
 		}
 
 		{
-		//! [Check has_value_type]
+		//! [Check has_value_type_ofp]
 
-		struct test_has_value_type
+		struct test_has_value_type_ofp
 		{
 			typedef int value_type;
 		};
@@ -598,12 +598,12 @@ BOOST_AUTO_TEST_CASE( check_templates_util_function )
 
 		};
 
-		int val = has_value_type_ofp<test_has_value_type>::value;
+		int val = has_value_type_ofp<test_has_value_type_ofp>::value;
 		BOOST_REQUIRE_EQUAL(val, true);
 		val = has_value_type_ofp<test_has_no_value_type>::value;
 		BOOST_REQUIRE_EQUAL(val, false);
 
-		//! [Check has_value_type]
+		//! [Check has_value_type_ofp]
 		}