From 5edbb9578d6de07f5435fe6c4efaf283dfd719dc Mon Sep 17 00:00:00 2001
From: Pietro Incardona <incardon@mpi-cbg.de>
Date: Thu, 30 Aug 2018 18:10:14 +0200
Subject: [PATCH] Adding GPU_branch_start

---
 CMakeFiles/FindEigen3.cmake                   |  108 ++
 CMakeFiles/FindLibHilbert.cmake               |   99 ++
 CMakeFiles/FindMetis.cmake                    |  187 +++
 CMakeFiles/FindPETSc.cmake                    |  212 +++
 CMakeFiles/FindParMetis.cmake                 |  164 +++
 .../Vortex_in_cell/main_vic_petsc_double.cpp  | 1287 +++++++++++++++++
 example/Vector/7_SPH_dlb_opt/main_dbg.cpp     | 1254 ++++++++++++++++
 openfpm_data                                  |    2 +-
 script/install_EIGEN.sh                       |   15 +-
 script/install_OPENBLAS.sh                    |    4 +-
 script/remove_old                             |    9 +-
 src/Decomposition/CartDecomposition.hpp       |   50 +-
 .../cuda/CartDecomposition_gpu.cuh            |   27 +-
 .../cuda/decomposition_cuda_tests.cu          |   66 -
 src/Decomposition/cuda/ie_ghost_gpu.cuh       |  126 ++
 src/Decomposition/ie_ghost.hpp                |  131 +-
 src/Makefile.am                               |    2 +-
 src/Vector/cuda/vector_dist_cuda_func_test.cu |  189 +++
 src/Vector/cuda/vector_dist_cuda_funcs.cuh    |   41 +
 src/Vector/cuda/vector_dist_gpu_unit_tests.cu |    7 +-
 src/Vector/vector_dist.hpp                    |    4 +-
 src/Vector/vector_dist_comm.hpp               |  123 +-
 src/Vector/vector_dist_kernel.hpp             |  114 ++
 src/config/config_cmake.h.in                  |  159 ++
 src/main_single.cpp                           |   32 +
 25 files changed, 4213 insertions(+), 199 deletions(-)
 create mode 100644 CMakeFiles/FindEigen3.cmake
 create mode 100644 CMakeFiles/FindLibHilbert.cmake
 create mode 100644 CMakeFiles/FindMetis.cmake
 create mode 100644 CMakeFiles/FindPETSc.cmake
 create mode 100644 CMakeFiles/FindParMetis.cmake
 create mode 100644 example/Numerics/Vortex_in_cell/main_vic_petsc_double.cpp
 create mode 100644 example/Vector/7_SPH_dlb_opt/main_dbg.cpp
 create mode 100644 src/Decomposition/cuda/ie_ghost_gpu.cuh
 create mode 100644 src/Vector/vector_dist_kernel.hpp
 create mode 100644 src/config/config_cmake.h.in
 create mode 100644 src/main_single.cpp

diff --git a/CMakeFiles/FindEigen3.cmake b/CMakeFiles/FindEigen3.cmake
new file mode 100644
index 00000000..a2857c10
--- /dev/null
+++ b/CMakeFiles/FindEigen3.cmake
@@ -0,0 +1,108 @@
+# - Try to find Eigen3 lib
+#
+# This module supports requiring a minimum version, e.g. you can do
+#   find_package(Eigen3 3.1.2)
+# to require version 3.1.2 or newer of Eigen3.
+#
+# Once done this will define
+#
+#  EIGEN3_FOUND - system has eigen lib with correct version
+#  EIGEN3_INCLUDE_DIR - the eigen include directory
+#  EIGEN3_VERSION - eigen version
+#
+# and the following imported target:
+#
+#  Eigen3::Eigen - The header-only Eigen library
+#
+# This module reads hints about search locations from 
+# the following environment variables:
+#
+# EIGEN3_ROOT
+# EIGEN3_ROOT_DIR
+
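+# A minimal usage sketch (the target name my_app is hypothetical):
+#
+#   find_package(Eigen3 3.1.2 REQUIRED)
+#   add_executable(my_app main.cpp)
+#   target_link_libraries(my_app Eigen3::Eigen)
+#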
+# Copyright (c) 2006, 2007 Montel Laurent, <montel@kde.org>
+# Copyright (c) 2008, 2009 Gael Guennebaud, <g.gael@free.fr>
+# Copyright (c) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+# Redistribution and use is allowed according to the terms of the 2-clause BSD license.
+
+if(NOT Eigen3_FIND_VERSION)
+  if(NOT Eigen3_FIND_VERSION_MAJOR)
+    set(Eigen3_FIND_VERSION_MAJOR 2)
+  endif(NOT Eigen3_FIND_VERSION_MAJOR)
+  if(NOT Eigen3_FIND_VERSION_MINOR)
+    set(Eigen3_FIND_VERSION_MINOR 91)
+  endif(NOT Eigen3_FIND_VERSION_MINOR)
+  if(NOT Eigen3_FIND_VERSION_PATCH)
+    set(Eigen3_FIND_VERSION_PATCH 0)
+  endif(NOT Eigen3_FIND_VERSION_PATCH)
+
+  set(Eigen3_FIND_VERSION "${Eigen3_FIND_VERSION_MAJOR}.${Eigen3_FIND_VERSION_MINOR}.${Eigen3_FIND_VERSION_PATCH}")
+endif(NOT Eigen3_FIND_VERSION)
+
+macro(_eigen3_check_version)
+  file(READ "${EIGEN3_INCLUDE_DIR}/Eigen/src/Core/util/Macros.h" _eigen3_version_header)
+
+  string(REGEX MATCH "define[ \t]+EIGEN_WORLD_VERSION[ \t]+([0-9]+)" _eigen3_world_version_match "${_eigen3_version_header}")
+  set(EIGEN3_WORLD_VERSION "${CMAKE_MATCH_1}")
+  string(REGEX MATCH "define[ \t]+EIGEN_MAJOR_VERSION[ \t]+([0-9]+)" _eigen3_major_version_match "${_eigen3_version_header}")
+  set(EIGEN3_MAJOR_VERSION "${CMAKE_MATCH_1}")
+  string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen3_minor_version_match "${_eigen3_version_header}")
+  set(EIGEN3_MINOR_VERSION "${CMAKE_MATCH_1}")
+
+  set(EIGEN3_VERSION ${EIGEN3_WORLD_VERSION}.${EIGEN3_MAJOR_VERSION}.${EIGEN3_MINOR_VERSION})
+  if(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION})
+    set(EIGEN3_VERSION_OK FALSE)
+  else(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION})
+    set(EIGEN3_VERSION_OK TRUE)
+  endif(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION})
+
+  if(NOT EIGEN3_VERSION_OK)
+
+    message(STATUS "Eigen3 version ${EIGEN3_VERSION} found in ${EIGEN3_INCLUDE_DIR}, "
+                   "but at least version ${Eigen3_FIND_VERSION} is required")
+  endif(NOT EIGEN3_VERSION_OK)
+endmacro(_eigen3_check_version)
+
+if (EIGEN3_INCLUDE_DIR)
+
+  # in cache already
+  _eigen3_check_version()
+  set(EIGEN3_FOUND ${EIGEN3_VERSION_OK})
+  set(Eigen3_FOUND ${EIGEN3_VERSION_OK})
+
+else (EIGEN3_INCLUDE_DIR)
+  
+  # search first if an Eigen3Config.cmake is available in the system,
+  # if successful this would set EIGEN3_INCLUDE_DIR and the rest of
+  # the script will work as usual
+  find_package(Eigen3 ${Eigen3_FIND_VERSION} NO_MODULE QUIET)
+
+  if(NOT EIGEN3_INCLUDE_DIR)
+    find_path(EIGEN3_INCLUDE_DIR NAMES signature_of_eigen3_matrix_library
+        HINTS
+        ENV EIGEN3_ROOT 
+        ENV EIGEN3_ROOT_DIR
+        PATHS
+        ${CMAKE_INSTALL_PREFIX}/include
+        ${KDE4_INCLUDE_DIR}
+        PATH_SUFFIXES eigen3 eigen
+      )
+  endif(NOT EIGEN3_INCLUDE_DIR)
+
+  if(EIGEN3_INCLUDE_DIR)
+    _eigen3_check_version()
+  endif(EIGEN3_INCLUDE_DIR)
+
+  include(FindPackageHandleStandardArgs)
+  find_package_handle_standard_args(Eigen3 DEFAULT_MSG EIGEN3_INCLUDE_DIR EIGEN3_VERSION_OK)
+
+  mark_as_advanced(EIGEN3_INCLUDE_DIR)
+
+endif(EIGEN3_INCLUDE_DIR)
+
+if(EIGEN3_FOUND AND NOT TARGET Eigen3::Eigen)
+  add_library(Eigen3::Eigen INTERFACE IMPORTED)
+  set_target_properties(Eigen3::Eigen PROPERTIES
+    INTERFACE_INCLUDE_DIRECTORIES "${EIGEN3_INCLUDE_DIR}")
+endif()
+
diff --git a/CMakeFiles/FindLibHilbert.cmake b/CMakeFiles/FindLibHilbert.cmake
new file mode 100644
index 00000000..9b6f2b35
--- /dev/null
+++ b/CMakeFiles/FindLibHilbert.cmake
@@ -0,0 +1,99 @@
+# - Try to find LibHilbert
+# Once done this will define
+#
+#  LIBHILBERT_FOUND        - system has LibHilbert
+#  LIBHILBERT_INCLUDE_DIRS - include directories for LibHilbert
+#  LIBHILBERT_LIBRARY_DIRS - library directories for LibHilbert
+#  LIBHILBERT_LIBRARIES    - libraries for LibHilbert
+#
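+# A minimal usage sketch (the target name my_app and the LIBHILBERT_ROOT value are
+# hypothetical; the module expects LIBHILBERT_ROOT to point to the installation prefix):
+#
+#   set(LIBHILBERT_ROOT /opt/libhilbert)
+#   find_package(LibHilbert)
+#   if(LIBHILBERT_FOUND)
+#     include_directories(${LIBHILBERT_INCLUDE_DIRS})
+#     link_directories(${LIBHILBERT_LIBRARY_DIRS})
+#     target_link_libraries(my_app ${LIBHILBERT_LIBRARIES})
+#   endif()
+#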
+#=============================================================================
+# Copyright (C) 2010-2016 Pietro Incardona
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in
+#    the documentation and/or other materials provided with the
+#    distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#=============================================================================
+
+
+if (LIBHILBERT_FOUND)
+	return()
+endif()
+
+add_library(libhilbert INTERFACE IMPORTED)
+
+# Add libraries (static)
+set(_libs "-L${LIBHILBERT_ROOT}/lib -llibhilbert")
+set_property(TARGET libhilbert PROPERTY INTERFACE_LINK_LIBRARIES "${_libs}")
+
+
+# Create LibHilbert test program
+set(LIBHILBERT_TEST_LIB_CPP
+    "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/libhilbert_test_lib.cpp")
+
+
+file(WRITE ${LIBHILBERT_TEST_LIB_CPP} "
+extern \"C\"
+{
+#include \"hilbertKey.h\"
+}
+  int main()
+{
+  //An integer to handle errors
+  int err;
+
+  //Array to handle output
+  uint64_t nextCoord[2];
+
+  //Get the coordinates of the next cell
+  getIntCoordFromHKey(nextCoord, 4, 2, 0, &err);
+
+  return 0;
+}
+")
+
+# Try to run test program (static linking)
+try_run(
+	LIBHILBERT_TEST_LIB_EXITCODE
+	LIBHILBERT_TEST_LIB_COMPILED
+      	${CMAKE_CURRENT_BINARY_DIR}
+        ${LIBHILBERT_TEST_LIB_CPP}
+      CMAKE_FLAGS
+      "-DINCLUDE_DIRECTORIES:STRING=${LIBHILBERT_ROOT}/include"
+      "-DLINK_LIBRARIES:STRING=${LIBHILBERT_ROOT}/lib"
+      LINK_LIBRARIES libhilbert
+      COMPILE_OUTPUT_VARIABLE LIBHILBERT_TEST_LIB_COMPILE_OUTPUT
+      RUN_OUTPUT_VARIABLE LIBHILBERT_TEST_LIB_OUTPUT)
+
+if (LIBHILBERT_TEST_LIB_COMPILED AND LIBHILBERT_TEST_LIB_EXITCODE EQUAL 0)
+	    message(STATUS "Test LibHilbert_TEST_RUNS static linking - Success")
+	    set(LIBHILBERT_TEST_RUNS TRUE)
+	    set(LIBHILBERT_FOUND TRUE)
+	    set(LIBHILBERT_INCLUDE_DIRS ${LIBHILBERT_ROOT}/include)
+	    set(LIBHILBERT_LIBRARY_DIRS ${LIBHILBERT_ROOT}/lib)
+	    set(LIBHILBERT_LIBRARIES -llibhilbert)
+else()
+	    message(STATUS "Test LibHilbert_TEST_RUNS static linking - Failed")
+	    set(LIBHILBERT_TEST_RUNS FALSE)
+endif()
+
diff --git a/CMakeFiles/FindMetis.cmake b/CMakeFiles/FindMetis.cmake
new file mode 100644
index 00000000..cab0b085
--- /dev/null
+++ b/CMakeFiles/FindMetis.cmake
@@ -0,0 +1,187 @@
+# -*- mode: cmake -*-
+#
+# METIS Find Module for Femus
+# Shamelessly stolen from Amanzi open source code https://software.lanl.gov/ascem/trac
+#
+# Usage:
+#    Control the search through METIS_DIR or setting environment variable
+#    METIS_ROOT to the METIS installation prefix.
+#
+#    This module does not search default paths!
+#
+#    Following variables are set:
+#    METIS_FOUND            (BOOL)       Flag indicating if METIS was found
+#    METIS_INCLUDE_DIR      (PATH)       Path to the METIS include file
+#    METIS_INCLUDE_DIRS     (LIST)       List of all required include files
+#    METIS_LIBRARY_DIR      (PATH)       Path to the METIS library
+#    METIS_LIBRARY          (FILE)       METIS library
+#    METIS_LIBRARIES        (LIST)       List of all required METIS libraries
+#
+# #############################################################################
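+
+# A minimal usage sketch (the target name my_app is hypothetical; METIS_DIR controls the search):
+#
+#   set(METIS_DIR /opt/metis)
+#   find_package(Metis)
+#   if(METIS_FOUND)
+#     target_include_directories(my_app PRIVATE ${METIS_INCLUDE_DIRS})
+#     target_link_libraries(my_app ${METIS_LIBRARIES})
+#   endif()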
+
+# Standard CMake modules see CMAKE_ROOT/Modules
+include(FindPackageHandleStandardArgs)
+
+# Amanzi CMake functions see <root>/tools/cmake for source
+#include(PrintVariable)
+
+if ( METIS_LIBRARIES AND METIS_INCLUDE_DIRS )
+
+    # Do nothing. Variables are set. No need to search again
+
+else(METIS_LIBRARIES AND METIS_INCLUDE_DIRS)
+
+    # Cache variables
+    if(METIS_DIR)
+        set(METIS_DIR "${METIS_DIR}" CACHE PATH "Path to search for METIS include and library files")
+    endif()
+
+    if(METIS_INCLUDE_DIR)
+        set(METIS_INCLUDE_DIR "${METIS_INCLUDE_DIR}" CACHE PATH "Path to search for METIS include files")
+    endif()
+
+    if(METIS_LIBRARY_DIR)
+        set(METIS_LIBRARY_DIR "${METIS_LIBRARY_DIR}" CACHE PATH "Path to search for METIS library files")
+    endif()
+
+   
+    # Search for include files
+    # Search order preference:
+    #  (1) METIS_INCLUDE_DIR - check existence of path AND if the include files exist
+    #  (2) METIS_DIR/<include>
+    #  (3) Default CMake paths See cmake --html-help=out.html file for more information.
+    #
+    set(metis_inc_names "metis.h") 
+        
+    if (METIS_INCLUDE_DIR)
+
+        if (EXISTS "${METIS_INCLUDE_DIR}")
+
+            find_path(metis_test_include_path
+                      NAMES ${metis_inc_names}
+                      HINTS ${METIS_INCLUDE_DIR}
+                      NO_DEFAULT_PATH)
+            if(NOT metis_test_include_path)
+                message(SEND_ERROR "Can not locate ${metis_inc_names} in ${METIS_INCLUDE_DIR}")
+            endif()
+            set(METIS_INCLUDE_DIR "${metis_test_include_path}")
+
+        else()
+            message(SEND_ERROR "METIS_INCLUDE_DIR=${METIS_INCLUDE_DIR} does not exist")
+            set(METIS_INCLUDE_DIR "METIS_INCLUDE_DIR-NOTFOUND")
+        endif()
+
+   else() 
+
+# Metis sometimes puts the include files in a subdir called Lib
+
+        set(metis_inc_suffixes "include" "Lib")
+        if(METIS_DIR)
+
+            if (EXISTS "${METIS_DIR}" )
+
+                find_path(METIS_INCLUDE_DIR
+                          NAMES ${metis_inc_names}
+                          HINTS ${METIS_DIR}
+                          PATH_SUFFIXES ${metis_inc_suffixes}
+                          NO_DEFAULT_PATH)
+
+            else()
+                 message(SEND_ERROR "METIS_DIR=${METIS_DIR} does not exist")
+                 set(METIS_INCLUDE_DIR "METIS_INCLUDE_DIR-NOTFOUND")
+            endif()   
+
+
+        else()
+
+            find_path(METIS_INCLUDE_DIR
+                      NAMES ${metis_inc_names}
+                      PATH_SUFFIXES ${metis_inc_suffixes})
+
+        endif()
+
+    endif()
+
+
+    # Search for libraries
+    # Search order preference:
+    #  (1) METIS_LIBRARY_DIR - check existence of path AND if the library file exists
+    #  (2) METIS_DIR/<lib,Lib>
+    #  (3) Default CMake paths See cmake --html-help=out.html file for more information.
+    #
+    set(metis_lib_names "metis")
+
+    
+    if (METIS_LIBRARY_DIR)
+
+        if (EXISTS "${METIS_LIBRARY_DIR}")
+
+            find_library(METIS_LIBRARY
+                         NAMES ${metis_lib_names}
+                         HINTS ${METIS_LIBRARY_DIR}
+                         NO_DEFAULT_PATH)
+        else()
+            set(METIS_LIBRARY "METIS_LIBRARY-NOTFOUND")
+        endif()
+
+    else() 
+
+        list(APPEND metis_lib_suffixes "lib" "Lib")
+        if(METIS_DIR)
+
+            if (EXISTS "${METIS_DIR}" )
+
+                find_library(METIS_LIBRARY
+                             NAMES ${metis_lib_names}
+                             HINTS ${METIS_DIR}
+                             PATH_SUFFIXES ${metis_lib_suffixes}
+                             NO_DEFAULT_PATH)
+
+            else()
+                 set(METIS_LIBRARY "METIS_LIBRARY-NOTFOUND")
+            endif()   
+
+
+        else()
+
+            find_library(METIS_LIBRARY
+                         NAMES ${metis_lib_names}
+                         PATH_SUFFIXES ${metis_lib_suffixes})
+
+        endif()
+
+    endif()
+
+    if ( NOT METIS_LIBRARY )
+        message(SEND_ERROR "Can not locate METIS library")
+    endif()
+
+   
+    # Define prerequisite packages
+    set(METIS_INCLUDE_DIRS ${METIS_INCLUDE_DIR})
+    set(METIS_LIBRARIES    ${METIS_LIBRARY})
+
+   
+endif(METIS_LIBRARIES AND METIS_INCLUDE_DIRS )   
+
+# Send useful message if everything is found
+find_package_handle_standard_args(METIS DEFAULT_MSG
+                                  METIS_LIBRARIES
+                                  METIS_INCLUDE_DIRS)
+
+# find_package_handle_standard_args should set METIS_FOUND but it does not!
+if ( METIS_LIBRARIES AND METIS_INCLUDE_DIRS)
+    set(METIS_FOUND TRUE)
+else()
+    set(METIS_FOUND FALSE)
+endif()
+
+# Define the version
+
+mark_as_advanced(
+  METIS_INCLUDE_DIR
+  METIS_INCLUDE_DIRS
+  METIS_LIBRARY
+  METIS_LIBRARIES
+  METIS_LIBRARY_DIR
+)
+
diff --git a/CMakeFiles/FindPETSc.cmake b/CMakeFiles/FindPETSc.cmake
new file mode 100644
index 00000000..ca71d4a6
--- /dev/null
+++ b/CMakeFiles/FindPETSc.cmake
@@ -0,0 +1,212 @@
+# - Try to find PETSc
+# Once done this will define
+#
+#  PETSC_FOUND             - system has PETSc
+#  PETSC_INCLUDE_DIRS      - include directories for PETSc
+#  PETSC_LIBRARY_DIRS      - library directories for PETSc
+#  PETSC_LIBRARIES         - libraries for PETSc
+#  PETSC_STATIC_LIBRARIES  - libraries for PETSc (static linking, undefined if not required)
+#  PETSC_VERSION           - version for PETSc
+#  PETSC_VERSION_MAJOR     - First number in PETSC_VERSION
+#  PETSC_VERSION_MINOR     - Second number in PETSC_VERSION
+#  PETSC_VERSION_SUBMINOR  - Third number in PETSC_VERSION
+#  PETSC_INT_SIZE          - sizeof(PetscInt)
+#
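+# A minimal usage sketch (the target name my_app is hypothetical; PETSC_DIR and
+# PETSC_ARCH are read from the environment to locate the pkg-config file):
+#
+#   find_package(PETSc)
+#   if(PETSC_FOUND)
+#     target_link_libraries(my_app PETSC::petsc)
+#   endif()
+#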
+#=============================================================================
+# Copyright (C) 2010-2016 Garth N. Wells, Anders Logg and Johannes Ring
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in
+#    the documentation and/or other materials provided with the
+#    distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#=============================================================================
+
+# Outline:
+# 1. Get flags from PETSc-generated pkg-config file
+# 2. Test compile and run program using shared library linking
+# 3. If shared library linking fails, test with static library linking
+
+# Load pkg-config module (provided by CMake)
+find_package(PkgConfig REQUIRED)
+
+# Find PETSc pkg-config file. Note: craypetsc_real is on Cray systems
+set(ENV{PKG_CONFIG_PATH} "$ENV{CRAY_PETSC_PREFIX_DIR}/lib/pkgconfig:$ENV{PETSC_DIR}/$ENV{PETSC_ARCH}/lib/pkgconfig:$ENV{PETSC_DIR}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}")
+pkg_search_module(PETSC craypetsc_real PETSc)
+
+# Extract major, minor, etc from version string
+if (PETSC_VERSION)
+  string(REPLACE "." ";" VERSION_LIST ${PETSC_VERSION})
+  list(GET VERSION_LIST 0 PETSC_VERSION_MAJOR)
+  list(GET VERSION_LIST 1 PETSC_VERSION_MINOR)
+  list(GET VERSION_LIST 2 PETSC_VERSION_SUBMINOR)
+endif()
+
+# Configure PETSc IMPORT (this involves creating an 'imported' target
+# and attaching 'properties')
+if (PETSC_FOUND AND NOT TARGET PETSC::petsc)
+  add_library(PETSC::petsc INTERFACE IMPORTED)
+
+  # Add include paths
+  set_property(TARGET PETSC::petsc PROPERTY
+    INTERFACE_INCLUDE_DIRECTORIES ${PETSC_INCLUDE_DIRS})
+
+  # Add libraries
+  unset(_libs)
+  foreach (lib ${PETSC_LIBRARIES})
+    find_library(LIB_${lib} NAMES ${lib} PATHS ${PETSC_LIBRARY_DIRS} NO_DEFAULT_PATH)
+    list(APPEND _libs ${LIB_${lib}})
+  endforeach()
+  set_property(TARGET PETSC::petsc PROPERTY INTERFACE_LINK_LIBRARIES "${_libs}")
+endif()
+
+# Configure PETSc 'static' IMPORT (this involves creating an
+# 'imported' target and attaching 'properties')
+if (PETSC_FOUND AND NOT TARGET PETSC::petsc_static)
+  add_library(PETSC::petsc_static INTERFACE IMPORTED)
+
+  # Add libraries (static)
+  unset(_libs)
+  foreach (lib ${PETSC_STATIC_LIBRARIES})
+    find_library(LIB_${lib} ${lib} HINTS ${PETSC_STATIC_LIBRARY_DIRS})
+    list(APPEND _libs ${LIB_${lib}})
+  endforeach()
+  set_property(TARGET PETSC::petsc_static PROPERTY INTERFACE_LINK_LIBRARIES "${_libs}")
+endif()
+
+# Attempt to build and run PETSc test program
+if (DOLFIN_SKIP_BUILD_TESTS)
+
+  # Assume PETSc works
+  set(PETSC_TEST_RUNS TRUE)
+
+elseif (PETSC_FOUND)
+
+  # Create PETSc test program
+  set(PETSC_TEST_LIB_CPP
+    "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/petsc_test_lib.cpp")
+  file(WRITE ${PETSC_TEST_LIB_CPP} "
+#include \"petscts.h\"
+#include \"petsc.h\"
+int main()
+{
+  PetscErrorCode ierr;
+  TS ts;
+  int argc = 0;
+  char** argv = NULL;
+  ierr = PetscInitialize(&argc, &argv, PETSC_NULL, PETSC_NULL);CHKERRQ(ierr);
+  ierr = TSCreate(PETSC_COMM_WORLD,&ts);CHKERRQ(ierr);
+  ierr = TSSetFromOptions(ts);CHKERRQ(ierr);
+  ierr = TSDestroy(&ts);CHKERRQ(ierr);
+  ierr = PetscFinalize();CHKERRQ(ierr);
+  return 0;
+}
+")
+
+  # Add MPI variables if MPI has been found
+  if (MPI_C_FOUND)
+    set(CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES} ${MPI_C_INCLUDE_PATH})
+    set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES} ${MPI_C_LIBRARIES})
+    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${MPI_C_COMPILE_FLAGS}")
+  endif()
+
+  # Try to run test program (shared linking)
+  try_run(
+    PETSC_TEST_LIB_EXITCODE
+    PETSC_TEST_LIB_COMPILED
+    ${CMAKE_CURRENT_BINARY_DIR}
+    ${PETSC_TEST_LIB_CPP}
+    CMAKE_FLAGS
+    "-DINCLUDE_DIRECTORIES:STRING=${CMAKE_REQUIRED_INCLUDES}"
+    "-DLINK_LIBRARIES:STRING=${CMAKE_REQUIRED_LIBRARIES}"
+    LINK_LIBRARIES PETSC::petsc
+    COMPILE_OUTPUT_VARIABLE PETSC_TEST_LIB_COMPILE_OUTPUT
+    RUN_OUTPUT_VARIABLE PETSC_TEST_LIB_OUTPUT)
+
+  # Check program output
+  if (PETSC_TEST_LIB_COMPILED AND PETSC_TEST_LIB_EXITCODE EQUAL 0)
+
+    message(STATUS "Test PETSC_TEST_RUNS with shared library linking - Success")
+    set(PETSC_TEST_RUNS TRUE)
+
+    # Static libraries not required, so unset
+    set_property(TARGET PETSC::petsc_static PROPERTY INTERFACE_LINK_LIBRARIES)
+
+  else()
+
+    message(STATUS "Test PETSC_TEST_RUNS with shared library linking - Failed")
+
+    # Add MPI variables if MPI has been found
+    if (MPI_C_FOUND)
+      set(CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES} ${MPI_C_INCLUDE_PATH})
+      set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES} ${MPI_C_LIBRARIES})
+      set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${MPI_C_COMPILE_FLAGS}")
+    endif()
+
+    # Try to run test program (static linking)
+    try_run(
+      PETSC_TEST_LIB_EXITCODE
+      PETSC_TEST_LIB_COMPILED
+      ${CMAKE_CURRENT_BINARY_DIR}
+      ${PETSC_TEST_LIB_CPP}
+      CMAKE_FLAGS
+      "-DINCLUDE_DIRECTORIES:STRING=${CMAKE_REQUIRED_INCLUDES}"
+      "-DLINK_LIBRARIES:STRING=${CMAKE_REQUIRED_LIBRARIES}"
+      LINK_LIBRARIES PETSC::petsc PETSC::petsc_static
+      COMPILE_OUTPUT_VARIABLE PETSC_TEST_LIB_COMPILE_OUTPUT
+      RUN_OUTPUT_VARIABLE PETSC_TEST_LIB_OUTPUT)
+
+    if (PETSC_TEST_LIB_COMPILED AND PETSC_TEST_LIB_EXITCODE EQUAL 0)
+      message(STATUS "Test PETSC_TEST_RUNS static linking - Success")
+      set(PETSC_TEST_RUNS TRUE)
+    else()
+      message(STATUS "Test PETSC_TEST_RUNS static linking - Failed")
+      set(PETSC_TEST_RUNS FALSE)
+    endif()
+
+  endif()
+endif()
+
+# Check sizeof(PetscInt)
+if (PETSC_INCLUDE_DIRS)
+  include(CheckTypeSize)
+  set(CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES} ${PETSC_INCLUDE_DIRS})
+  set(CMAKE_EXTRA_INCLUDE_FILES petscsys.h)
+  check_type_size("PetscInt" PETSC_INT_SIZE)
+
+  unset(CMAKE_EXTRA_INCLUDE_FILES)
+  unset(CMAKE_REQUIRED_INCLUDES)
+endif()
+
+# Standard package handling
+include(FindPackageHandleStandardArgs)
+if (PETSC_FOUND)
+  find_package_handle_standard_args(PETSc
+    REQUIRED_VARS PETSC_FOUND PETSC_TEST_RUNS
+    VERSION_VAR PETSC_VERSION
+    FAIL_MESSAGE "PETSc could not be configured.")
+else()
+  find_package_handle_standard_args(PETSc
+    REQUIRED_VARS PETSC_FOUND
+    FAIL_MESSAGE "PETSc could not be found. Be sure to set PETSC_DIR.")
+endif()
+
+
diff --git a/CMakeFiles/FindParMetis.cmake b/CMakeFiles/FindParMetis.cmake
new file mode 100644
index 00000000..8c7ca8b5
--- /dev/null
+++ b/CMakeFiles/FindParMetis.cmake
@@ -0,0 +1,164 @@
+# - Try to find ParMETIS
+# Once done this will define
+#
+#  PARMETIS_FOUND        - system has ParMETIS
+#  PARMETIS_INCLUDE_DIRS - include directories for ParMETIS
+#  PARMETIS_LIBRARIES    - libraries for ParMETIS
+#
+# Variables used by this module. They can change the default behaviour and
+# need to be set before calling find_package:
+#
+#  PARMETIS_DIR          - Prefix directory of the ParMETIS installation
+#  PARMETIS_INCLUDE_DIR  - Include directory of the ParMETIS installation
+#                          (set only if different from ${PARMETIS_DIR}/include)
+#  PARMETIS_LIB_DIR      - Library directory of the ParMETIS installation
+#                          (set only if different from ${PARMETIS_DIR}/lib)
+#  PARMETIS_TEST_RUNS    - Skip tests building and running a test
+#                          executable linked against ParMETIS libraries
+#  PARMETIS_LIB_SUFFIX   - Also search for non-standard library names with the
+#                          given suffix appended
+
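+# A minimal usage sketch (the target name my_app is hypothetical; PARMETIS_DIR points
+# to the ParMETIS installation prefix):
+#
+#   set(PARMETIS_DIR /opt/parmetis)
+#   find_package(ParMetis)
+#   if(PARMETIS_FOUND)
+#     target_include_directories(my_app PRIVATE ${PARMETIS_INCLUDE_DIRS})
+#     target_link_libraries(my_app ${PARMETIS_LIBRARIES})
+#   endif()
+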
+#=============================================================================
+# Copyright (C) 2010-2012 Garth N. Wells, Anders Logg, Johannes Ring
+# and Florian Rathgeber. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in
+#    the documentation and/or other materials provided with the
+#    distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#=============================================================================
+
+if(NOT PARMETIS_INCLUDE_DIR)
+  find_path(PARMETIS_INCLUDE_DIR parmetis.h
+    HINTS ${PARMETIS_INCLUDE_DIR} ENV PARMETIS_INCLUDE_DIR ${PARMETIS_DIR} ENV PARMETIS_DIR
+    PATH_SUFFIXES include
+    DOC "Directory where the ParMETIS header files are located"
+  )
+endif()
+
+if(NOT METIS_INCLUDE_DIR)
+  find_path(METIS_INCLUDE_DIR metis.h
+    HINTS ${METIS_INCLUDE_DIR} ENV METIS_INCLUDE_DIR ${METIS_DIR} ENV METIS_DIR
+    PATH_SUFFIXES include
+    DOC "Directory where the METIS header files are located"
+  )
+endif()
+
+if(PARMETIS_LIBRARIES)
+  set(PARMETIS_LIBRARY ${PARMETIS_LIBRARIES})
+endif()
+if(NOT PARMETIS_LIBRARY)
+  find_library(PARMETIS_LIBRARY
+    NAMES parmetis parmetis${PARMETIS_LIB_SUFFIX}
+    HINTS ${PARMETIS_LIB_DIR} ENV PARMETIS_LIB_DIR ${PARMETIS_DIR} ENV PARMETIS_DIR
+    PATH_SUFFIXES lib
+    DOC "Directory where the ParMETIS library is located"
+  )
+endif()
+
+if(METIS_LIBRARIES)
+  set(METIS_LIBRARY ${METIS_LIBRARIES})
+endif()
+if(NOT METIS_LIBRARY)
+  find_library(METIS_LIBRARY
+    NAMES metis metis${PARMETIS_LIB_SUFFIX}
+    HINTS ${PARMETIS_LIB_DIR} ENV PARMETIS_LIB_DIR ${PARMETIS_DIR} ENV PARMETIS_DIR
+    PATH_SUFFIXES lib
+    DOC "Directory where the METIS library is located"
+  )
+endif()
+
+# Get ParMETIS version
+if(NOT PARMETIS_VERSION_STRING AND PARMETIS_INCLUDE_DIR AND EXISTS "${PARMETIS_INCLUDE_DIR}/parmetis.h")
+  set(version_pattern "^#define[\t ]+PARMETIS_(MAJOR|MINOR)_VERSION[\t ]+([0-9\\.]+)$")
+  file(STRINGS "${PARMETIS_INCLUDE_DIR}/parmetis.h" parmetis_version REGEX ${version_pattern})
+
+  foreach(match ${parmetis_version})
+    if(PARMETIS_VERSION_STRING)
+      set(PARMETIS_VERSION_STRING "${PARMETIS_VERSION_STRING}.")
+    endif()
+    string(REGEX REPLACE ${version_pattern} "${PARMETIS_VERSION_STRING}\\2" PARMETIS_VERSION_STRING ${match})
+    set(PARMETIS_VERSION_${CMAKE_MATCH_1} ${CMAKE_MATCH_2})
+  endforeach()
+  unset(parmetis_version)
+  unset(version_pattern)
+endif()
+
+# Try compiling and running test program
+if(PARMETIS_INCLUDE_DIR AND METIS_INCLUDE_DIR AND
+   PARMETIS_LIBRARY AND METIS_LIBRARY)
+
+  # Test requires MPI
+  find_package(MPI QUIET REQUIRED)
+
+  # Set flags for building test program
+  set(CMAKE_REQUIRED_FLAGS "${MPI_C_COMPILE_FLAGS}")
+  # Ideally this would be used, but it unfortunately is not supported
+  #set(CMAKE_REQUIRED_LINKER_FLAGS "${MPI_C_LINK_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}")
+  set(CMAKE_REQUIRED_INCLUDES
+    ${PARMETIS_INCLUDE_DIR} ${METIS_INCLUDE_DIR} ${MPI_C_INCLUDE_PATH})
+  set(CMAKE_REQUIRED_LIBRARIES
+    ${PARMETIS_LIBRARY} ${METIS_LIBRARY} ${MPI_C_LIBRARIES})
+
+  # Build and run test program
+  include(CheckCSourceRuns)
+  check_c_source_runs("
+#include \"mpi.h\"
+#define METIS_EXPORT
+#include \"parmetis.h\"
+int main( int argc, char* argv[] )
+{
+  // FIXME: Find a simple but sensible test for ParMETIS
+  MPI_Init( &argc, &argv );
+  MPI_Finalize();
+  return 0;
+}
+" PARMETIS_TEST_RUNS)
+
+  unset(CMAKE_REQUIRED_FLAGS)
+  #unset(CMAKE_REQUIRED_LINKER_FLAGS)
+  unset(CMAKE_REQUIRED_INCLUDES)
+  unset(CMAKE_REQUIRED_LIBRARIES)
+endif()
+
+# Standard package handling
+include(FindPackageHandleStandardArgs)
+if(CMAKE_VERSION VERSION_GREATER 2.8.2)
+  find_package_handle_standard_args(ParMETIS
+    REQUIRED_VARS PARMETIS_LIBRARY PARMETIS_INCLUDE_DIR PARMETIS_TEST_RUNS
+    VERSION_VAR PARMETIS_VERSION_STRING)
+else()
+  find_package_handle_standard_args(ParMETIS
+    REQUIRED_VARS PARMETIS_LIBRARY PARMETIS_INCLUDE_DIR PARMETIS_TEST_RUNS)
+endif()
+
+if(PARMETIS_FOUND)
+  set(PARMETIS_LIBRARIES ${PARMETIS_LIBRARY} ${METIS_LIBRARY})
+  set(PARMETIS_INCLUDE_DIRS ${PARMETIS_INCLUDE_DIR} ${METIS_INCLUDE_DIR})
+else()
+  unset(METIS_LIBRARY CACHE)
+  unset(METIS_INCLUDE_DIR CACHE)
+endif()
+
+mark_as_advanced(PARMETIS_INCLUDE_DIR METIS_INCLUDE_DIR
+  PARMETIS_LIBRARY METIS_LIBRARY)
+
diff --git a/example/Numerics/Vortex_in_cell/main_vic_petsc_double.cpp b/example/Numerics/Vortex_in_cell/main_vic_petsc_double.cpp
new file mode 100644
index 00000000..733ce842
--- /dev/null
+++ b/example/Numerics/Vortex_in_cell/main_vic_petsc_double.cpp
@@ -0,0 +1,1287 @@
+/*! \page Vortex_in_cell_petsc Vortex in Cell 3D
+ *
+ * # Vortex in Cell 3D ring with ringlets # {#vic_ringlets}
+ *
+ * In this example we solve the Navier-Stokes equations in vorticity formulation in 3D
+ * for an incompressible fluid (bold symbols denote vector quantities).
+ *
+ * ## Numerical method ## {#num_vic_mt}
+ *
+ * In this code we solve the Navier-Stokes equations for an incompressible fluid in the
+ * vorticity formulation. We first recall the Navier-Stokes equations in vorticity formulation
+ *
+ * \f$ \nabla \times \boldsymbol u = - \boldsymbol w \f$
+ *
+ * \f$  \frac{\displaystyle D \boldsymbol w}{\displaystyle dt} = ( \boldsymbol w \cdot \vec \nabla) \boldsymbol u + \nu \nabla^{2} \boldsymbol w \f$    (5)
+ *
+ * where \f$w\f$ is the vorticity and \f$u\f$ is the velocity of the fluid.
+ * For a high Reynolds number \f$Re = \frac{uL}{\nu}\f$, with the term \f$uL\f$ significantly
+ * smaller than the Reynolds number, \f$\nu\f$ is small and the diffusion term \f$\nu \nabla^{2} w\f$ is
+ * negligible. The algorithm can be expressed with the following pseudo code.
+ *
+ * \verbatim
+
+	1) Initialize the vortex ring on grid
+	2) Do a Helmholtz-Hodge projection to make the vorticity divergence free
+	3) Initialize particles on the same position as the grid or remesh
+
+	while (t < t_end) do
+		4) Interpolate vorticity from the particles to mesh
+		5) calculate velocity u from the vorticity w
+		6) calculate the right-hand-side on grid and interpolate on particles
+		7) interpolate velocity u to particles
+		8) move particles according to the velocity
+		9) interpolate the vorticity into mesh and reinitialize the particles
+		   in a grid like position
+	end while
+
+   \endverbatim
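+ *
+ * As a sketch, one Euler step of the loop above advances each particle position
+ * \f$ x_p \f$ and vorticity \f$ w_p \f$ as
+ *
+ * \f[ x_p^{n+1} = x_p^{n} + \Delta t \, u(x_p^{n}) \qquad
+ *     w_p^{n+1} = w_p^{n} + \Delta t \, \left[ (w \cdot \nabla) u \right]_{x_p^{n}} \f]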
+ *
+ * This pseudo code shows how to solve the equations above using Euler integration.
+ * For a second-order Runge-Kutta scheme the pseudo code changes into
+ *
+ *
+ * \verbatim
+
+	1) Initialize the vortex ring on grid
+	2) Do a Helmholtz-Hodge projection to make the vorticity divergence free
+	3) Initialize particles on the same position as the grid or remesh
+
+	while (t < t_end) do
+		4) Interpolate vorticity from the particles to mesh
+		5) calculate velocity u from the vorticity w
+		6) calculate the right-hand-side on grid and interpolate on particles
+		7) interpolate velocity u to particles
+		8) move particles according to the velocity and save the old position in x_old
+
+		9) Interpolate vorticity on mesh on the particles
+		10) calculate velocity u from the vorticity w
+		11) calculate the right-hand-side on grid and interpolate on particles
+		12) interpolate velocity u to particles
+		13) move particles according to the velocity starting from x_old
+		14) interpolate the vorticity into mesh and reinitialize the particles
+		   in a grid like position
+	end while
+
+   \endverbatim
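+ *
+ * One common realization of such a two-stage scheme is Heun's method (a sketch; the
+ * exact coefficients used by the time loop below may differ):
+ *
+ * \f[ x_p^{*} = x_p^{n} + \Delta t \, u(x_p^{n}) \qquad
+ *     x_p^{n+1} = x_p^{n} + \frac{\Delta t}{2}\left( u(x_p^{n}) + u(x_p^{*}) \right) \f]
+ *
+ * where \f$ u(x_p^{*}) \f$ is the velocity recomputed after the first stage, and the
+ * particle vorticity is advanced with the corresponding right-hand sides.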
+ *
+ * In the following we explain how each step is implemented in the code
+ *
+ * ## Inclusion ## {#num_vic_inc}
+ *
+ * This example code needs several components. First, because it is a particle-mesh
+ * example, we have to include **grid_dist_id.hpp** and **vector_dist.hpp**.
+ * Because we use a finite-difference scheme and linear algebra to calculate the
+ * velocity out of the vorticity, we have to include **FDScheme.hpp** to produce,
+ * from the finite-difference scheme, a matrix that represents the linear system
+ * to solve. **SparseMatrix.hpp** is the sparse matrix that will contain the linear
+ * system to solve in order to get the velocity out of the vorticity.
+ * **Vector.hpp** is the data structure that contains the solution of the
+ * linear system. **petsc_solver.hpp** is the library used to solve the linear system.
+ * Because we have to interpolate between particles and grid we also include
+ * **interpolation.hpp**; as interpolation kernel we use MP4, so we include
+ * **mp4_kernel.hpp**.
+ *
+ * For convenience we also define the particle type, the grid type and some
+ * useful constants
+ *
+ * \snippet Numerics/Vortex_in_cell/main_vic_petsc.cpp inclusion
+ *
+ *
+ *
+ */
+
+//#define SE_CLASS1
+//#define PRINT_STACKTRACE
+
+//! \cond [inclusion] \endcond
+
+#include "interpolation/interpolation.hpp"
+#include "Grid/grid_dist_id.hpp"
+#include "Vector/vector_dist.hpp"
+#include "Matrix/SparseMatrix.hpp"
+#include "Vector/Vector.hpp"
+#include "FiniteDifference/FDScheme.hpp"
+#include "Solvers/petsc_solver.hpp"
+#include "interpolation/mp4_kernel.hpp"
+#include "Solvers/petsc_solver_AMG_report.hpp"
+
+constexpr int x = 0;
+constexpr int y = 1;
+constexpr int z = 2;
+
+// The type of the grids
+typedef grid_dist_id<3,double,aggregate<double[3]>> grid_type;
+
+// The type of the particles
+typedef vector_dist<3,double,aggregate<double[3],double[3],double[3],double[3],double[3]>> particles_type;
+
+// radius of the torus
+double ringr1 = 1.0;
+// radius of the core of the torus
+double sigma = 1.0/3.523;
+// Reynolds number
+double tgtre  = 7500.0;
+// Noise factor for the ring vorticity on z
+double ringnz = 0.01;
+
+// Kinematic viscosity
+double nu = 1.0/tgtre;
+
+// Time step
+double dt = 0.025;
+
+// All the properties by index
+constexpr unsigned int vorticity = 0;
+constexpr unsigned int velocity = 0;
+constexpr unsigned int p_vel = 1;
+constexpr int rhs_part = 2;
+constexpr unsigned int old_vort = 3;
+constexpr unsigned int old_pos = 4;
+
+//! \cond [inclusion] \endcond
+
+template<typename grid> void calc_and_print_max_div_and_int(grid & g_vort)
+{
+	//! \cond [sanity_int_div] \endcond
+
+	g_vort.template ghost_get<vorticity>();
+	auto it5 = g_vort.getDomainIterator();
+
+	double max_vort = 0.0;
+
+	double int_vort[3] = {0.0,0.0,0.0};
+
+	while (it5.isNext())
+	{
+		auto key = it5.get();
+
+		double tmp;
+
+        tmp = 1.0f/g_vort.spacing(x)/2.0f*(g_vort.template get<vorticity>(key.move(x,1))[x] - g_vort.template get<vorticity>(key.move(x,-1))[x] ) +
+			         	 	 1.0f/g_vort.spacing(y)/2.0f*(g_vort.template get<vorticity>(key.move(y,1))[y] - g_vort.template get<vorticity>(key.move(y,-1))[y] ) +
+							 1.0f/g_vort.spacing(z)/2.0f*(g_vort.template get<vorticity>(key.move(z,1))[z] - g_vort.template get<vorticity>(key.move(z,-1))[z] );
+
+        int_vort[x] += g_vort.template get<vorticity>(key)[x];
+        int_vort[y] += g_vort.template get<vorticity>(key)[y];
+        int_vort[z] += g_vort.template get<vorticity>(key)[z];
+
+        if (tmp > max_vort)
+        	max_vort = tmp;
+
+		++it5;
+	}
+
+	Vcluster & v_cl = create_vcluster();
+	v_cl.max(max_vort);
+	v_cl.sum(int_vort[0]);
+	v_cl.sum(int_vort[1]);
+	v_cl.sum(int_vort[2]);
+	v_cl.execute();
+
+	if (v_cl.getProcessUnitID() == 0)
+	{std::cout << "Max div for vorticity " << max_vort << "   Integral: " << int_vort[0] << "  " << int_vort[1] << "   " << int_vort[2] << std::endl;}
+
+	//! \cond [sanity_int_div] \endcond
+}
+
+/*! \page Vortex_in_cell_petsc Vortex in Cell 3D
+ *
+ * # Step 1: Initialization of the vortex ring # {#vic_ring_init}
+ *
+ * In this function we initialize the vortex ring. The vortex ring is
+ * initialized according to the following formulas.
+ *
+ * \f$ w(t = 0) =  \frac{\Gamma}{\pi \sigma^{2}} e^{-(s/ \sigma)^2} \f$
+ *
+ * \f$ s^2 = (z-z_c)^{2} + ((x-x_c)^2 + (y-y_c)^2 - R^2) \f$
+ *
+ * \f$ \Gamma = \nu Re \f$
+ *
+ * With this initialization the vortex ring looks like the one in the figure
+ *
+ * \image html int_vortex_arrow_small.jpg "Vortex ring initialization: the arrows indicate the direction of the vorticity while the colors indicate the magnitude from blue (low) to red (high)"
+ *
+ * \snippet Numerics/Vortex_in_cell/main_vic_petsc.cpp init_vort
+ *
+ *
+ *
+ */
+
+//! \cond [init_vort] \endcond
+
+/*
+ * gr is the grid where we are initializing the vortex ring
+ * domain is the simulation domain
+ *
+ */
+void init_ring(grid_type & gr, const Box<3,double> & domain)
+{
+	// To add some noise to the vortex ring we create two random
+	// vectors
+	constexpr int nk = 32;
+
+	double ak[nk];
+	double bk[nk];
+
+	for (size_t i = 0 ; i < nk ; i++)
+	{
+	     ak[i] = rand()/(double)RAND_MAX;   // cast to avoid integer division
+	     bk[i] = rand()/(double)RAND_MAX;
+	}
+
+	// We calculate the circulation gamma
+	double gamma = nu * tgtre;
+	double rinv2 = 1.0f/(sigma*sigma);
+	double max_vorticity = gamma*rinv2/M_PI;
+
+	// We go through the grid to initialize the vortex
+	auto it = gr.getDomainIterator();
+
+	while (it.isNext())
+	{
+		auto key_d = it.get();
+		auto key = it.getGKey(key_d);
+
+        double tx = (key.get(x)-2)*gr.spacing(x) + domain.getLow(x);
+        double ty = (key.get(y)-2)*gr.spacing(y) + domain.getLow(y);
+        double tz = (key.get(z)-2)*gr.spacing(z) + domain.getLow(z);
+        double theta1 = atan2((ty-2.5f),(tz-2.5f));
+
+
+
+        double noise = 0.0f;
+ //       for (int kk=1 ; kk < nk; kk++)
+ //       	noise = noise + sin(kk*(theta1+2.0f*M_PI*ak[kk])) + cos(kk*(theta1+2.0f*M_PI*bk[kk]));
+
+        double rad1r  = sqrt((ty-2.5f)*(ty-2.5f) + (tz-2.5f)*(tz-2.5f)) - ringr1*(1.0f + ringnz * noise);
+        double rad1t = tx - 1.0f;
+        double rad1sq = rad1r*rad1r + rad1t*rad1t;
+        double radstr = -exp(-rad1sq*rinv2)*rinv2*gamma/M_PI;
+        gr.template get<vorticity>(key_d)[x] = 0.0f;
+        gr.template get<vorticity>(key_d)[y] = -radstr * cos(theta1);
+        gr.template get<vorticity>(key_d)[z] = radstr * sin(theta1);
+
+
+		++it;
+	}
+}
+
+//! \cond [init_vort] \endcond
+
+//! \cond [poisson_syseq] \endcond
+
+// Specification of the Poisson equation for the Helmholtz-Hodge projection.
+// 3D (dims = 3). The field is a scalar value (nvar = 1), the boundary conditions are periodic,
+// the type of the space is double. b_grid is the grid that will store \phi, the result (grid_dist_id<.....>).
+// The other typedefs indicate which kind of Matrix to use to construct the linear system and
+// which kind of vector to construct for the right-hand side. Here we use a PETSc sparse matrix
+// and a PETSc vector. NORMAL_GRID indicates that it is a standard (non-staggered) grid.
+struct poisson_nn_helm
+{
+        static const unsigned int dims = 3;
+        static const unsigned int nvar = 1;
+        static const bool boundary[];
+        typedef double stype;
+        typedef grid_dist_id<3,double,aggregate<double>> b_grid;
+        typedef SparseMatrix<double,int,PETSC_BASE> SparseMatrix_type;
+        typedef Vector<double,PETSC_BASE> Vector_type;
+        static const int grid_type = NORMAL_GRID;
+};
+
+const bool poisson_nn_helm::boundary[] = {PERIODIC,PERIODIC,PERIODIC};
+
+//! \cond [poisson_syseq] \endcond
+
+
+/*! \page Vortex_in_cell_petsc Vortex in Cell 3D
+ *
+ * # Step 2: Helmholtz-Hodge projection # {#vic_hlm_proj}
+ *
+ * The Helmholtz-Hodge projection is required in order to make the vorticity divergence
+ * free. The Helmholtz-Hodge projection works in this way: a field can be divided into
+ * a curl-free part and a divergence-free part.
+ *
+ * \f$ w = w_{rot} + w_{div} \f$
+ *
+ * with
+ *
+ * \f$ \vec \nabla \times w_{rot} = 0 \f$
+ *
+ * \f$  \nabla \cdot w_{div} = 0 \f$
+ *
+ * To have a divergence-free vorticity we have to get the component (3) \f$w_{div} = w - w_{rot}\f$.
+ * In particular it holds
+ *
+ * \f$ \nabla \cdot w = \nabla \cdot w_{rot} \f$
+ *
+ * Because \f$ \vec \nabla \times w_{rot} = 0 \f$ we can introduce a field \f$ \psi \f$
+ * such that
+ *
+ * (2) \f$ w_{rot} = \vec \nabla \psi \f$
+ *
+ * Taking the divergence \f$  \nabla \cdot \vec \nabla \psi \f$ we obtain
+ *
+ * \f$ \nabla \cdot \vec \nabla \psi = \nabla^{2} \psi = \nabla \cdot w_{rot} = \nabla \cdot w \f$
+ *
+ * so we arrive at this equation
+ *
+ * (1) \f$ \nabla^{2} \psi = \nabla \cdot w  \f$
+ *
+ * Solving the equation for \f$ \psi \f$ we can obtain \f$ w_{rot} \f$ by taking the gradient of \f$ \psi \f$
+ * and finally correct \f$ w \f$, obtaining \f$ w_{div} \f$
+ *
+ * The **helmotz_hodge_projection** function applies this correction to the vorticity
+ *
+ * In particular it solves equation (1), calculates \f$ w_{rot} \f$
+ * using (2) and corrects the vorticity using (3)
+ *
+ *
+ * ## Poisson equation ##
+ *
+ * To solve a Poisson equation on a grid using finite differences, we need to create
+ * an object that carries information about the system of equations
+ *
+ * \snippet Numerics/Vortex_in_cell/main_vic_petsc.cpp poisson_syseq
+ *
+ * Once this object is created we can define the equation we are trying to solve.
+ * In particular the code below defines the left-hand side of the equation \f$ \nabla^{2} \psi \f$
+ *
+ * \snippet Numerics/Vortex_in_cell/main_vic_petsc.cpp poisson_obj_eq
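+ *
+ * As a sketch, composing two central first derivatives (the CENTRAL_SYM choice used in
+ * the object above) gives, along one direction with grid spacing \f$ h \f$,
+ *
+ * \f[ \partial_{xx} \psi_{i} \simeq \frac{\psi_{i+2} - 2\psi_{i} + \psi_{i-2}}{4 h^{2}} \f]
+ *
+ * which is why a stencil extension of two grid points is used below.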
+ *
+ * Before constructing the linear system we also calculate the divergence of the
+ * vorticity \f$ \nabla \cdot w \f$ that will be the right-hand side
+ * of the equation
+ *
+ * \snippet Numerics/Vortex_in_cell/main_vic_petsc.cpp calc_div_vort
+ *
+ * Finally we can create the object FDScheme using **poisson_nn_helm**
+ * as template parameter. In the constructor we have to specify the maximum extension of the stencil, the domain and the
+ * grid that will store the result. At this point we can impose an equation to construct
+ * our SparseMatrix. In this example we are imposing the Poisson equation with right-hand
+ * side equal to the divergence of the vorticity (note: to avoid creating another field we use
+ * \f$ \psi \f$ to temporarily store the divergence of the vorticity). Imposing the
+ * equations produces an invertible SparseMatrix **A** and a right-hand-side Vector **b**.
+ *
+ * \snippet Numerics/Vortex_in_cell/main_vic_petsc.cpp create_fdscheme
+ *
+ * Because we need \f$ x = A^{-1}b \f$ we have to solve a linear system.
+ * In this case we use an iterative solver (the code below selects KSPBCGS, i.e. BiCGStab). Such a method
+ * is controlled by two parameters. One is the tolerance that determines when the
+ * method has converged, the second one is the maximum number of iterations, to avoid that
+ * the method goes into an infinite loop. After we set the parameters of the solver we can get
+ * the solution **x**. Finally we copy back the solution **x** into the grid \f$ \psi \f$.
+ *
+ * \snippet Numerics/Vortex_in_cell/main_vic_petsc.cpp solve_petsc
+ *
+ * ### Note ###
+ *
+ * Because we are solving the Poisson equation with periodic boundary conditions the matrix has
+ * determinant equal to zero. This means that \f$ \psi \f$ does not have a unique solution (if it has one at all).
+ * In order for a solution to exist, we have to ensure that the integral of the right-hand side (the divergence of the
+ * vorticity) is zero (in our case it is). We also have to ensure that across time the integral of the
+ * vorticity is conserved. (In our case it is: if we consider \f$ \nu = 0 \f$ and \f$
+ * \nabla \cdot w = 0 \f$ we can rewrite (5) in the conservative form \f$  \frac{Dw}{dt} = div(w \otimes v) \f$ ).
+ * It is also good to notice that the solution you get is the one with \f$ \int w  = 0 \f$
+ *
+ *
+ * ## Correction ## {#vort_correction}
+ *
+ * After we have the solution for \f$ \psi \f$ we can calculate the correction of the vorticity
+ * by taking the gradient of \f$ \psi \f$.
+ *
+ * \snippet Numerics/Vortex_in_cell/main_vic_petsc.cpp vort_correction
+ *
+ * We also do a sanity check and verify that the vorticity remains
+ * divergence-free, getting the maximum value of the divergence and printing out
+ * its value
+ *
+ *
+ */
+
+/*
+ * gr vorticity grid where we apply the correction
+ * domain simulation domain
+ *
+ */
+void helmotz_hodge_projection(grid_type & gr, const Box<3,double> & domain, petsc_solver<double> & solver, petsc_solver<double>::return_type & x_ , bool init)
+{
+	///////////////////////////////////////////////////////////////
+
+	 //! \cond [poisson_obj_eq] \endcond
+
+	// Convenient constant
+	constexpr unsigned int phi = 0;
+
+	// We define a field phi_f
+	typedef Field<phi,poisson_nn_helm> phi_f;
+
+	// We assemble the Poisson equation from the Laplacian of phi using the
+	// symmetric central scheme (where both first derivatives use the central scheme,
+	// not forward composed with backward as is usual)
+	typedef Lap<phi_f,poisson_nn_helm,CENTRAL_SYM> poisson;
+
+	//! \cond [poisson_obj_eq] \endcond
+
+	//! \cond [calc_div_vort] \endcond
+
+	// ghost get
+	gr.template ghost_get<vorticity>();
+
+	// ghost size of the psi function
+    Ghost<3,long int> g(2);
+
+	// Here we create a distributed grid to store the result of the Helmholtz projection
+	grid_dist_id<3,double,aggregate<double>> psi(gr.getDecomposition(),gr.getGridInfo().getSize(),g);
+
+	// Calculate the divergence of the vortex field
+	auto it = gr.getDomainIterator();	
+
+	while (it.isNext())
+	{
+		auto key = it.get();
+
+        psi.template get<phi>(key) = 1.0f/gr.spacing(x)/2.0f*(gr.template get<vorticity>(key.move(x,1))[x] - gr.template get<vorticity>(key.move(x,-1))[x] ) +
+			         	 	 1.0f/gr.spacing(y)/2.0f*(gr.template get<vorticity>(key.move(y,1))[y] - gr.template get<vorticity>(key.move(y,-1))[y] ) +
+							 1.0f/gr.spacing(z)/2.0f*(gr.template get<vorticity>(key.move(z,1))[z] - gr.template get<vorticity>(key.move(z,-1))[z] );
+
+		++it;
+	}
+
+	//! \cond [calc_div_vort] \endcond
+
+
+	calc_and_print_max_div_and_int(gr);
+
+
+	//! \cond [create_fdscheme] \endcond
+
+	// In order to create a matrix that represents the Poisson equation we have to indicate
+	// the maximum extension of the stencil, and we need an extra layer
+	// of points in case we have to impose particular boundary conditions.
+	Ghost<3,long int> stencil_max(2);
+
+	// Here we define our finite-difference discretization scheme object
+	FDScheme<poisson_nn_helm> fd(stencil_max, domain, psi);
+
+	// impose the Poisson equation, using psi as right-hand side, on the full domain (psi.getDomainIterator)
+	// the template parameter defines which property of psi carries the right-hand side;
+	// in this case psi has only one property, so property 0 carries the right-hand side
+	fd.template impose_dit<0>(poisson(),psi,psi.getDomainIterator());
+
+	//! \cond [create_fdscheme] \endcond
+
+	//! \cond [solve_petsc] \endcond
+
+	timer tm_solve;
+	if (init == true)
+	{
+		// Set BiCGStab (KSPBCGS) as the Krylov method of the linear solver
+		solver.setSolver(KSPBCGS);
+
+		// Set the absolute tolerance to determine that the method is converged
+		solver.setAbsTol(0.001);
+
+		// Set the maximum number of iterations
+		solver.setMaxIter(500);
+
+        solver.setPreconditioner(PCHYPRE_BOOMERAMG);
+        solver.setPreconditionerAMG_nl(6);
+        solver.setPreconditionerAMG_maxit(1);
+        solver.setPreconditionerAMG_relax("SOR/Jacobi");
+        solver.setPreconditionerAMG_cycleType("V",0,4);
+        solver.setPreconditionerAMG_coarsenNodalType(0);
+        solver.setPreconditionerAMG_coarsen("HMIS");
+
+		tm_solve.start();
+
+		// Give to the solver A and b, return x, the solution
+		solver.solve(fd.getA(),x_,fd.getB());
+
+		tm_solve.stop();
+	}
+	else
+	{
+		tm_solve.start();
+		solver.solve(x_,fd.getB());
+		tm_solve.stop();
+	}
+
+	// copy the solution x to the grid psi
+	fd.template copy<phi>(x_,psi);
+
+	//! \cond [solve_petsc] \endcond
+
+	//! \cond [vort_correction] \endcond
+
+	psi.template ghost_get<phi>();
+
+	// Correct the vorticity to make it divergence free
+
+	auto it2 = gr.getDomainIterator();
+
+	while (it2.isNext())
+	{
+		auto key = it2.get();
+
+		gr.template get<vorticity>(key)[x] -= 1.0f/2.0f/psi.spacing(x)*(psi.template get<phi>(key.move(x,1))-psi.template get<phi>(key.move(x,-1)));
+		gr.template get<vorticity>(key)[y] -= 1.0f/2.0f/psi.spacing(y)*(psi.template get<phi>(key.move(y,1))-psi.template get<phi>(key.move(y,-1)));
+		gr.template get<vorticity>(key)[z] -= 1.0f/2.0f/psi.spacing(z)*(psi.template get<phi>(key.move(z,1))-psi.template get<phi>(key.move(z,-1)));
+
+		++it2;
+	}
+
+	//! \cond [vort_correction] \endcond
+
+	calc_and_print_max_div_and_int(gr);
+}
+
+
+/*! \page Vortex_in_cell_petsc Vortex in Cell 3D
+ *
+ * # Step 3: Remeshing vorticity # {#vic_remesh_vort}
+ *
+ * After having initialized the vorticity on the grid, we initialize the particles
+ * at grid-like positions and we interpolate the vorticity onto the particles. Because
+ * the particle positions coincide with the grid points and the
+ * interpolation kernels are symmetric, the re-mesh step simply reduces to initializing the particles
+ * at grid-like positions and assigning the vorticity property of the particles equal to the
+ * grid vorticity.
+ *
+ * \snippet Numerics/Vortex_in_cell/main_vic_petsc.cpp remesh_part
+ *
+ */
+
+//! \cond [remesh_part] \endcond
+
+void remesh(particles_type & vd, grid_type & gr,Box<3,double> & domain)
+{
+	// Remove all particles because we reinitialize in a grid like position
+	vd.clear();
+
+	// Get a grid iterator
+	auto git = vd.getGridIterator(gr.getGridInfo().getSize());
+
+	// For each grid point
+	while (git.isNext())
+	{
+		// Get the grid point in global coordinates (key). p is in local coordinates
+		auto p = git.get();
+		auto key = git.get_dist();
+
+		// Add the particle
+		vd.add();
+
+		// Assign the position to the particle in a grid like way
+		vd.getLastPos()[x] = gr.spacing(x)*p.get(x) + domain.getLow(x);
+		vd.getLastPos()[y] = gr.spacing(y)*p.get(y) + domain.getLow(y);
+		vd.getLastPos()[z] = gr.spacing(z)*p.get(z) + domain.getLow(z);
+
+		// Initialize the vorticity
+		vd.template getLastProp<vorticity>()[x] = gr.template get<vorticity>(key)[x];
+		vd.template getLastProp<vorticity>()[y] = gr.template get<vorticity>(key)[y];
+		vd.template getLastProp<vorticity>()[z] = gr.template get<vorticity>(key)[z];
+
+		// next grid point
+		++git;
+	}
+
+	// redistribute the particles
+	vd.map();
+}
+
+//! \cond [remesh_part] \endcond
+
+
+/*! \page Vortex_in_cell_petsc Vortex in Cell 3D
+ *
+ * # Step 5: Compute velocity from vorticity # {#vic_vel_from_vort}
+ *
+ * Computing the velocity from vorticity is done in the following way. Given
+ *
+ * \f$ \vec \nabla \times u = -w \f$
+ *
+ * We introduce the stream function \f$ \phi \f$ defined by
+ *
+ * \f$ \nabla \times \phi = u \f$ (7)
+ *
+ * \f$ \nabla \cdot \phi = 0 \f$
+ *
+ * We obtain
+ *
+ * \f$ \nabla \times \nabla \times \phi = -w = \vec \nabla (\nabla \cdot  \phi) - \nabla^{2} \phi  \f$
+ *
+ * Because the divergence of \f$ \phi \f$ is defined to be zero we have
+ *
+ * \f$ \nabla^{2} \phi = w \f$
+ *
+ * The velocity can be recovered by the equation (7)
+ *
+ * Putting into code what was explained before, we again generate a Poisson
+ * object
+ *
+ * \snippet Numerics/Vortex_in_cell/main_vic_petsc.cpp poisson_obj_eq
+ *
+ * In order to calculate the velocity out of the vorticity, we solve a Poisson
+ * equation like we did for the Helmholtz projection, but we do it for each
+ * component \f$ i \f$ of the vorticity. Once we have the solution in **phi_s**
+ * we copy the result back into the grid **gr_ps**. We then evaluate the
+ * quality of the solution by printing the infinity norm of the residual and
+ * finally we save in the grid vector field **phi_v** the component \f$ i \f$
+ * (copying from phi_s to phi_v is necessary because phi_s is not a grid
+ * and cannot be used as a grid-like object)
+ *
+ * \snippet Numerics/Vortex_in_cell/main_vic_petsc.cpp solve_poisson_comp
+ *
+ * We save the component \f$ i \f$ of \f$ \phi \f$ into **phi_v**
+ *
+ * \snippet Numerics/Vortex_in_cell/main_vic_petsc.cpp copy_to_phi_v
+ *
+ * Once we have filled phi_v we can implement (7) and calculate the curl of **phi_v**
+ * to recover the velocity **v**
+ *
+ * \snippet Numerics/Vortex_in_cell/main_vic_petsc.cpp curl_phi_v
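+ *
+ * Written out, the curl computed in the snippet above is, component by component,
+ *
+ * \f[ u_x = \partial_y \phi_z - \partial_z \phi_y \qquad
+ *     u_y = \partial_z \phi_x - \partial_x \phi_z \qquad
+ *     u_z = \partial_x \phi_y - \partial_y \phi_x \f]
+ *
+ * with every derivative approximated by a second order central difference,
+ * e.g. \f$ \partial_y \phi_z \simeq (\phi_{z,j+1} - \phi_{z,j-1})/(2 h_y) \f$.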
+ *
+ *
+ */
+
+void comp_vel(Box<3,double> & domain, grid_type & g_vort,grid_type & g_vel, petsc_solver<double>::return_type (& phi_s)[3],petsc_solver<double> & solver)
+{
+	//! \cond [poisson_obj_eq] \endcond
+
+	// Convenient constant
+	constexpr unsigned int phi = 0;
+
+	// We define a field phi_f
+	typedef Field<phi,poisson_nn_helm> phi_f;
+
+	// We assemble the Poisson equation from the Laplacian of phi using the
+	// symmetric central scheme (where both first derivatives use the central scheme,
+	// not forward composed with backward as is usual)
+	typedef Lap<phi_f,poisson_nn_helm,CENTRAL_SYM> poisson;
+
+	//! \cond [poisson_obj_eq] \endcond
+
+	// Maximum size of the stencil
+	Ghost<3,long int> g(2);
+
+	// Here we create a distributed grid to store the result
+	grid_dist_id<3,double,aggregate<double>> gr_ps(g_vort.getDecomposition(),g_vort.getGridInfo().getSize(),g);
+	grid_dist_id<3,double,aggregate<double[3]>> phi_v(g_vort.getDecomposition(),g_vort.getGridInfo().getSize(),g);
+
+	// We calculate and print the maximum divergence of the vorticity
+	// and the integral of it
+	calc_and_print_max_div_and_int(g_vort);
+
+	// For each component solve a poisson
+	for (size_t i = 0 ; i < 3 ; i++)
+	{
+		//! \cond [solve_poisson_comp] \endcond
+
+		// Copy the vorticity component i in gr_ps
+		auto it2 = gr_ps.getDomainIterator();
+
+		// loop over the domain and copy component i of the vorticity
+		while (it2.isNext())
+		{
+			auto key = it2.get();
+
+			// copy
+			gr_ps.get<vorticity>(key) = g_vort.template get<vorticity>(key)[i];
+
+			++it2;
+		}
+
+		// Maximum size of the stencil
+		Ghost<3,long int> stencil_max(2);
+
+		// Finite difference scheme
+		FDScheme<poisson_nn_helm> fd(stencil_max, domain, gr_ps);
+
+		// impose the poisson equation using gr_ps = vorticity for the right-hand-side (on the full domain)
+		fd.template impose_dit<phi>(poisson(),gr_ps,gr_ps.getDomainIterator());
+
+		solver.setAbsTol(0.01);
+
+		// Get the vector that represent the right-hand-side
+		Vector<double,PETSC_BASE> & b = fd.getB();
+
+		timer tm_solve;
+		tm_solve.start();
+
+		// Give the solver A and b; in this case we also pass
+		// an initial guess phi_s calculated in the previous
+		// time step
+		solver.solve(phi_s[i],b);
+
+		tm_solve.stop();
+
+		// Calculate the residual
+
+		solError serr;
+		serr = solver.get_residual_error(phi_s[i],b);
+
+		Vcluster & v_cl = create_vcluster();
+		if (v_cl.getProcessUnitID() == 0)
+		{std::cout << "Solved component " << i << "  Error: " << serr.err_inf << std::endl;}
+
+		// copy the solution to grid
+		fd.template copy<phi>(phi_s[i],gr_ps);
+
+		//! \cond [solve_poisson_comp] \endcond
+
+		//! \cond [copy_to_phi_v] \endcond
+
+		auto it3 = gr_ps.getDomainIterator();
+
+		// copy the solution of component i into phi_v
+		while (it3.isNext())
+		{
+			auto key = it3.get();
+
+			phi_v.get<velocity>(key)[i] = gr_ps.get<phi>(key);
+
+			++it3;
+		}
+
+		//! \cond [copy_to_phi_v] \endcond
+	}
+
+	//! \cond [curl_phi_v] \endcond
+
+	phi_v.ghost_get<phi>();
+
+	double inv_dx = 0.5f / g_vort.spacing(0);
+	double inv_dy = 0.5f / g_vort.spacing(1);
+	double inv_dz = 0.5f / g_vort.spacing(2);
+
+	auto it3 = phi_v.getDomainIterator();
+
+	// calculate the velocity from the curl of phi
+	while (it3.isNext())
+	{
+		auto key = it3.get();
+
+		double phi_xy = (phi_v.get<phi>(key.move(y,1))[x] - phi_v.get<phi>(key.move(y,-1))[x])*inv_dy;
+		double phi_xz = (phi_v.get<phi>(key.move(z,1))[x] - phi_v.get<phi>(key.move(z,-1))[x])*inv_dz;
+		double phi_yx = (phi_v.get<phi>(key.move(x,1))[y] - phi_v.get<phi>(key.move(x,-1))[y])*inv_dx;
+		double phi_yz = (phi_v.get<phi>(key.move(z,1))[y] - phi_v.get<phi>(key.move(z,-1))[y])*inv_dz;
+		double phi_zx = (phi_v.get<phi>(key.move(x,1))[z] - phi_v.get<phi>(key.move(x,-1))[z])*inv_dx;
+		double phi_zy = (phi_v.get<phi>(key.move(y,1))[z] - phi_v.get<phi>(key.move(y,-1))[z])*inv_dy;
+
+		g_vel.template get<velocity>(key)[x] = phi_zy - phi_yz;
+		g_vel.template get<velocity>(key)[y] = phi_xz - phi_zx;
+		g_vel.template get<velocity>(key)[z] = phi_yx - phi_xy;
+
+		++it3;
+	}
+
+	//! \cond [curl_phi_v] \endcond
+}
+
+
+template<unsigned int prp> void set_zero(grid_type & gr)
+{
+	auto it = gr.getDomainGhostIterator();
+
+	// set the property to zero on the domain and the ghost
+	while (it.isNext())
+	{
+		auto key = it.get();
+
+		gr.template get<prp>(key)[0] = 0.0;
+		gr.template get<prp>(key)[1] = 0.0;
+		gr.template get<prp>(key)[2] = 0.0;
+
+		++it;
+	}
+}
+
+
+template<unsigned int prp> void set_zero(particles_type & vd)
+{
+	auto it = vd.getDomainIterator();
+
+	// set the property to zero on all the particles
+	while (it.isNext())
+	{
+		auto key = it.get();
+
+		vd.template getProp<prp>(key)[0] = 0.0;
+		vd.template getProp<prp>(key)[1] = 0.0;
+		vd.template getProp<prp>(key)[2] = 0.0;
+
+		++it;
+	}
+}
+
+/*! \page Vortex_in_cell_petsc Vortex in Cell 3D
+ *
+ * # Step 6: Compute right hand side # {#vic_rhs_calc}
+ *
+ * The right-hand side is computed by calculating the term
+ * \f$ (w \cdot \nabla) u \f$. For the nabla operator we use a second-order
+ * central finite-difference scheme. The commented part is the
+ * term \f$ \nu \nabla^{2} w \f$ that, as said before, we neglect
+ *
+ * \snippet Numerics/Vortex_in_cell/main_vic_petsc.cpp calc_rhs
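+ *
+ * As a sketch of the stencil used in the snippet (with \f$ i,j,k \f$ the grid
+ * indices along x, y and z), the x component of the right-hand side reads
+ *
+ * \f$ rhs_x = w_x \frac{u_x(i+1) - u_x(i-1)}{2 \Delta x} + w_y \frac{u_x(j+1) - u_x(j-1)}{2 \Delta y} + w_z \frac{u_x(k+1) - u_x(k-1)}{2 \Delta z} \f$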
+ *
+ */
+
+//! \cond [calc_rhs] \endcond
+
+// Calculate the right hand side of the vorticity formulation
+template<typename grid> void calc_rhs(grid & g_vort, grid & g_vel, grid & g_dwp)
+{
+	// useful constant
+	constexpr int rhs = 0;
+
+	// calculate several pre-factors for the stencil finite
+	// difference
+	double fac1 = 2.0f*nu/(g_vort.spacing(0));
+	double fac2 = 2.0f*nu/(g_vort.spacing(1));
+	double fac3 = 2.0f*nu/(g_vort.spacing(2));
+
+	double fac4 = 0.5f/(g_vort.spacing(0));
+	double fac5 = 0.5f/(g_vort.spacing(1));
+	double fac6 = 0.5f/(g_vort.spacing(2));
+
+	auto it = g_dwp.getDomainIterator();
+
+	g_vort.template ghost_get<vorticity>();
+
+	while (it.isNext())
+	{
+		auto key = it.get();
+
+		g_dwp.template get<rhs>(key)[x] = /*fac1*(g_vort.template get<vorticity>(key.move(x,1))[x]+g_vort.template get<vorticity>(key.move(x,-1))[x])+
+				                          fac2*(g_vort.template get<vorticity>(key.move(y,1))[x]+g_vort.template get<vorticity>(key.move(y,-1))[x])+
+										  fac3*(g_vort.template get<vorticity>(key.move(z,1))[x]+g_vort.template get<vorticity>(key.move(z,-1))[x])-
+										  2.0f*(fac1+fac2+fac3)*g_vort.template get<vorticity>(key)[x]+*/
+										  fac4*g_vort.template get<vorticity>(key)[x]*
+										  (g_vel.template get<velocity>(key.move(x,1))[x] - g_vel.template get<velocity>(key.move(x,-1))[x])+
+										  fac5*g_vort.template get<vorticity>(key)[y]*
+										  (g_vel.template get<velocity>(key.move(y,1))[x] - g_vel.template get<velocity>(key.move(y,-1))[x])+
+										  fac6*g_vort.template get<vorticity>(key)[z]*
+										  (g_vel.template get<velocity>(key.move(z,1))[x] - g_vel.template get<velocity>(key.move(z,-1))[x]);
+
+		g_dwp.template get<rhs>(key)[y] = /*fac1*(g_vort.template get<vorticity>(key.move(x,1))[y]+g_vort.template get<vorticity>(key.move(x,-1))[y])+
+				                          fac2*(g_vort.template get<vorticity>(key.move(y,1))[y]+g_vort.template get<vorticity>(key.move(y,-1))[y])+
+										  fac3*(g_vort.template get<vorticity>(key.move(z,1))[y]+g_vort.template get<vorticity>(key.move(z,-1))[y])-
+										  2.0f*(fac1+fac2+fac3)*g_vort.template get<vorticity>(key)[y]+*/
+										  fac4*g_vort.template get<vorticity>(key)[x]*
+										  (g_vel.template get<velocity>(key.move(x,1))[y] - g_vel.template get<velocity>(key.move(x,-1))[y])+
+										  fac5*g_vort.template get<vorticity>(key)[y]*
+										  (g_vel.template get<velocity>(key.move(y,1))[y] - g_vel.template get<velocity>(key.move(y,-1))[y])+
+										  fac6*g_vort.template get<vorticity>(key)[z]*
+										  (g_vel.template get<velocity>(key.move(z,1))[y] - g_vel.template get<velocity>(key.move(z,-1))[y]);
+
+
+		g_dwp.template get<rhs>(key)[z] = /*fac1*(g_vort.template get<vorticity>(key.move(x,1))[z]+g_vort.template get<vorticity>(key.move(x,-1))[z])+
+				                          fac2*(g_vort.template get<vorticity>(key.move(y,1))[z]+g_vort.template get<vorticity>(key.move(y,-1))[z])+
+										  fac3*(g_vort.template get<vorticity>(key.move(z,1))[z]+g_vort.template get<vorticity>(key.move(z,-1))[z])-
+										  2.0f*(fac1+fac2+fac3)*g_vort.template get<vorticity>(key)[z]+*/
+										  fac4*g_vort.template get<vorticity>(key)[x]*
+										  (g_vel.template get<velocity>(key.move(x,1))[z] - g_vel.template get<velocity>(key.move(x,-1))[z])+
+										  fac5*g_vort.template get<vorticity>(key)[y]*
+										  (g_vel.template get<velocity>(key.move(y,1))[z] - g_vel.template get<velocity>(key.move(y,-1))[z])+
+										  fac6*g_vort.template get<vorticity>(key)[z]*
+										  (g_vel.template get<velocity>(key.move(z,1))[z] - g_vel.template get<velocity>(key.move(z,-1))[z]);
+
+		++it;
+	}
+}
+
+//! \cond [calc_rhs] \endcond
+
+/*! \page Vortex_in_cell_petsc Vortex in Cell 3D
+ *
+ * # Step 8: Runge-Kutta # {#vic_runge_kutta1}
+ *
+ * Here we do the first step of the Runge-Kutta update. In particular we
+ * update the vorticity and position of the particles. The right-hand side
+ * of the vorticity update is calculated on the grid and interpolated
+ * onto the particles. The second-order Runge-Kutta scheme
+ * requires the following update for the vorticity and position as first step
+ *
+ * \f$ \boldsymbol w = \boldsymbol w + \frac{1}{2} \boldsymbol {rhs} \delta t \f$
+ *
+ * \f$ \boldsymbol x = \boldsymbol x + \frac{1}{2} \boldsymbol u \delta t \f$
+ *
+ * \snippet Numerics/Vortex_in_cell/main_vic_petsc.cpp runge_kutta_1
+ *
+ */
+
+//! \cond [runge_kutta_1] \endcond
+
+void rk_step1(particles_type & particles)
+{
+	auto it = particles.getDomainIterator();
+
+	while (it.isNext())
+	{
+		auto key = it.get();
+
+		// Save the old vorticity
+		particles.getProp<old_vort>(key)[x] = particles.getProp<vorticity>(key)[x];
+		particles.getProp<old_vort>(key)[y] = particles.getProp<vorticity>(key)[y];
+		particles.getProp<old_vort>(key)[z] = particles.getProp<vorticity>(key)[z];
+
+		particles.getProp<vorticity>(key)[x] = particles.getProp<vorticity>(key)[x] + 0.5f * dt * particles.getProp<rhs_part>(key)[x];
+		particles.getProp<vorticity>(key)[y] = particles.getProp<vorticity>(key)[y] + 0.5f * dt * particles.getProp<rhs_part>(key)[y];
+		particles.getProp<vorticity>(key)[z] = particles.getProp<vorticity>(key)[z] + 0.5f * dt * particles.getProp<rhs_part>(key)[z];
+
+		++it;
+	}
+
+	auto it2 = particles.getDomainIterator();
+
+	while (it2.isNext())
+	{
+		auto key = it2.get();
+
+		// Save the old position
+		particles.getProp<old_pos>(key)[x] = particles.getPos(key)[x];
+		particles.getProp<old_pos>(key)[y] = particles.getPos(key)[y];
+		particles.getProp<old_pos>(key)[z] = particles.getPos(key)[z];
+
+		particles.getPos(key)[x] = particles.getPos(key)[x] + 0.5f * dt * particles.getProp<p_vel>(key)[x];
+		particles.getPos(key)[y] = particles.getPos(key)[y] + 0.5f * dt * particles.getProp<p_vel>(key)[y];
+		particles.getPos(key)[z] = particles.getPos(key)[z] + 0.5f * dt * particles.getProp<p_vel>(key)[z];
+
+		++it2;
+	}
+
+	particles.map();
+}
+
+//! \cond [runge_kutta_1] \endcond
+
+/*! \page Vortex_in_cell_petsc Vortex in Cell 3D
+ *
+ * # Step 13: Runge-Kutta # {#vic_runge_kutta2}
+ *
+ * Here we do the second step of the Runge-Kutta update. In particular we
+ * update the vorticity and position of the particles. The right-hand side
+ * of the vorticity update is calculated on the grid and interpolated
+ * onto the particles. The second-order Runge-Kutta scheme
+ * requires the following update for the vorticity and position as second step
+ *
+ * \f$ \boldsymbol w = \boldsymbol w_{old} + \boldsymbol {rhs} \delta t \f$
+ *
+ * \f$ \boldsymbol x = \boldsymbol x_{old} + \boldsymbol u \delta t \f$
+ *
+ * \snippet Numerics/Vortex_in_cell/main_vic_petsc.cpp runge_kutta_2
+ *
+ */
+
+//! \cond [runge_kutta_2] \endcond
+
+void rk_step2(particles_type & particles)
+{
+	auto it = particles.getDomainIterator();
+
+	while (it.isNext())
+	{
+		auto key = it.get();
+
+		particles.getProp<vorticity>(key)[x] = particles.getProp<old_vort>(key)[x] + dt * particles.getProp<rhs_part>(key)[x];
+		particles.getProp<vorticity>(key)[y] = particles.getProp<old_vort>(key)[y] + dt * particles.getProp<rhs_part>(key)[y];
+		particles.getProp<vorticity>(key)[z] = particles.getProp<old_vort>(key)[z] + dt * particles.getProp<rhs_part>(key)[z];
+
+		++it;
+	}
+
+	auto it2 = particles.getDomainIterator();
+
+	while (it2.isNext())
+	{
+		auto key = it2.get();
+
+		particles.getPos(key)[x] = particles.getProp<old_pos>(key)[x] + dt * particles.getProp<p_vel>(key)[x];
+		particles.getPos(key)[y] = particles.getProp<old_pos>(key)[y] + dt * particles.getProp<p_vel>(key)[y];
+		particles.getPos(key)[z] = particles.getProp<old_pos>(key)[z] + dt * particles.getProp<p_vel>(key)[z];
+
+		++it2;
+	}
+
+	particles.map();
+}
+
+
+
+//! \cond [runge_kutta_2] \endcond
+
+/*! \page Vortex_in_cell_petsc Vortex in Cell 3D
+ *
+ * # Step 4-5-6-7: Do step # {#vic_do_step}
+ *
+ * The do step function assembles multiple steps, some of them already explained.
+ * First we interpolate the vorticity from the particles to the mesh
+ *
+ * \snippet Numerics/Vortex_in_cell/main_vic_petsc.cpp do_step_p2m
+ *
+ * then we calculate the velocity from the vorticity and the right-hand side,
+ * recalling steps 5 and 6
+ *
+ * \snippet Numerics/Vortex_in_cell/main_vic_petsc.cpp step_56
+ *
+ * Finally we interpolate velocity and right-hand-side back to particles
+ *
+ * \snippet Numerics/Vortex_in_cell/main_vic_petsc.cpp inte_m2p
+ *
+ */
+
+template<typename grid, typename vector> void do_step(vector & particles,
+		                                              grid & g_vort,
+													  grid & g_vel,
+													  grid & g_dvort,
+													  Box<3,double> & domain,
+													  interpolate<particles_type,grid_type,mp4_kernel<double>> & inte,
+													  petsc_solver<double>::return_type (& phi_s)[3],
+													  petsc_solver<double> & solver)
+{
+	constexpr int rhs = 0;
+
+	//! \cond [do_step_p2m] \endcond
+
+	set_zero<vorticity>(g_vort);
+	inte.template p2m<vorticity,vorticity>(particles,g_vort);
+
+	g_vort.template ghost_put<add_,vorticity>();
+
+	//! \cond [do_step_p2m] \endcond
+
+	//! \cond [step_56] \endcond
+
+	// Calculate velocity from vorticity
+	comp_vel(domain,g_vort,g_vel,phi_s,solver);
+	calc_rhs(g_vort,g_vel,g_dvort);
+
+	//! \cond [step_56] \endcond
+
+	//! \cond [inte_m2p] \endcond
+
+	g_dvort.template ghost_get<rhs>();
+	g_vel.template ghost_get<velocity>();
+	set_zero<rhs_part>(particles);
+	set_zero<p_vel>(particles);
+	inte.template m2p<rhs,rhs_part>(g_dvort,particles);
+	inte.template m2p<velocity,p_vel>(g_vel,particles);
+
+	//! \cond [inte_m2p] \endcond
+}
+
+template<typename vector, typename grid> void check_point_and_save(vector & particles,
+																   grid & g_vort,
+																   grid & g_vel,
+																   grid & g_dvort,
+																   size_t i)
+{
+	Vcluster & v_cl = create_vcluster();
+
+	if (v_cl.getProcessingUnits() < 24)
+	{
+		particles.write("part_out_" + std::to_string(i),VTK_WRITER | FORMAT_BINARY);
+		g_vort.template ghost_get<vorticity>();
+		g_vel.template ghost_get<velocity>();
+		g_vel.write("grid_velocity_" + std::to_string(i), VTK_WRITER | FORMAT_BINARY);
+		g_vort.write("grid_vorticity_" + std::to_string(i), VTK_WRITER | FORMAT_BINARY);
+	}
+
+	// In order to reduce the size of the saved data we apply a threshold.
+	// We only save particles with vorticity higher than 0.1
+	vector_dist<3,double,aggregate<double>> part_save(particles.getDecomposition(),0);
+
+	auto it_s = particles.getDomainIterator();
+
+	while (it_s.isNext())
+	{
+		auto p = it_s.get();
+
+		double vort_magn = sqrt(particles.template getProp<vorticity>(p)[0] * particles.template getProp<vorticity>(p)[0] +
+							   particles.template getProp<vorticity>(p)[1] * particles.template getProp<vorticity>(p)[1] +
+							   particles.template getProp<vorticity>(p)[2] * particles.template getProp<vorticity>(p)[2]);
+
+		if (vort_magn < 0.1)
+		{
+			++it_s;
+			continue;
+		}
+
+		part_save.add();
+
+		part_save.getLastPos()[0] = particles.getPos(p)[0];
+		part_save.getLastPos()[1] = particles.getPos(p)[1];
+		part_save.getLastPos()[2] = particles.getPos(p)[2];
+
+		part_save.template getLastProp<0>() = vort_magn;
+
+		++it_s;
+	}
+
+	part_save.map();
+
+	// Save an HDF5 file for checkpoint restart
+	particles.save("check_point");
+}
+
+/*! \page Vortex_in_cell_petsc Vortex in Cell 3D
+ *
+ * # Main # {#vic_main}
+ *
+ * The main function, as usual, calls **openfpm_init** and defines
+ * the domain where the simulation takes place, the ghost size of the grid,
+ * the size of the grid in grid units in each direction and the periodicity
+ * of the domain, in this case PERIODIC in each dimension. We then create
+ * our basic data structures for the simulation: a grid for the vorticity
+ * **g_vort**, a grid for the velocity **g_vel**, a grid for the right-hand side
+ * of the vorticity update, and the **particles** vector. Additionally
+ * we define the data structure **phi_s[3]** that stores the velocity solution
+ * of the previous time-step. Keeping track of the previous solution
+ * helps the iterative solver to find the solution more quickly:
+ * using the old configuration as initial guess, the solver will
+ * converge in a few iterations, refining the old one.
+ *
+ */
+int main(int argc, char* argv[])
+{
+	// Initialize
+	openfpm_init(&argc,&argv);
+	{
+	// Domain, a rectangular box
+	Box<3,double> domain({0.0,0.0,0.0},{22.0,5.57,5.57});
+
+	// Ghost (Not important in this case but required)
+	Ghost<3,long int> g(2);
+
+	// Grid points: x=512 y=64 z=64
+	long int sz[] = {512,64,64};
+	size_t szu[] = {(size_t)sz[0],(size_t)sz[1],(size_t)sz[2]};
+
+	periodicity<3> bc = {{PERIODIC,PERIODIC,PERIODIC}};
+
+	grid_type g_vort(szu,domain,g,bc);
+	grid_type g_vel(g_vort.getDecomposition(),szu,g);
+	grid_type g_dvort(g_vort.getDecomposition(),szu,g);
+	particles_type particles(g_vort.getDecomposition(),0);
+
+	// It stores the solution used to compute the velocity.
+	// It is used as initial guess every time we call the solver
+	Vector<double,PETSC_BASE> phi_s[3];
+	Vector<double,PETSC_BASE> x_;
+
+	// Parallel object
+	Vcluster & v_cl = create_vcluster();
+
+	// print out the spacing of the grid
+	if (v_cl.getProcessUnitID() == 0)
+	{std::cout << "SPACING " << g_vort.spacing(0) << " " << g_vort.spacing(1) << " " << g_vort.spacing(2) << std::endl;}
+
+	// initialize the vortex ring (step 1)
+	init_ring(g_vort,domain);
+
+	x_.resize(g_vort.size(),g_vort.getLocalDomainSize());
+	x_.setZero();
+
+	// Create a PETSC solver to get the solution x
+	petsc_solver<double> solver;
+
+	// Do the Helmholtz-Hodge projection (step 2)
+	helmotz_hodge_projection(g_vort,domain,solver,x_,true);
+
+	// initialize the particle on a mesh step 3
+	remesh(particles,g_vort,domain);
+
+	// Get the total number of particles
+	size_t tot_part = particles.size_local();
+	v_cl.sum(tot_part);
+	v_cl.execute();
+
+	// Now we set the size of phi_s
+	phi_s[0].resize(g_vort.size(),g_vort.getLocalDomainSize());
+	phi_s[1].resize(g_vort.size(),g_vort.getLocalDomainSize());
+	phi_s[2].resize(g_vort.size(),g_vort.getLocalDomainSize());
+
+	// And we initially set it to zero
+	phi_s[0].setZero();
+	phi_s[1].setZero();
+	phi_s[2].setZero();
+
+	// We create the interpolation object outside the loop
+	// to avoid recreating it at every cycle. Internally this object
+	// creates some data structures to optimize the conversion from particle
+	// position to sub-domain. If there is no re-balancing it is safe
+	// to reuse it
+	interpolate<particles_type,grid_type,mp4_kernel<double>> inte(particles,g_vort);
+
+	// With more than 24 cores we rely on the HDF5 checkpoint-restart file
+	if (v_cl.getProcessingUnits() < 24)
+	{g_vort.write("grid_vorticity_init", VTK_WRITER | FORMAT_BINARY);}
+
+
+	// Before entering the loop we check if we want to restart from
+	// a previous simulation
+	size_t i = 0;
+
+	// if we have an argument
+	if (argc > 1)
+	{
+		// take that argument
+		std::string restart(argv[1]);
+
+		// convert it into a number
+		i = std::stoi(restart);
+
+		// load the file
+		particles.load("check_point_" + std::to_string(i));
+
+		// Print to inform that we are restarting from a
+		// particular position
+		if (v_cl.getProcessUnitID() == 0)
+		{std::cout << "Restarting from " << i << std::endl;}
+	}
+
+	// Time Integration
+	for ( ; i < 10001 ; i++)
+	{
+		// do step 4-5-6-7
+		do_step(particles,g_vort,g_vel,g_dvort,domain,inte,phi_s,solver);
+
+		// do step 8
+		rk_step1(particles);
+
+		// do step 9-10-11-12
+		do_step(particles,g_vort,g_vel,g_dvort,domain,inte,phi_s,solver);
+
+		// do step 13
+		rk_step2(particles);
+
+		// do step 14
+		set_zero<vorticity>(g_vort);
+		inte.template p2m<vorticity,vorticity>(particles,g_vort);
+		g_vort.template ghost_put<add_,vorticity>();
+
+		// Helmholtz-Hodge projection
+		helmotz_hodge_projection(g_vort,domain,solver,x_,false);
+
+		remesh(particles,g_vort,domain);
+
+		// print the step number
+		if (v_cl.getProcessUnitID() == 0)
+		{std::cout << "Step " << i << std::endl;}
+
+		// every 100 steps write the output
+		if (i % 100 == 0)		{check_point_and_save(particles,g_vort,g_vel,g_dvort,i);}
+
+	}
+	}
+
+	openfpm_finalize();
+}
+
diff --git a/example/Vector/7_SPH_dlb_opt/main_dbg.cpp b/example/Vector/7_SPH_dlb_opt/main_dbg.cpp
new file mode 100644
index 00000000..d49c96ad
--- /dev/null
+++ b/example/Vector/7_SPH_dlb_opt/main_dbg.cpp
@@ -0,0 +1,1254 @@
+/*!
+ * \page Vector_7_sph_dlb_dbg Vector 7 SPH Dam break simulation (Debugging video)
+ *
+ *
+ * [TOC]
+ *
+ *
+ * # SPH with Dynamic load Balancing (Debugging video) # {#SPH_dlb}
+ *
+ * \htmlonly
+ * <img src="http://ppmcore.mpi-cbg.de/web/images/examples/7_SPH_dlb/dam_break_all.jpg"/>
+ * \endhtmlonly
+ *
+ * ## Inclusion ## {#e7_sph_inclusion}
+ *
+ *
+ * \snippet Vector/7_SPH_dlb/main.cpp inclusion
+ *
+ */
+
+//#define SE_CLASS1
+//#define STOP_ON_ERROR
+
+#include "Vector/vector_dist.hpp"
+#include <math.h>
+#include "Draw/DrawParticles.hpp"
+
+
+// A constant to indicate boundary particles
+#define BOUNDARY 0
+
+// A constant to indicate fluid particles
+#define FLUID 1
+
+size_t cnt = 0;
+
+// initial spacing between particles dp in the formulas
+const double dp = 0.0085;
+// Maximum height of the fluid;
+// it is going to be calculated and filled later on
+double h_swl = 0.0;
+
+// c_s in the formulas (constant used to calculate the sound speed)
+const double coeff_sound = 20.0;
+
+// gamma in the formulas
+const double gamma_ = 7.0;
+
+// sqrt(3.0*dp*dp) support of the kernel
+const double H = 0.0147224318643;
+
+// Eta in the formulas
+const double Eta2 = 0.01 * H*H;
+
+// alpha in the formula
+const double visco = 0.1;
+
+// cbar in the formula (calculated later)
+double cbar = 0.0;
+
+// Mass of the fluid particles
+const double MassFluid = 0.000614125;
+
+// Mass of the boundary particles
+const double MassBound = 0.000614125;
+
+// End simulation time
+const double t_end = 1.5;
+
+// Gravity acceleration
+const double gravity = 9.81;
+
+// Reference density 1000 kg/m^3
+const double rho_zero = 1000.0;
+
+// Filled later, it requires h_swl; it is B in the formulas
+double B = 0.0;
+
+// Constant used to define time integration
+const double CFLnumber = 0.2;
+
+// Minimum allowed time step
+const double DtMin = 0.00001;
+
+// Minimum Rho allowed
+const double RhoMin = 700.0;
+
+// Maximum Rho allowed
+const double RhoMax = 1300.0;
+
+// Filled in initialization
+double max_fluid_height = 0.0;
+
+// Properties
+
+// FLUID or BOUNDARY
+const size_t type = 0;
+
+// Density
+const int rho = 1;
+
+// Density at step n-1
+const int rho_prev = 2;
+
+// Pressure
+const int Pressure = 3;
+
+// Delta rho calculated in the force calculation
+const int drho = 4;
+
+// calculated force
+const int force = 5;
+
+// velocity
+const int velocity = 6;
+
+// velocity at previous step
+const int velocity_prev = 7;
+
+struct nb_f
+{
+	size_t id;
+	double fact;
+
+	// Used to reorder the neighborhood particles by id
+	bool operator<(const struct nb_f & pag) const
+	{
+		return (id < pag.id);
+	}
+};
+
+// Type of the vector containing particles
+typedef vector_dist<3,double,aggregate<size_t,double,  double,    double,     double,     double[3], double[3], double[3],openfpm::vector<nb_f>, openfpm::vector<nb_f>, double[3], double , size_t>> particles;
+//                                       |      |        |          |            |            |         |            |
+//                                       |      |        |          |            |            |         |            |
+//                                     type   density   density    Pressure    delta       force     velocity    velocity
+//                                                      at n-1                 density                           at n - 1
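+//
+// The remaining properties (8-12) are debug helpers: two neighbour lists (nb_f),
+// a reference force, a reference delta density and a global particle id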
+
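+// Load-balancing model used for the domain decomposition: every sub-sub-domain
+// accumulates a computational cost (fluid particles count 4, boundary particles 3)
+// and applyModel squares the accumulated cost before the decomposition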
+struct ModelCustom
+{
+	template<typename Decomposition, typename vector> inline void addComputation(Decomposition & dec, const vector & vd, size_t v, size_t p)
+	{
+		if (vd.template getProp<type>(p) == FLUID)
+			dec.addComputationCost(v,4);
+		else
+			dec.addComputationCost(v,3);
+	}
+
+	template<typename Decomposition> inline void applyModel(Decomposition & dec, size_t v)
+	{
+		dec.setSubSubDomainComputationCost(v, dec.getSubSubDomainComputationCost(v) * dec.getSubSubDomainComputationCost(v));
+	}
+};
+
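+// Equation of state: P = B ((rho/rho_zero)^gamma - 1) with gamma = 7,
+// the power written as explicit multiplications to avoid a pow() call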
+inline void EqState(particles & vd)
+{
+	auto it = vd.getDomainIterator();
+
+	while (it.isNext())
+	{
+		auto a = it.get();
+
+		double rho_a = vd.template getProp<rho>(a);
+		double rho_frac = rho_a / rho_zero;
+
+		vd.template getProp<Pressure>(a) = B*( rho_frac*rho_frac*rho_frac*rho_frac*rho_frac*rho_frac*rho_frac - 1.0);
+
+		++it;
+	}
+}
+
+
+
+const double a2 = 1.0/M_PI/H/H/H;
+
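+// Cubic spline kernel W(q) with q = r/H: (1 - 3/2 q^2 + 3/4 q^3) a2 for q < 1,
+// (1/4)(2 - q)^3 a2 for 1 <= q < 2, and 0 otherwise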
+inline double Wab(double r)
+{
+	r /= H;
+
+	if (r < 1.0)
+		return (1.0 - 3.0/2.0*r*r + 3.0/4.0*r*r*r)*a2;
+	else if (r < 2.0)
+		return (1.0/4.0*(2.0 - r)*(2.0 - r)*(2.0 - r))*a2;
+	else
+		return 0.0;
+}
+
+
+
+const double c1 = -3.0/M_PI/H/H/H/H;
+const double d1 = 9.0/4.0/M_PI/H/H/H/H;
+const double c2 = -3.0/4.0/M_PI/H/H/H/H;
+const double a2_4 = 0.25*a2;
+// Filled later
+double W_dap = 0.0;
+
+inline void DWab(Point<3,double> & dx, Point<3,double> & DW, double r, bool print)
+{
+	const double qq=r/H;
+
+	if (qq < 1.0)
+	{
+		double qq2 = qq * qq;
+		double fac = (c1*qq + d1*qq2)/r;
+
+		DW.get(0) = fac*dx.get(0);
+		DW.get(1) = fac*dx.get(1);
+		DW.get(2) = fac*dx.get(2);
+	}
+	else if (qq < 2.0)
+	{
+		double wqq = (2.0 - qq);
+		double fac = c2 * wqq * wqq / r;
+
+		DW.get(0) = fac * dx.get(0);
+		DW.get(1) = fac * dx.get(1);
+		DW.get(2) = fac * dx.get(2);
+	}
+	else
+	{
+		DW.get(0) = 0.0;
+		DW.get(1) = 0.0;
+		DW.get(2) = 0.0;
+	}
+}
+
+
+// Tensile correction
+inline double Tensile(double r, double rhoa, double rhob, double prs1, double prs2)
+{
+	const double qq=r/H;
+	//-Cubic Spline kernel
+	double wab;
+	if(r>H)
+	{
+		double wqq1=2.0f-qq;
+		double wqq2=wqq1*wqq1;
+
+		wab=a2_4*(wqq2*wqq1);
+	}
+	else
+	{
+	    double wqq2=qq*qq;
+	    double wqq3=wqq2*qq;
+
+	    wab=a2*(1.0f-1.5f*wqq2+0.75f*wqq3);
+	}
+
+	//-Tensile correction.
+	double fab=wab*W_dap;
+	fab*=fab; fab*=fab; //fab=fab^4
+	const double tensilp1=(prs1/(rhoa*rhoa))*(prs1>0? 0.01: -0.2);
+	const double tensilp2=(prs2/(rhob*rhob))*(prs2>0? 0.01: -0.2);
+
+	return (fab*(tensilp1+tensilp2));
+}
+
+
+inline double Pi(const Point<3,double> & dr, double rr2, Point<3,double> & dv, double rhoa, double rhob, double massb, double & visc)
+{
+	const double dot = dr.get(0)*dv.get(0) + dr.get(1)*dv.get(1) + dr.get(2)*dv.get(2);
+	const double dot_rr2 = dot/(rr2+Eta2);
+	visc=std::max(dot_rr2,visc);
+
+	if(dot < 0)
+	{
+		const float amubar=H*dot_rr2;
+		const float robar=(rhoa+rhob)*0.5f;
+		const float pi_visc=(-visco*cbar*amubar/robar);
+
+		return pi_visc;
+    }
+	else
+		return 0.0;
+}
+
+
+template<typename VerletList> inline double calc_forces(particles & vd, VerletList & NN, double & max_visc, double r_gskin)
+{
+	auto NN2 = vd.getCellList(r_gskin);
+
+	// Reset the ghost
+    auto itg = vd.getDomainAndGhostIterator();
+    while (itg.isNext())
+    {
+        auto p = itg.get();
+        // Reset force
+
+        if (p.getKey() < vd.size_local())
+        {
+			// Reset the force counter (-gravity on the z direction)
+			vd.template getProp<force>(p)[0] = 0.0;
+			vd.template getProp<force>(p)[1] = 0.0;
+			vd.template getProp<force>(p)[2] = -gravity;
+			vd.template getProp<drho>(p) = 0.0;
+
+
+			// Reset the reference (debug) force counter (-gravity on the z direction)
+			vd.template getProp<10>(p)[0] = 0.0;
+			vd.template getProp<10>(p)[1] = 0.0;
+			vd.template getProp<10>(p)[2] = -gravity;
+			vd.template getProp<11>(p) = 0.0;
+        }
+        else
+        {
+			vd.getProp<force>(p)[0] = 0.0;
+			vd.getProp<force>(p)[1] = 0.0;
+			vd.getProp<force>(p)[2] = 0.0;
+
+			vd.getProp<drho>(p) = 0.0;
+        }
+
+
+        vd.getProp<8>(p).clear();
+        vd.getProp<9>(p).clear();
+
+        ++itg;
+    }
+
+    // Get an iterator over particles
+    auto part = vd.getParticleIteratorCRS(NN.getInternalCellList());
+
+	double visc = 0;
+
+	// For each particle ...
+	while (part.isNext())
+	{
+		// ... a
+		auto a = part.get();
+
+		// Get the position xp of the particle
+		Point<3,double> xa = vd.getPos(a);
+
+		// Take the mass of the particle depending on whether it is FLUID or BOUNDARY
+		double massa = (vd.getProp<type>(a) == FLUID)?MassFluid:MassBound;
+
+		// Get the density of the particle a
+		double rhoa = vd.getProp<rho>(a);
+
+		// Get the pressure of the particle a
+		double Pa = vd.getProp<Pressure>(a);
+
+		// Get the Velocity of the particle a
+		Point<3,double> va = vd.getProp<velocity>(a);
+
+		// We treat FLUID particles differently from BOUNDARY particles ...
+		if (vd.getProp<type>(a) != FLUID)
+		{
+			// If it is a boundary particle calculate the delta rho based on equation 2.
+			// This requires running across the neighborhood particles of a
+			auto Np = NN.template getNNIterator<NO_CHECK>(a);
+
+			// For each neighborhood particle
+			while (Np.isNext() == true)
+			{
+				// ... q
+				auto b = Np.get();
+
+				// Get the position xp of the particle
+				Point<3,double> xb = vd.getPos(b);
+
+				// if (p == q) skip this particle
+				if (a == b)	{++Np; continue;};
+
+				// get the mass of the particle
+				double massb = (vd.getProp<type>(b) == FLUID)?MassFluid:MassBound;
+
+				// Get the velocity of the particle b
+				Point<3,double> vb = vd.getProp<velocity>(b);
+
+				// Get the pressure and density of particle b
+				double Pb = vd.getProp<Pressure>(b);
+				double rhob = vd.getProp<rho>(b);
+
+				// Get the distance between p and q
+				Point<3,double> dr = xa - xb;
+				// take the squared norm of this vector
+				double r2 = norm2(dr);
+
+				// If the particles interact ...
+				if (r2 < 4.0*H*H)
+				{
+					// ... calculate delta rho
+					double r = sqrt(r2);
+
+					Point<3,double> dv = va - vb;
+
+					Point<3,double> DW;
+					DWab(dr,DW,r,false);
+
+					const double dot = dr.get(0)*dv.get(0) + dr.get(1)*dv.get(1) + dr.get(2)*dv.get(2);
+					const double dot_rr2 = dot/(r2+Eta2);
+					max_visc=std::max(dot_rr2,max_visc);
+
+					double scal = (dv.get(0)*DW.get(0)+dv.get(1)*DW.get(1)+dv.get(2)*DW.get(2));
+
+					vd.getProp<drho>(a) += massb*scal;
+					vd.getProp<drho>(b) += massa*scal;
+
+					struct nb_f nna;
+					nna.id = b;
+					nna.fact = 0.0;
+
+					struct nb_f nnb;
+
+					// If it is a fluid we have to calculate force
+					if (vd.getProp<type>(b) == FLUID)
+					{
+						Point<3,double> v_rel = va - vb;
+						double factor = - ((vd.getProp<Pressure>(a) + vd.getProp<Pressure>(b)) / (rhoa * rhob) + Tensile(r,rhoa,rhob,Pa,Pb) + Pi(dr,r2,v_rel,rhoa,rhob,massb,visc));
+
+						vd.getProp<force>(b)[0] -= massa * factor * DW.get(0);
+						vd.getProp<force>(b)[1] -= massa * factor * DW.get(1);
+						vd.getProp<force>(b)[2] -= massa * factor * DW.get(2);
+
+						nnb.id = vd.getProp<12>(a);
+						nnb.fact = -massa * factor * DW.get(2);
+					}
+					else
+					{
+						struct nb_f nnb;
+						nnb.id = vd.getProp<12>(a);
+						nnb.fact = 0.0;
+					}
+
+					vd.getProp<9>(a).add(nna);
+					vd.getProp<9>(b).add(nnb);
+
+				}
+
+				++Np;
+			}
+
+			////////////////////// NN2
+
+			// Get an iterator over the neighborhood particles of p
+			auto Np2 = NN2.template getNNIterator<NO_CHECK>(NN2.getCell(vd.getPos(a)));
+
+			// For each neighborhood particle
+			while (Np2.isNext() == true)
+			{
+				// ... q
+				auto b = Np2.get();
+
+				// Get the position xp of the particle
+				Point<3,double> xb = vd.getPos(b);
+
+				// if (p == q) skip this particle
+				if (a == b)	{++Np2; continue;};
+
+				// get the mass of the particle
+				double massb = (vd.getProp<type>(b) == FLUID)?MassFluid:MassBound;
+
+				// Get the velocity of the particle b
+				Point<3,double> vb = vd.getProp<velocity>(b);
+
+				// Get the pressure and density of particle b
+				double Pb = vd.getProp<Pressure>(b);
+				double rhob = vd.getProp<rho>(b);
+
+				// Get the distance between p and q
+				Point<3,double> dr = xa - xb;
+				// take the squared norm of this vector
+				double r2 = norm2(dr);
+
+				// If the particles interact ...
+				if (r2 < 4.0*H*H)
+				{
+					// ... calculate delta rho
+					double r = sqrt(r2);
+
+					Point<3,double> dv = va - vb;
+
+					Point<3,double> DW;
+					DWab(dr,DW,r,false);
+
+					const double dot = dr.get(0)*dv.get(0) + dr.get(1)*dv.get(1) + dr.get(2)*dv.get(2);
+					const double dot_rr2 = dot/(r2+Eta2);
+
+					vd.getProp<11>(a) += massb*(dv.get(0)*DW.get(0)+dv.get(1)*DW.get(1)+dv.get(2)*DW.get(2));
+
+
+					struct nb_f nna;
+					nna.id = vd.getProp<12>(b);
+					nna.fact = 0.0;
+
+					vd.getProp<8>(a).add(nna);
+				}
+
+				++Np2;
+			}
+
+		}
+		else
+		{
+			// If it is a fluid particle calculate based on equation 1 and 2
+
+			// Get an iterator over the neighborhood particles of p
+			auto Np = NN.template getNNIterator<NO_CHECK>(a);
+
+			// For each neighborhood particle
+			while (Np.isNext() == true)
+			{
+				// ... q
+				auto b = Np.get();
+
+				// Get the position xp of the particle
+				Point<3,double> xb = vd.getPos(b);
+
+				// if (p == q) skip this particle
+				if (a == b)	{++Np; continue;};
+
+				double massb = (vd.getProp<type>(b) == FLUID)?MassFluid:MassBound;
+				Point<3,double> vb = vd.getProp<velocity>(b);
+				double Pb = vd.getProp<Pressure>(b);
+				double rhob = vd.getProp<rho>(b);
+
+				// Get the distance between p and q
+				Point<3,double> dr = xa - xb;
+				// take the squared norm of this vector
+				double r2 = norm2(dr);
+
+				// if they interact
+				if (r2 < 4.0*H*H)
+				{
+					double r = sqrt(r2);
+
+					Point<3,double> v_rel = va - vb;
+
+					Point<3,double> DW;
+					DWab(dr,DW,r,false);
+
+					double factor = - ((vd.getProp<Pressure>(a) + vd.getProp<Pressure>(b)) / (rhoa * rhob) + Tensile(r,rhoa,rhob,Pa,Pb) + Pi(dr,r2,v_rel,rhoa,rhob,massb,visc));
+
+					vd.getProp<force>(a)[0] += massb * factor * DW.get(0);
+					vd.getProp<force>(a)[1] += massb * factor * DW.get(1);
+					vd.getProp<force>(a)[2] += massb * factor * DW.get(2);
+
+					vd.getProp<force>(b)[0] -= massa * factor * DW.get(0);
+					vd.getProp<force>(b)[1] -= massa * factor * DW.get(1);
+					vd.getProp<force>(b)[2] -= massa * factor * DW.get(2);
+
+					double scal = (v_rel.get(0)*DW.get(0)+v_rel.get(1)*DW.get(1)+v_rel.get(2)*DW.get(2));
+
+					vd.getProp<drho>(a) += massb*scal;
+					vd.getProp<drho>(b) += massa*scal;
+
+
+					if (vd.getProp<12>(a) == 15604 && vd.getProp<12>(b) == 15601)
+					{
+						std::cout << "DETECTED ERROR "  << std::endl;
+					}
+
+					if (vd.getProp<12>(b) == 15604 && vd.getProp<12>(a) == 15601)
+					{
+						std::cout << "DETECTED ERROR " << create_vcluster().getProcessUnitID() << "   a " << a << "  b " << b << "    " << vd.size_local_with_ghost()  << std::endl;
+					}
+
+
+					struct nb_f nna;
+					nna.id = vd.getProp<12>(b);
+					nna.fact = massb * factor * DW.get(2);
+
+					struct nb_f nnb;
+					nnb.id = vd.getProp<12>(a);
+					nnb.fact = -massa * factor * DW.get(2);
+
+					vd.getProp<9>(a).add(nna);
+					vd.getProp<9>(b).add(nnb);
+				}
+
+				++Np;
+			}
+
+
+			///////////////////// NN2
+
+
+			////////////////////// NN2
+
+			// Get an iterator over the neighborhood particles of p
+			auto Np2 = NN2.template getNNIterator<NO_CHECK>(NN2.getCell(vd.getPos(a)));
+
+			if (a == 0 && create_vcluster().getProcessUnitID() == 0)
+			{
+//				std::cout << "Particle 0" << std::endl;
+			}
+
+			// For each neighborhood particle
+			while (Np2.isNext() == true)
+			{
+				// ... q
+				auto b = Np2.get();
+
+				// Get the position xp of the particle
+				Point<3,double> xb = vd.getPos(b);
+
+				// if (p == q) skip this particle
+				if (a == b)	{++Np2; continue;};
+
+				double massb = (vd.getProp<type>(b) == FLUID)?MassFluid:MassBound;
+				Point<3,double> vb = vd.getProp<velocity>(b);
+				double Pb = vd.getProp<Pressure>(b);
+				double rhob = vd.getProp<rho>(b);
+
+				// Get the distance between p and q
+				Point<3,double> dr = xa - xb;
+				// take the squared norm of this vector
+				double r2 = norm2(dr);
+
+				// if they interact
+				if (r2 < 4.0*H*H)
+				{
+					double r = sqrt(r2);
+
+					Point<3,double> v_rel = va - vb;
+
+					Point<3,double> DW;
+					DWab(dr,DW,r,false);
+
+					double factor = - massb*((vd.getProp<Pressure>(a) + vd.getProp<Pressure>(b)) / (rhoa * rhob) + Tensile(r,rhoa,rhob,Pa,Pb) + Pi(dr,r2,v_rel,rhoa,rhob,massb,visc));
+
+					vd.getProp<10>(a)[0] += factor * DW.get(0);
+					vd.getProp<10>(a)[1] += factor * DW.get(1);
+					vd.getProp<10>(a)[2] += factor * DW.get(2);
+
+					vd.getProp<11>(a) += massb*(v_rel.get(0)*DW.get(0)+v_rel.get(1)*DW.get(1)+v_rel.get(2)*DW.get(2));
+
+					struct nb_f nna;
+					nna.id = vd.getProp<12>(b);
+					nna.fact = factor * DW.get(2);
+
+					vd.getProp<8>(a).add(nna);
+				}
+
+				++Np2;
+			}
+		}
+
+		++part;
+	}
+
+	vd.template ghost_put<add_,drho>();
+	vd.template ghost_put<add_,force>();
+	vd.template ghost_put<merge_,9>();
+	vd.template ghost_put<add_,10>();
+	vd.template ghost_put<add_,11>();
+
+	auto part3 = vd.getDomainIterator();
+
+	while (part3.isNext())
+	{
+		auto a = part3.get();
+
+		if (vd.getProp<force>(a)[0] != vd.getProp<10>(a)[0] ||
+			vd.getProp<force>(a)[1] != vd.getProp<10>(a)[1] ||
+			vd.getProp<force>(a)[2] != vd.getProp<10>(a)[2])
+		{
+			if (create_vcluster().getProcessUnitID() == 0)
+			{
+				std::cout << "LISTS " << vd.getProp<12>(a) << "    "  <<  vd.getProp<8>(a).size() << "    " << vd.getProp<9>(a).size() << std::endl;
+				std::cout << "ERROR: " << vd.getProp<force>(a)[2] << "   " << vd.getProp<10>(a)[2] << "   part: " << a.getKey() << std::endl;
+
+				// Print factors and sum
+
+
+				vd.getProp<8>(a).sort();
+				vd.getProp<9>(a).sort();
+
+				double sum1 = 0.0;
+				double sum2 = 0.0;
+
+				for (size_t i = 0 ; i  < vd.getProp<8>(a).size() ; i++)
+				{
+
+					std::cout << "FACT: " << vd.getProp<8>(a).get(i).fact << "  " << vd.getProp<9>(a).get(i).fact << "   " << vd.getProp<8>(a).get(i).id << "   " << vd.getProp<9>(a).get(i).id << std::endl;
+
+					sum1 += vd.getProp<8>(a).get(i).fact;
+					sum2 += vd.getProp<9>(a).get(i).fact;
+				}
+
+				std::cout << "sum: " << sum1 << " " << sum2 << std::endl;
+			}
+
+			break;
+		}
+
+		vd.getProp<8>(a).sort();
+		vd.getProp<9>(a).sort();
+
+		if (create_vcluster().getProcessUnitID() == 0)
+		{
+			if (vd.getProp<8>(a).size() != vd.getProp<9>(a).size())
+			{
+				std::cout << "ERROR: " <<  vd.getProp<12>(a) << "    " << vd.getProp<8>(a).size() << "   "  << vd.getProp<9>(a).size()  << std::endl;
+
+				for (size_t i = 0 ; i < vd.getProp<8>(a).size() ; i++)
+				{
+					std::cout << "NNNN:  " << vd.getProp<9>(a).get(i).id << "    " << vd.getProp<8>(a).get(i).id << std::endl;
+				}
+
+				break;
+			}
+		}
+
+//		double sum1 = 0.0;
+//		double sum2 = 0.0;
+
+/*		for (size_t i = 0 ; i  < vd.getProp<8>(a).size() ; i++)
+		{
+			if (vd.getProp<8>(a).get(i).id != vd.getProp<9>(a).get(i).id)
+			{
				std::cout << "ERROR: neighbour id mismatch" << std::endl;
+			}
+
+
+			sum1 += vd.getProp<8>(a).get(i).fact;
+			sum2 += vd.getProp<9>(a).get(i).fact;
+		}*/
+
+/*		if (sum1 != sum2)
+		{
			std::cout << "ERROR: factor sums differ" << std::endl;
+
+			break;
+		}*/
+
+		++part3;
+	}
+}
+
+
+void max_acceleration_and_velocity(particles & vd, double & max_acc, double & max_vel)
+{
+	// Calculate the maximum acceleration
+	auto part = vd.getDomainIterator();
+
+	while (part.isNext())
+	{
+		auto a = part.get();
+
+		Point<3,double> acc(vd.getProp<force>(a));
+		double acc2 = norm2(acc);
+
+		Point<3,double> vel(vd.getProp<velocity>(a));
+		double vel2 = norm2(vel);
+
+		if (vel2 >= max_vel)
+			max_vel = vel2;
+
+		if (acc2 >= max_acc)
+			max_acc = acc2;
+
+		++part;
+	}
+	max_acc = sqrt(max_acc);
+	max_vel = sqrt(max_vel);
+
+	Vcluster & v_cl = create_vcluster();
+	v_cl.max(max_acc);
+	v_cl.max(max_vel);
+	v_cl.execute();
+}
+
+
+double calc_deltaT(particles & vd, double ViscDtMax)
+{
+	double Maxacc = 0.0;
+	double Maxvel = 0.0;
+	max_acceleration_and_velocity(vd,Maxacc,Maxvel);
+
+	//-dt1 depends on force per unit mass.
+	const double dt_f = (Maxacc)?sqrt(H/Maxacc):std::numeric_limits<int>::max();
+
+	//-dt2 combines the Courant and the viscous time-step controls.
+	const double dt_cv = H/(std::max(cbar,Maxvel*10.) + H*ViscDtMax);
+
+	//-dt new value of time step.
+	double dt=double(CFLnumber)*std::min(dt_f,dt_cv);
+	if(dt<double(DtMin))
+		dt=double(DtMin);
+
+	return dt;
+}
+
+
+openfpm::vector<size_t> to_remove;
+
+
+void verlet_int(particles & vd, double dt, double & max_disp)
+{
+	// list of the particle to remove
+	to_remove.clear();
+
+	// particle iterator
+	auto part = vd.getDomainIterator();
+
+	double dt205 = dt*dt*0.5;
+	double dt2 = dt*2.0;
+
+	max_disp = 0;
+
+	// For each particle ...
+	while (part.isNext())
+	{
+		// ... a
+		auto a = part.get();
+
+		// if the particle is boundary
+		if (vd.template getProp<type>(a) == BOUNDARY)
+		{
+			// Update rho
+			double rhop = vd.template getProp<rho>(a);
+
+			// Update only the density
+	    	vd.template getProp<velocity>(a)[0] = 0.0;
+	    	vd.template getProp<velocity>(a)[1] = 0.0;
+	    	vd.template getProp<velocity>(a)[2] = 0.0;
+	    	vd.template getProp<rho>(a) = vd.template getProp<rho_prev>(a) + dt2*vd.template getProp<drho>(a);
+
+		    vd.template getProp<rho_prev>(a) = rhop;
+
+			++part;
+			continue;
+		}
+
+		//-Calculate displacement and update position
+		double dx = vd.template getProp<velocity>(a)[0]*dt + vd.template getProp<force>(a)[0]*dt205;
+	    double dy = vd.template getProp<velocity>(a)[1]*dt + vd.template getProp<force>(a)[1]*dt205;
+	    double dz = vd.template getProp<velocity>(a)[2]*dt + vd.template getProp<force>(a)[2]*dt205;
+
+	    vd.getPos(a)[0] += dx;
+	    vd.getPos(a)[1] += dy;
+	    vd.getPos(a)[2] += dz;
+
+	    double d2 = dx*dx + dy*dy + dz*dz;
+
+	    max_disp = (max_disp > d2)?max_disp:d2;
+
+	    double velX = vd.template getProp<velocity>(a)[0];
+	    double velY = vd.template getProp<velocity>(a)[1];
+	    double velZ = vd.template getProp<velocity>(a)[2];
+	    double rhop = vd.template getProp<rho>(a);
+
+    	vd.template getProp<velocity>(a)[0] = vd.template getProp<velocity_prev>(a)[0] + vd.template getProp<force>(a)[0]*dt2;
+    	vd.template getProp<velocity>(a)[1] = vd.template getProp<velocity_prev>(a)[1] + vd.template getProp<force>(a)[1]*dt2;
+    	vd.template getProp<velocity>(a)[2] = vd.template getProp<velocity_prev>(a)[2] + vd.template getProp<force>(a)[2]*dt2;
+    	vd.template getProp<rho>(a) = vd.template getProp<rho_prev>(a) + dt2*vd.template getProp<drho>(a);
+
+	    // Check if the particle goes out of range in space or in density
+	    if (vd.getPos(a)[0] <  0.000263878 || vd.getPos(a)[1] < 0.000263878 || vd.getPos(a)[2] < 0.000263878 ||
+	        vd.getPos(a)[0] >  0.000263878+1.59947 || vd.getPos(a)[1] > 0.000263878+0.672972 || vd.getPos(a)[2] > 0.000263878+0.903944 ||
+			vd.template getProp<rho>(a) < RhoMin || vd.template getProp<rho>(a) > RhoMax)
+	    {
+	                   to_remove.add(a.getKey());
+
+
+	                   // if we remove something the Verlet lists are no longer correct and must be reconstructed
+	                   max_disp = 100.0;
+	    }
+
+	    vd.template getProp<velocity_prev>(a)[0] = velX;
+	    vd.template getProp<velocity_prev>(a)[1] = velY;
+	    vd.template getProp<velocity_prev>(a)[2] = velZ;
+	    vd.template getProp<rho_prev>(a) = rhop;
+
+		++part;
+	}
+
+	Vcluster & v_cl = create_vcluster();
+	v_cl.max(max_disp);
+	v_cl.execute();
+
+	max_disp = sqrt(max_disp);
+
+	// remove the particles
+	vd.remove(to_remove,0);
+
+	// increment the iteration counter
+	cnt++;
+}
+
+void euler_int(particles & vd, double dt, double & max_disp)
+{
+	// list of the particle to remove
+	to_remove.clear();
+
+	// particle iterator
+	auto part = vd.getDomainIterator();
+
+	double dt205 = dt*dt*0.5;
+	double dt2 = dt*2.0;
+
+	max_disp = 0;
+
+	// For each particle ...
+	while (part.isNext())
+	{
+		// ... a
+		auto a = part.get();
+
+		// if the particle is boundary
+		if (vd.template getProp<type>(a) == BOUNDARY)
+		{
+			// Update rho
+			double rhop = vd.template getProp<rho>(a);
+
+			// Update only the density
+	    	vd.template getProp<velocity>(a)[0] = 0.0;
+	    	vd.template getProp<velocity>(a)[1] = 0.0;
+	    	vd.template getProp<velocity>(a)[2] = 0.0;
+	    	vd.template getProp<rho>(a) = vd.template getProp<rho>(a) + dt*vd.template getProp<drho>(a);
+
+		    vd.template getProp<rho_prev>(a) = rhop;
+
+			++part;
+			continue;
+		}
+
+		//-Calculate displacement and update position
+		double dx = vd.template getProp<velocity>(a)[0]*dt + vd.template getProp<force>(a)[0]*dt205;
+	    double dy = vd.template getProp<velocity>(a)[1]*dt + vd.template getProp<force>(a)[1]*dt205;
+	    double dz = vd.template getProp<velocity>(a)[2]*dt + vd.template getProp<force>(a)[2]*dt205;
+
+	    vd.getPos(a)[0] += dx;
+	    vd.getPos(a)[1] += dy;
+	    vd.getPos(a)[2] += dz;
+
+	    double d2 = dx*dx + dy*dy + dz*dz;
+
+	    max_disp = (max_disp > d2)?max_disp:d2;
+
+	    double velX = vd.template getProp<velocity>(a)[0];
+	    double velY = vd.template getProp<velocity>(a)[1];
+	    double velZ = vd.template getProp<velocity>(a)[2];
+	    double rhop = vd.template getProp<rho>(a);
+
+    	vd.template getProp<velocity>(a)[0] = vd.template getProp<velocity>(a)[0] + vd.template getProp<force>(a)[0]*dt;
+    	vd.template getProp<velocity>(a)[1] = vd.template getProp<velocity>(a)[1] + vd.template getProp<force>(a)[1]*dt;
+	   	vd.template getProp<velocity>(a)[2] = vd.template getProp<velocity>(a)[2] + vd.template getProp<force>(a)[2]*dt;
+	   	vd.template getProp<rho>(a) = vd.template getProp<rho>(a) + dt*vd.template getProp<drho>(a);
+
+	    // Check if the particle goes out of range in space or in density
+	    if (vd.getPos(a)[0] <  0.000263878 || vd.getPos(a)[1] < 0.000263878 || vd.getPos(a)[2] < 0.000263878 ||
+	        vd.getPos(a)[0] >  0.000263878+1.59947 || vd.getPos(a)[1] > 0.000263878+0.672972 || vd.getPos(a)[2] > 0.000263878+0.903944 ||
+			vd.template getProp<rho>(a) < RhoMin || vd.template getProp<rho>(a) > RhoMax)
+	    {
+	                   to_remove.add(a.getKey());
+
+	                   // if we remove something the Verlet lists are no longer correct and must be reconstructed
+	                   max_disp = 100.0;
+	    }
+
+	    vd.template getProp<velocity_prev>(a)[0] = velX;
+	    vd.template getProp<velocity_prev>(a)[1] = velY;
+	    vd.template getProp<velocity_prev>(a)[2] = velZ;
+	    vd.template getProp<rho_prev>(a) = rhop;
+
+		++part;
+	}
+
+	// remove the particles
+	vd.remove(to_remove,0);
+
+	Vcluster & v_cl = create_vcluster();
+	v_cl.max(max_disp);
+	v_cl.execute();
+
+	max_disp = sqrt(max_disp);
+
+	// increment the iteration counter
+	cnt++;
+}
+
+
+int main(int argc, char* argv[])
+{
+    // initialize the library
+	openfpm_init(&argc,&argv);
+
+	// Here we define our simulation domain, a 3D box
+	Box<3,double> domain({-0.05,-0.05,-0.05},{1.7010,0.7065,0.5025});
+	size_t sz[3] = {207,90,66};
+
+	// Fill W_dap
+	W_dap = 1.0/Wab(H/1.5);
+
+	// Here we define the boundary conditions of our problem
+    size_t bc[3]={NON_PERIODIC,NON_PERIODIC,NON_PERIODIC};
+
+    double skin = 0.25 * 2*H;
+    double r_gskin = 2*H + skin;
+
+	// extended boundary around the domain, and the processor domain
+    // by the support of the cubic kernel
+	Ghost<3,double> g(r_gskin);
+
+	particles vd(0,domain,bc,g,BIND_DEC_TO_GHOST);
+
+	// You can ignore all these dp/2.0; it is a trick to reproduce the same initialization
+	// of Dual-SPH, which uses a different criterion to draw particles
+	Box<3,double> fluid_box({dp/2.0,dp/2.0,dp/2.0},{0.4+dp/2.0,0.67-dp/2.0,0.3+dp/2.0});
+
+	// return an iterator to the fluid particles to add to vd
+	auto fluid_it = DrawParticles::DrawBox(vd,sz,domain,fluid_box);
+
+	// here we fill some of the constants needed by the simulation
+	max_fluid_height = fluid_it.getBoxMargins().getHigh(2);
+	h_swl = fluid_it.getBoxMargins().getHigh(2) - fluid_it.getBoxMargins().getLow(2);
+	B = (coeff_sound)*(coeff_sound)*gravity*h_swl*rho_zero / gamma_;
+	cbar = coeff_sound * sqrt(gravity * h_swl);
+
+	// for each particle inside the fluid box ...
+	while (fluid_it.isNext())
+	{
+		// ... add a particle ...
+		vd.add();
+
+		// ... and set its position ...
+		vd.getLastPos()[0] = fluid_it.get().get(0);
+		vd.getLastPos()[1] = fluid_it.get().get(1);
+		vd.getLastPos()[2] = fluid_it.get().get(2);
+
+		// and its type.
+		vd.template getLastProp<type>() = FLUID;
+
+		// We also initialize the density of the particle and the hydro-static pressure given by
+		//
+		// rho_zero*g*h = P
+		//
+		// rho_p = (P/B + 1)^(1/Gamma) * rho_zero
+		//
+
+		vd.template getLastProp<Pressure>() = rho_zero * gravity *  (max_fluid_height - fluid_it.get().get(2));
+
+		vd.template getLastProp<rho>() = pow(vd.template getLastProp<Pressure>() / B + 1, 1.0/gamma_) * rho_zero;
+		vd.template getLastProp<rho_prev>() = vd.template getLastProp<rho>();
+		vd.template getLastProp<velocity>()[0] = 0.0;
+		vd.template getLastProp<velocity>()[1] = 0.0;
+		vd.template getLastProp<velocity>()[2] = 0.0;
+
+		vd.template getLastProp<velocity_prev>()[0] = 0.0;
+		vd.template getLastProp<velocity_prev>()[1] = 0.0;
+		vd.template getLastProp<velocity_prev>()[2] = 0.0;
+
+		// next fluid particle
+		++fluid_it;
+	}
+
+
+	// Recipient
+	Box<3,double> recipient1({0.0,0.0,0.0},{1.6+dp/2.0,0.67+dp/2.0,0.4+dp/2.0});
+	Box<3,double> recipient2({dp,dp,dp},{1.6-dp/2.0,0.67-dp/2.0,0.4+dp/2.0});
+
+	Box<3,double> obstacle1({0.9,0.24-dp/2.0,0.0},{1.02+dp/2.0,0.36,0.45+dp/2.0});
+	Box<3,double> obstacle2({0.9+dp,0.24+dp/2.0,0.0},{1.02-dp/2.0,0.36-dp,0.45-dp/2.0});
+	Box<3,double> obstacle3({0.9+dp,0.24,0.0},{1.02,0.36,0.45});
+
+	openfpm::vector<Box<3,double>> holes;
+	holes.add(recipient2);
+	holes.add(obstacle1);
+	auto bound_box = DrawParticles::DrawSkin(vd,sz,domain,holes,recipient1);
+
+	while (bound_box.isNext())
+	{
+		vd.add();
+
+		vd.getLastPos()[0] = bound_box.get().get(0);
+		vd.getLastPos()[1] = bound_box.get().get(1);
+		vd.getLastPos()[2] = bound_box.get().get(2);
+
+		vd.template getLastProp<type>() = BOUNDARY;
+		vd.template getLastProp<rho>() = rho_zero;
+		vd.template getLastProp<rho_prev>() = rho_zero;
+		vd.template getLastProp<velocity>()[0] = 0.0;
+		vd.template getLastProp<velocity>()[1] = 0.0;
+		vd.template getLastProp<velocity>()[2] = 0.0;
+
+		vd.template getLastProp<velocity_prev>()[0] = 0.0;
+		vd.template getLastProp<velocity_prev>()[1] = 0.0;
+		vd.template getLastProp<velocity_prev>()[2] = 0.0;
+
+		++bound_box;
+	}
+
+	auto obstacle_box = DrawParticles::DrawSkin(vd,sz,domain,obstacle2,obstacle1);
+
+	while (obstacle_box.isNext())
+	{
+		vd.add();
+
+		vd.getLastPos()[0] = obstacle_box.get().get(0);
+		vd.getLastPos()[1] = obstacle_box.get().get(1);
+		vd.getLastPos()[2] = obstacle_box.get().get(2);
+
+		vd.template getLastProp<type>() = BOUNDARY;
+		vd.template getLastProp<rho>() = rho_zero;
+		vd.template getLastProp<rho_prev>() = rho_zero;
+		vd.template getLastProp<velocity>()[0] = 0.0;
+		vd.template getLastProp<velocity>()[1] = 0.0;
+		vd.template getLastProp<velocity>()[2] = 0.0;
+
+		vd.template getLastProp<velocity_prev>()[0] = 0.0;
+		vd.template getLastProp<velocity_prev>()[1] = 0.0;
+		vd.template getLastProp<velocity_prev>()[2] = 0.0;
+
+		++obstacle_box;
+	}
+
+	vd.map();
+
+	// Now that we have filled the vector with particles, balance the load and decompose
+	ModelCustom md;
+
+	vd.addComputationCosts(md);
+	vd.getDecomposition().decompose();
+	vd.map();
+
+	size_t loc = vd.size_local();
+	openfpm::vector<size_t> gt;
+
+	Vcluster & v_cl = create_vcluster();
+	v_cl.allGather(loc,gt);
+	v_cl.execute();
+	
+
+	auto debug_it = vd.getDomainIterator();
+
+
+	size_t tot = 0;
+
+
+	for (size_t i = 0 ; i < create_vcluster().getProcessUnitID() ; i++)
+		tot += gt.get(i);
+
+	while(debug_it.isNext())
+	{
+		auto a = debug_it.get();
+
+		vd.getProp<12>(a) = tot;
+		tot++;
+
+		++debug_it;
+	}
+
+	vd.ghost_get<type,rho,Pressure,velocity,12>();
+
+	auto NN = vd.getVerletCrs(r_gskin);
+
+	// Evolve
+
+	size_t write = 0;
+	size_t it = 0;
+	size_t it_reb = 0;
+	double t = 0.0;
+	double tot_disp = 0.0;
+	double max_disp;
+	while (t <= t_end)
+	{
+		Vcluster & v_cl = create_vcluster();
+		timer it_time;
+
+		it_reb++;
+//		if (2*tot_disp >= skin)
+//		{
+			vd.map();
+
+			vd.getDecomposition().write("DecBEFORE");
+
+			if (it_reb > 5)
+			{
+				ModelCustom md;
+				vd.addComputationCosts(md);
+				vd.getDecomposition().decompose();
+
+				vd.map();
+
+				it_reb = 0;
+
+				if (v_cl.getProcessUnitID() == 0)
+					std::cout << "REBALANCED " << std::endl;
+
+			}
+
+			vd.getDecomposition().write("DecAFTER");
+
+			// Calculate pressure from the density
+			EqState(vd);
+
+			vd.ghost_get<type,rho,Pressure,velocity,12>();
+
+			//vd.updateVerlet(NN,r_gskin,VL_CRS_SYMMETRIC);
+			NN = vd.getVerletCrs(r_gskin);
+
+			vd.write("Debug");
+
+			tot_disp = 0.0;
+
+			if (v_cl.getProcessUnitID() == 0)
+				std::cout << "RECONSTRUCT Verlet " << std::endl;
+//		}
+//		else
+//		{
+//			// Calculate pressure from the density
+//			EqState(vd);
+//
+//			vd.ghost_get<type,rho,Pressure,velocity>(SKIP_LABELLING);
+//		}
+
+		double max_visc = 0.0;
+
+		// Calc forces
+		calc_forces(vd,NN,max_visc,r_gskin);
+
+		// Get the maximum viscosity term across processors
+		v_cl.max(max_visc);
+		v_cl.execute();
+
+		// Calculate delta t integration
+		double dt = calc_deltaT(vd,max_visc);
+
+		// VerletStep or euler step
+		it++;
+		if (it < 40)
+			verlet_int(vd,dt,max_disp);
+		else
+		{
+			euler_int(vd,dt,max_disp);
+			it = 0;
+		}
+
+		tot_disp += max_disp;
+		t += dt;
+
+		if (write < t*100)
+		{
+//			vd.deleteGhost();
+			vd.write("Geometry",write);
+//			vd.ghost_get<type,rho,Pressure,velocity>(SKIP_LABELLING);
+			write++;
+
+			if (v_cl.getProcessUnitID() == 0)
+				std::cout << "TIME: " << t << "  write " << it_time.getwct() << "   " << v_cl.getProcessUnitID() << "  TOT disp: " << tot_disp << "    " << cnt << std::endl;
+		}
+		else
+		{
+			if (v_cl.getProcessUnitID() == 0)
+				std::cout << "TIME: " << t << "  " << it_time.getwct() << "   " << v_cl.getProcessUnitID() << "  TOT disp: " << tot_disp << "    " << cnt << std::endl;
+		}
+	}
+
+	openfpm_finalize();
+}
+ 
diff --git a/openfpm_data b/openfpm_data
index 0261ed4b..8d7bbbfa 160000
--- a/openfpm_data
+++ b/openfpm_data
@@ -1 +1 @@
-Subproject commit 0261ed4b804309864d899bedec46a67e5e6cfe95
+Subproject commit 8d7bbbfa94b4ef203f6e3e7af5d3bcdf3eea5707
diff --git a/script/install_EIGEN.sh b/script/install_EIGEN.sh
index d01089a0..0b67506e 100755
--- a/script/install_EIGEN.sh
+++ b/script/install_EIGEN.sh
@@ -17,16 +17,19 @@ if [ ! -d "$1/SUITESPARSE"  ]; then
   exit 1
 fi
 
-wget http://ppmcore.mpi-cbg.de/upload/eigen-3.3.1.tar.bz2
-rm -rf eigen-eigen-f562a193118d
-tar -xf eigen-3.3.1.tar.bz2
+rm -rf eigen-3.3.5.tar.bz2
+wget http://ppmcore.mpi-cbg.de/upload/eigen-3.3.5.tar.bz2
+rm -rf eigen-eigen-b3f3d4950030/
+tar -xf eigen-3.3.5.tar.bz2
 
-cd eigen-eigen-f562a193118d
+cd eigen-eigen-b3f3d4950030/
 mkdir $1/EIGEN/
 mv Eigen $1/EIGEN/Eigen
 
 cd ..
-rm -rf eigen-eigen-f562a193118d
+rm -rf eigen-eigen-b3f3d4950030/
+
+touch $1/EIGEN/signature_of_eigen3_matrix_library
 
 # Mark the installation
-echo 1 > $1/EIGEN/version
+echo 2 > $1/EIGEN/version
diff --git a/script/install_OPENBLAS.sh b/script/install_OPENBLAS.sh
index 6b6925f6..09129796 100755
--- a/script/install_OPENBLAS.sh
+++ b/script/install_OPENBLAS.sh
@@ -7,8 +7,8 @@ if [ -d "$1/OPENBLAS" ]; then
   exit 0
 fi
 
+rm -rf OpenBLAS-0.2.20.tar.gz
 wget http://ppmcore.mpi-cbg.de/upload/OpenBLAS-0.2.20.tar.gz
-rm -rf OpenBLAS-0.2.20
 tar -xf OpenBLAS-0.2.20.tar.gz
 cd OpenBLAS-0.2.20
 
@@ -26,7 +26,7 @@ make install PREFIX=$1/OPENBLAS
 if [ ! "$(ls -A $1/OPENBLAS)" ]; then
    rm -rf $1/OPENBLAS
 else
-   rm -rf OpenBLAS-0.2.19
+   rm -rf OpenBLAS-0.2.20
    echo 1 > $1/OPENBLAS/version
    exit 0
 fi
diff --git a/script/remove_old b/script/remove_old
index 459682d9..4e39fa1b 100755
--- a/script/remove_old
+++ b/script/remove_old
@@ -201,6 +201,7 @@ function remove_old()
             rm -rf $1/MPI/share
             rm -rf $1/MPI
             rm -rf $1/HDF5
+	    rm -rf $1/ZLIB
             rm -rf $1/PARMETIS
             rm -rf $1/PETSC
             rm -rf $1/TRILINOS
@@ -212,10 +213,10 @@ function remove_old()
 
     if [ -d $1/EIGEN ]; then
         version=$(cat $1/EIGEN/version)
-        if [ x"$version" != x"1"  ]; then
-            echo -e "\033[1;34;5m  ---------------------------------------------------------------------- \033[0m"
-            echo -e "\033[1;34;5m  EIGEN has been updated, the component will be updated automatically    \033[0m"
-            echo -e "\033[1;34;5m  ---------------------------------------------------------------------- \033[0m"
+        if [ x"$version" != x"2"  ]; then
+            echo -e "\033[1;34;5m  -------------------------------------------------------------------------------- \033[0m"
+            echo -e "\033[1;34;5m  EIGEN has been updated to 3.3.5 , the component will be updated automatically    \033[0m"
+            echo -e "\033[1;34;5m  -------------------------------------------------------------------------------- \033[0m"
             sleep 5
             rm -rf $1/EIGEN/Eigen
             rm -rf $1/EIGEN
diff --git a/src/Decomposition/CartDecomposition.hpp b/src/Decomposition/CartDecomposition.hpp
index c651d93a..e6706447 100755
--- a/src/Decomposition/CartDecomposition.hpp
+++ b/src/Decomposition/CartDecomposition.hpp
@@ -130,7 +130,7 @@ template<unsigned int dim> static void nsub_to_div(size_t (& div)[dim], size_t n
  */
 
 template<unsigned int dim, typename T, typename Memory, template <typename> class layout_base, typename Distribution>
-class CartDecomposition: public ie_loc_ghost<dim, T>, public nn_prcs<dim, T>, public ie_ghost<dim, T>, public domain_nn_calculator_cart<dim>
+class CartDecomposition: public ie_loc_ghost<dim, T>, public nn_prcs<dim, T>, public ie_ghost<dim,T,Memory,layout_base>, public domain_nn_calculator_cart<dim>
 {
 public:
 
@@ -458,10 +458,10 @@ public:
 			{div[i] = (size_t) ((bound.getHigh(i) - bound.getLow(i)) / cd.getCellBox().getP2()[i]);}
 
 			// Initialize the geo_cell structure
-			ie_ghost<dim,T>::Initialize_geo_cell(bound,div);
+			ie_ghost<dim,T,Memory,layout_base>::Initialize_geo_cell(bound,div);
 
 			// Initialize shift vectors
-			ie_ghost<dim,T>::generateShiftVectors(domain,bc);
+			ie_ghost<dim,T,Memory,layout_base>::generateShiftVectors(domain,bc);
 		}
 	}
 
@@ -643,8 +643,8 @@ public:
 		// Intersect all the local sub-domains with the sub-domains of the contiguous processors
 
 		// create the internal structures that store ghost information
-		ie_ghost<dim, T>::create_box_nn_processor_ext(v_cl, ghost, sub_domains, box_nn_processor, *this);
-		ie_ghost<dim, T>::create_box_nn_processor_int(v_cl, ghost, sub_domains, box_nn_processor, *this);
+		ie_ghost<dim, T,Memory,layout_base>::create_box_nn_processor_ext(v_cl, ghost, sub_domains, box_nn_processor, *this);
+		ie_ghost<dim, T,Memory,layout_base>::create_box_nn_processor_int(v_cl, ghost, sub_domains, box_nn_processor, *this);
 
 		ie_loc_ghost<dim,T>::create(sub_domains,domain,ghost,bc);
 	}
@@ -776,9 +776,9 @@ public:
 		 * \return processor id
 		 *
 		 */
-		inline static size_t id(p_box<dim, T> & p, size_t b_id)
+		template<typename encap_type> inline static size_t id(const encap_type & p, size_t b_id)
 		{
-			return p.proc;
+			return p.template get<proc_>();
 		}
 	};
 
@@ -796,9 +796,9 @@ public:
 		 * \return local processor id
 		 *
 		 */
-		inline static size_t id(p_box<dim, T> & p, size_t b_id)
+		template<typename encap_type> inline static size_t id(const encap_type & p, size_t b_id)
 		{
-			return p.lc_proc;
+			return p.template get<lc_proc_>();
 		}
 	};
 
@@ -816,9 +816,9 @@ public:
 		 * \return shift_id id
 		 *
 		 */
-		inline static size_t id(p_box<dim,T> & p, size_t b_id)
+		template<typename encap_type> inline static size_t id(const encap_type & p, size_t b_id)
 		{
-			return p.shift_id;
+			return p.template get<shift_id_>();
 		}
 	};
 
@@ -925,7 +925,7 @@ public:
 
 		(static_cast<ie_loc_ghost<dim,T>*>(&cart))->operator=(static_cast<ie_loc_ghost<dim,T>>(*this));
 		(static_cast<nn_prcs<dim,T>*>(&cart))->operator=(static_cast<nn_prcs<dim,T>>(*this));
-		(static_cast<ie_ghost<dim,T>*>(&cart))->operator=(static_cast<ie_ghost<dim,T>>(*this));
+		(static_cast<ie_ghost<dim,T,Memory,layout_base>*>(&cart))->operator=(static_cast<ie_ghost<dim,T,Memory,layout_base>>(*this));
 
 		cart.sub_domains = sub_domains;
 		cart.box_nn_processor = box_nn_processor;
@@ -961,7 +961,7 @@ public:
 	{
 		static_cast<ie_loc_ghost<dim,T>*>(this)->operator=(static_cast<ie_loc_ghost<dim,T>>(cart));
 		static_cast<nn_prcs<dim,T>*>(this)->operator=(static_cast<nn_prcs<dim,T>>(cart));
-		static_cast<ie_ghost<dim,T>*>(this)->operator=(static_cast<ie_ghost<dim,T>>(cart));
+		static_cast<ie_ghost<dim,T,Memory,layout_base>*>(this)->operator=(static_cast<ie_ghost<dim,T,Memory,layout_base>>(cart));
 
 		sub_domains = cart.sub_domains;
 		box_nn_processor = cart.box_nn_processor;
@@ -1001,7 +1001,7 @@ public:
 	{
 		static_cast<ie_loc_ghost<dim,T>*>(this)->operator=(static_cast<ie_loc_ghost<dim,T>>(cart));
 		static_cast<nn_prcs<dim,T>*>(this)->operator=(static_cast<nn_prcs<dim,T>>(cart));
-		static_cast<ie_ghost<dim,T>*>(this)->operator=(static_cast<ie_ghost<dim,T>>(cart));
+		static_cast<ie_ghost<dim,T,Memory,layout_base>*>(this)->operator=(static_cast<ie_ghost<dim,T,Memory,layout_base>>(cart));
 
 		sub_domains.swap(cart.sub_domains);
 		box_nn_processor.swap(cart.box_nn_processor);
@@ -1314,7 +1314,7 @@ public:
 		fine_s.clear();
 		loc_box.clear();
 		nn_prcs<dim, T>::reset();
-		ie_ghost<dim, T>::reset();
+		ie_ghost<dim,T,Memory,layout_base>::reset();
 		ie_loc_ghost<dim, T>::reset();
 	}
 
@@ -1550,11 +1550,6 @@ public:
 		return sub_domains;
 	}
 
-/*	openfpm::vector<openfpm::vector<SpaceBox<dim, T>>> & getSubDomainsGlobal()
-	{
-		return sub_domains_global;
-	}*/
-
 	/*! \brief Check if the particle is local
 	 *
 	 * \warning if the particle id outside the domain the result is unreliable
@@ -1803,7 +1798,7 @@ public:
 		vtk_box1.write(output + std::string("subdomains_") + std::to_string(v_cl.getProcessUnitID()) + std::string(".vtk"));
 
 		nn_prcs<dim, T>::write(output);
-		ie_ghost<dim, T>::write(output, v_cl.getProcessUnitID());
+		ie_ghost<dim,T,Memory,layout_base>::write(output, v_cl.getProcessUnitID());
 		ie_loc_ghost<dim, T>::write(output, v_cl.getProcessUnitID());
 
 		return true;
@@ -1856,9 +1851,9 @@ public:
 
 		for (size_t p = 0; p<nn_prcs < dim, T>::getNNProcessors(); p++)
 		{
-			for (size_t i = 0; i<ie_ghost < dim, T>::getProcessorNEGhost(p); i++)
+			for (size_t i = 0; i<ie_ghost <dim,T,Memory,layout_base>::getProcessorNEGhost(p); i++)
 			{
-				std::cout << ie_ghost<dim, T>::getProcessorEGhostBox(p, i).toString() << "   prc=" << nn_prcs<dim, T>::IDtoProc(p) << "   id=" << ie_ghost<dim, T>::getProcessorEGhostId(p, i) << "\n";
+				std::cout << ie_ghost<dim,T,Memory,layout_base>::getProcessorEGhostBox(p, i).toString() << "   prc=" << nn_prcs<dim, T>::IDtoProc(p) << "   id=" << ie_ghost<dim,T,Memory,layout_base>::getProcessorEGhostId(p, i) << "\n";
 			}
 		}
 
@@ -1866,9 +1861,9 @@ public:
 
 		for (size_t p = 0; p<nn_prcs < dim, T>::getNNProcessors(); p++)
 		{
-			for (size_t i = 0; i<ie_ghost < dim, T>::getProcessorNIGhost(p); i++)
+			for (size_t i = 0; i<ie_ghost<dim,T,Memory,layout_base>::getProcessorNIGhost(p); i++)
 			{
-				std::cout << ie_ghost<dim, T>::getProcessorIGhostBox(p, i).toString() << "   prc=" << nn_prcs<dim, T>::IDtoProc(p) << "   id=" << ie_ghost<dim, T>::getProcessorIGhostId(p, i) << "\n";
+				std::cout << ie_ghost<dim,T,Memory,layout_base>::getProcessorIGhostBox(p, i).toString() << "   prc=" << nn_prcs<dim, T>::IDtoProc(p) << "   id=" << ie_ghost<dim,T,Memory,layout_base>::getProcessorIGhostId(p, i) << "\n";
 			}
 		}
 	}
@@ -1888,7 +1883,7 @@ public:
 		if (static_cast<nn_prcs<dim,T>*>(this)->is_equal(static_cast<nn_prcs<dim,T>&>(cart)) == false)
 			return false;
 
-		if (static_cast<ie_ghost<dim,T>*>(this)->is_equal(static_cast<ie_ghost<dim,T>&>(cart)) == false)
+		if (static_cast<ie_ghost<dim,T,Memory,layout_base>*>(this)->is_equal(static_cast<ie_ghost<dim,T,Memory,layout_base>&>(cart)) == false)
 			return false;
 
 		if (sub_domains != cart.sub_domains)
@@ -1934,7 +1929,7 @@ public:
 		if (static_cast<nn_prcs<dim,T>*>(this)->is_equal(static_cast<nn_prcs<dim,T>&>(cart)) == false)
 			return false;
 
-		if (static_cast<ie_ghost<dim,T>*>(this)->is_equal_ng(static_cast<ie_ghost<dim,T>&>(cart)) == false)
+		if (static_cast<ie_ghost<dim,T,Memory,layout_base>*>(this)->is_equal_ng(static_cast<ie_ghost<dim,T,Memory,layout_base>&>(cart)) == false)
 			return false;
 
 		if (sub_domains != cart.sub_domains)
@@ -2024,6 +2019,7 @@ public:
 		for (int i = 0 ; i < dim ; i++)	{bc_[i] = this->periodicity(i);}
 
 		CartDecomposition_gpu<dim,T,Memory,layout_base> cdg(fine_s.toKernel(),
+															ie_ghost<dim,T,Memory,layout_base>::toKernel(),
 												sub_domains_global.toKernel(),
 												getDomain(),
 												bc_);
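Note: with the change above, toKernel() also packs the ie_ghost kernel view into CartDecomposition_gpu, so one device-side object answers both sub-domain and ghost queries. A minimal host-side sketch of how it is consumed (my_kernel is a hypothetical kernel; the other calls are the ones already used in this patch):

    // dec is a CartDecomposition<3,float,CudaMemory,memory_traits_inte>, already decomposed;
    // vg is an openfpm::vector_gpu<Point<3,float>> filled on the host and copied with vg.hostToDevice<0>()
    auto ite = vg.getGPUIterator();

    // dec.toKernel() now returns a CartDecomposition_gpu that also carries the ie_ghost_gpu view
    my_kernel<<<ite.wthr,ite.thr>>>(dec.toKernel(),vg.toKernel());
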
diff --git a/src/Decomposition/cuda/CartDecomposition_gpu.cuh b/src/Decomposition/cuda/CartDecomposition_gpu.cuh
index a6592116..2b4505c7 100644
--- a/src/Decomposition/cuda/CartDecomposition_gpu.cuh
+++ b/src/Decomposition/cuda/CartDecomposition_gpu.cuh
@@ -8,24 +8,7 @@
 #ifndef CARTDECOMPOSITION_GPU_HPP_
 #define CARTDECOMPOSITION_GPU_HPP_
 
-#ifdef __NVCC__
-
-template<typename cartdec_gpu, typename particles_type, typename vector_out>
-__global__ void process_id_proc_each_part(cartdec_gpu cdg, particles_type parts, vector_out output , int rank)
-{
-    int p = threadIdx.x + blockIdx.x * blockDim.x;
-
-    if (p >= parts.size()) return;
-
-	Point<3,float> xp = parts.template get<0>(p);
-
-	int pr = cdg.processorIDBC(xp);
-
-	output.template get<1>(p) = (pr == rank)?-1:pr;
-	output.template get<0>(p) = p;
-}
-
-#endif
+#include "ie_ghost_gpu.cuh"
 
 template<typename T2, typename fine_s_type, typename vsub_domain_type>
 __device__ __host__ inline int processorID_impl(T2 & p, fine_s_type & fine_s, vsub_domain_type & sub_domains_global)
@@ -62,7 +45,7 @@ __device__ __host__ inline int processorID_impl(T2 & p, fine_s_type & fine_s, vs
 }
 
 template<unsigned int dim, typename T, typename Memory, template <typename> class layout_base>
-class CartDecomposition_gpu
+class CartDecomposition_gpu : public ie_ghost_gpu<dim,T,Memory,layout_base>
 {
 	CellList_cpu_ker<dim,T,Mem_fast_ker<Memory,memory_traits_lin,int>,shift<dim,T>> clk;
 
@@ -93,17 +76,18 @@ class CartDecomposition_gpu
 public:
 
 	CartDecomposition_gpu(CellList_cpu_ker<dim,T,Mem_fast_ker<Memory,memory_traits_lin,int>,shift<dim,T>> clk,
+						  ie_ghost_gpu<dim,T,Memory,layout_base> ieg,
 						  openfpm::vector_gpu_ker<Box_map<dim, T>,layout_base> sub_domains_global,
 						  const Box<dim,T> & domain,
 						  const int (& bc)[dim])
-	:clk(clk),domain(domain),sub_domains_global(sub_domains_global)
+	:ie_ghost_gpu<dim,T,Memory,layout_base>(ieg),clk(clk),domain(domain),sub_domains_global(sub_domains_global)
 	{
 		for (int s = 0 ; s < dim ; s++)
 		{this->bc[s] = bc[s];}
 	}
 
 	CartDecomposition_gpu(const CartDecomposition_gpu<dim,T,Memory,layout_base> & dec)
-	:clk(dec.clk),domain(dec.domain)
+	:ie_ghost_gpu<dim,T,Memory,layout_base>(dec),clk(dec.clk),domain(dec.domain)
 	{
 		for (int s = 0 ; s < dim ; s++)
 		{this->bc[s] = dec.bc[s];}
@@ -125,7 +109,6 @@ public:
 
 		return processorID_impl(pt,clk,sub_domains_global);
 	}
-
 };
 
 #endif /* CARTDECOMPOSITION_GPU_HPP_ */
diff --git a/src/Decomposition/cuda/decomposition_cuda_tests.cu b/src/Decomposition/cuda/decomposition_cuda_tests.cu
index b8322092..84ac2397 100644
--- a/src/Decomposition/cuda/decomposition_cuda_tests.cu
+++ b/src/Decomposition/cuda/decomposition_cuda_tests.cu
@@ -9,70 +9,4 @@
 BOOST_AUTO_TEST_SUITE( decomposition_to_gpu_test )
 
 
-BOOST_AUTO_TEST_CASE( decomposition_to_gpu_test_use )
-{
-	auto & v_cl = create_vcluster();
-
-	// Vcluster
-	Vcluster<> & vcl = create_vcluster();
-
-	CartDecomposition<3, float, CudaMemory, memory_traits_inte> dec(vcl);
-
-	// Physical domain
-	Box<3, float> box( { 0.0, 0.0, 0.0 }, { 1.0, 1.0, 1.0 });
-	size_t div[3];
-
-	// Get the number of processor and calculate the number of sub-domain
-	// for each processor (SUB_UNIT_FACTOR=64)
-	size_t n_proc = vcl.getProcessingUnits();
-	size_t n_sub = n_proc * SUB_UNIT_FACTOR;
-
-	// Set the number of sub-domains on each dimension (in a scalable way)
-	for (int i = 0; i < 3; i++)
-	{	div[i] = openfpm::math::round_big_2(pow(n_sub,1.0/3));}
-
-	// Define ghost
-	Ghost<3, float> g(0.01);
-
-	// Boundary conditions
-	size_t bc[] = { PERIODIC, PERIODIC, PERIODIC };
-
-	// Decompose
-	dec.setParameters(div,box,bc,g);
-	dec.decompose();
-
-	openfpm::vector_gpu<Point<3,float>> vg;
-	vg.resize(10000);
-
-	for (size_t i = 0 ; i < 10000 ; i++)
-	{
-		vg.template get<0>(i)[0] = (float)rand()/RAND_MAX;
-		vg.template get<0>(i)[1] = (float)rand()/RAND_MAX;
-		vg.template get<0>(i)[2] = (float)rand()/RAND_MAX;
-	}
-
-	vg.hostToDevice<0>();
-
-	// process on GPU the processor ID for each particles
-
-	auto ite = vg.getGPUIterator();
-
-	openfpm::vector_gpu<aggregate<int,int>> proc_id_out;
-	proc_id_out.resize(vg.size());
-
-	process_id_proc_each_part<decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(proc_id_out.toKernel())>
-	<<<ite.wthr,ite.thr>>>
-	(dec.toKernel(),vg.toKernel(),proc_id_out.toKernel(),v_cl.rank());
-
-	proc_id_out.deviceToHost<0>();
-
-	bool match = true;
-	for (size_t i = 0 ; i < proc_id_out.size() ; i++)
-	{
-		Point<3,float> xp = vg.template get<0>(i);
-
-		match &= proc_id_out.template get<0>(i) == dec.processorIDBC(xp);
-	}
-}
-
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/Decomposition/cuda/ie_ghost_gpu.cuh b/src/Decomposition/cuda/ie_ghost_gpu.cuh
new file mode 100644
index 00000000..e0a6f8fb
--- /dev/null
+++ b/src/Decomposition/cuda/ie_ghost_gpu.cuh
@@ -0,0 +1,126 @@
+/*
+ * ie_ghost_gpu.cuh
+ *
+ *  Created on: Aug 24, 2018
+ *      Author: i-bird
+ */
+
+#ifndef IE_GHOST_GPU_CUH_
+#define IE_GHOST_GPU_CUH_
+
+#include "data_type/aggregate.hpp"
+
+constexpr unsigned int lc_proc_ = 0;
+constexpr unsigned int proc_ = 1;
+constexpr unsigned int shift_id_ = 2;
+
+template<unsigned int dim, typename T, typename cell_list_type, typename vb_int_box_type, typename vb_int_type>
+__device__ __host__ inline unsigned int ghost_processorID_N_impl(const Point<dim,T> & p, cell_list_type & geo_cell, vb_int_box_type & vb_int_box, vb_int_type & vb_int)
+{
+	unsigned int cell = geo_cell.getCell(p);
+	unsigned int sz = geo_cell.getNelements(cell);
+
+	unsigned int n = 0;
+
+	for (int i = 0 ; i < sz ; i++)
+	{
+		unsigned int bid = geo_cell.get(cell,i);
+
+		if (Box<dim,T>(vb_int_box.get(bid)).isInsideNP(p) == true)
+		{n++;}
+	}
+
+	return n;
+}
+
+/*! \brief structure that stores and computes the internal and external local ghost boxes. Version usable in a kernel
+ *
+ * \tparam dim is the dimensionality of the physical domain we are going to decompose.
+ * \tparam T type of the space we decompose, Real, Integer, Complex ...
+ *
+ * \see CartDecomposition
+ *
+ */
+template<unsigned int dim, typename T, typename Memory, template<typename> class layout_base>
+class ie_ghost_gpu
+{
+
+	//! Cell-list that stores the geometrical information of the internal ghost boxes
+	CellList_cpu_ker<dim,T,Mem_fast_ker<Memory,memory_traits_lin,int>,shift<dim,T>> geo_cell;
+
+	//! internal ghost box
+	openfpm::vector_gpu_ker<Box<dim, T>,layout_base> vb_int_box;
+
+	//! local processor id, processor id and shift id of each internal ghost box
+	openfpm::vector_gpu_ker<aggregate<unsigned int,unsigned int,unsigned int>,layout_base> vb_int;
+
+	// maximum rank
+	unsigned int max_rank;
+
+public:
+
+
+	ie_ghost_gpu(CellList_cpu_ker<dim,T,Mem_fast_ker<Memory,memory_traits_lin,int>,shift<dim,T>> geo_cell,
+			     openfpm::vector_gpu_ker<Box<dim, T>,layout_base> vb_int_box,
+			     openfpm::vector_gpu_ker<aggregate<unsigned int,unsigned int,unsigned int>,layout_base> vb_int,
+			     unsigned int max_rank)
+	:geo_cell(geo_cell),vb_int_box(vb_int_box),vb_int(vb_int),max_rank(max_rank)
+	{
+
+	}
+
+	ie_ghost_gpu(const ie_ghost_gpu<dim,T,Memory,layout_base> & ieg)
+	:geo_cell(ieg.geo_cell),vb_int_box(ieg.vb_int_box),vb_int(ieg.vb_int),max_rank(ieg.max_rank)
+	{}
+
+	/*! \brief Get the cell from the particle position
+	 *
+	 * \param p position of the particle
+	 *
+	 */
+	__device__ inline unsigned int ghost_processorID_cell(const Point<dim,T> & p)
+	{
+		return geo_cell.getCell(p);
+	}
+
+	/*! \brief Get the number of processors to which a particle must be sent
+	 *
+	 * \param p position of the particle
+	 *
+	 * \return the number of processors
+	 *
+	 */
+	__device__ inline unsigned int ghost_processorID_N(const Point<dim,T> & p)
+	{
+		return ghost_processorID_N_impl(p,geo_cell,vb_int_box,vb_int);
+	}
+
+	/*! \brief For each internal ghost box containing p, write the target processor id and the particle id in the output
+	 *
+	 * \param p position of the particle
+	 * \param output output buffer of (processor id, particle id) pairs
+	 * \param base offset in the output buffer where this particle starts writing
+	 * \param pi particle id
+	 *
+	 */
+	template<typename output_type> __device__ inline void ghost_processor_ID(const Point<dim,T> & p, output_type & output, unsigned int base, unsigned int pi)
+	{
+		unsigned int cell = geo_cell.getCell(p);
+		unsigned int sz = geo_cell.getNelements(cell);
+
+		unsigned int n = 0;
+
+		for (int i = 0 ; i < sz ; i++)
+		{
+			unsigned int bid = geo_cell.get(cell,i);
+
+			if (Box<dim,T>(vb_int_box.get(bid)).isInsideNP(p) == true)
+			{
+				output.template get<0>(base+n) = vb_int.template get<proc_>(bid);
+				output.template get<1>(base+n) = pi;
+
+				n++;
+			}
+		}
+	}
+
+};
+
+
+
+#endif /* IE_GHOST_GPU_CUH_ */
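The class above is meant to be used with a two-pass pattern: ghost_processorID_N() counts, for every particle, how many internal ghost boxes contain it, and ghost_processor_ID() then writes the (processor id, particle id) pairs at an offset obtained from a prefix sum of those counts. A minimal counting kernel, as a sketch only (the patch provides the equivalent num_proc_ghost_each_part and proc_label_id_ghost kernels in vector_dist_cuda_funcs.cuh):

    // one thread per particle; ieg comes from ie_ghost<...>::toKernel() on the host
    template<unsigned int dim, typename T, typename ie_type, typename pos_type, typename out_type>
    __global__ void count_ghost_labels(ie_type ieg, pos_type pos, out_type out)
    {
        int p = threadIdx.x + blockIdx.x * blockDim.x;
        if (p >= pos.size()) return;

        Point<dim,T> xp = pos.template get<0>(p);

        // number of internal ghost boxes (and so of target processors) that contain xp
        out.template get<0>(p) = ieg.ghost_processorID_N(xp);
    }
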
diff --git a/src/Decomposition/ie_ghost.hpp b/src/Decomposition/ie_ghost.hpp
index ed5a3341..b122ee4b 100755
--- a/src/Decomposition/ie_ghost.hpp
+++ b/src/Decomposition/ie_ghost.hpp
@@ -11,6 +11,7 @@
 #include "common.hpp"
 #include "nn_processor.hpp"
 #include "Decomposition/shift_vect_converter.hpp"
+#include "Decomposition/cuda/ie_ghost_gpu.cuh"
 
 
 /*! \brief structure that store and compute the internal and external local ghost box
@@ -21,7 +22,7 @@
  * \see CartDecomposition
  *
  */
-template<unsigned int dim, typename T>
+template<unsigned int dim, typename T, typename Memory, template<typename> class layout_base >
 class ie_ghost
 {
 	//! for each sub-domain (first vector), contain the list (nested vector) of the neighborhood processors
@@ -37,10 +38,21 @@ class ie_ghost
 	openfpm::vector<p_box<dim,T> > vb_ext;
 
 	//! Internal ghost boxes for this processor domain
-	openfpm::vector<p_box<dim,T> > vb_int;
+	openfpm::vector<aggregate<unsigned int,unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int,unsigned int>>::type,layout_base> vb_int;
+
+	//! Internal ghost boxes for this processor domain
+	openfpm::vector<Box<dim,T>,Memory,typename layout_base<Box<dim,T>>::type,layout_base> vb_int_box;
 
 	//! Cell-list that store the geometrical information of the internal ghost boxes
-	CellList<dim,T,Mem_fast<>,shift<dim,T>> geo_cell;
+	CellList<dim,T,Mem_fast<Memory,int>,shift<dim,T>> geo_cell;
+
+	//! Cell-list that stores the geometrical information of the internal ghost boxes (on a processor-based level)
+	CellList<dim,T,Mem_fast<Memory,int>,shift<dim,T>> geo_cell_proc;
+
+	typedef openfpm::vector<Box<dim,T>,Memory,typename layout_base<Box<dim,T>>::type,layout_base> proc_boxes;
+
+	//! internal ghost Boxes for each processor
+	openfpm::vector<aggregate<proc_boxes>,Memory,typename layout_base<aggregate<proc_boxes>>::type,layout_base> vb_int_proc;
 
 	//! shift vectors
 	openfpm::vector<Point<dim,T>> shifts;
@@ -54,6 +66,9 @@ class ie_ghost
 	//! shift converter
 	shift_vect_converter<dim,T> sc_convert;
 
+	//! host to device transfer
+	bool host_dev_transfer = false;
+
 	/*! \brief Given a local sub-domain i, it give the id of such sub-domain in the sent list
 	 *         for the processor p_id
 	 *
@@ -175,6 +190,9 @@ protected:
 	{
 		// Initialize the geo_cell structure
 		geo_cell.Initialize(domain,div,0);
+
+		// Initialize the geo_cell structure
+		geo_cell_proc.Initialize(domain,div,0);
 	}
 
 	/*! \brief Create the box_nn_processor_int (bx part)  structure
@@ -275,8 +293,33 @@ protected:
 				}
 			}
 		}
+
+//		construct_vb_int_proc();
 	}
 
+	/*! \brief construct the vb_int_proc box
+	 *
+	 *
+	 */
+/*	void construct_vb_int_proc()
+	{
+		vb_int_proc.resize(proc_int_box.size());
+
+		for (size_t i = 0 ; i < proc_int_box.size() ; i++)
+		{
+			vb_int_proc.template get<0>(i).resize(proc_int_box.get(i).ibx.size());
+
+			for (size_t j = 0 ; j < proc_int_box.get(i).ibx.size() ; j++)
+			{
+				for (size_t k = 0 ; k < dim ; k++)
+				{
+					vb_int_proc.template get<0>(i).template get<0>(j)[k] = proc_int_box.get(i).ibx.get(j).bx.getLow(k);
+					vb_int_proc.template get<0>(i).template get<1>(j)[k] = proc_int_box.get(i).ibx.get(j).bx.getHigh(k);
+				}
+			}
+		}
+	}*/
+
 	/*! \brief Create the box_nn_processor_int (nbx part) structure, the geo_cell list and proc_int_box
 	 *
 	 * This structure store for each sub-domain of this processors the boxes that come from the intersection
@@ -336,20 +379,24 @@ protected:
 					n_sub.enlarge(ghost);
 
 					// Intersect with the local sub-domain
-					p_box<dim,T> b_int;
-					bool intersect = n_sub.Intersect(l_sub,b_int.box);
+					Box<dim,T> b_int;
+					bool intersect = n_sub.Intersect(l_sub,b_int);
 
 					// store if it intersect
 					if (intersect == true)
 					{
+						vb_int.add();
+
+						size_t last = vb_int.size() - 1;
+
 						// the box fill with the processor id
-						b_int.proc = p_id;
+						vb_int.template get<proc_>(last) = p_id;
 
 						// fill the local processor id
-						b_int.lc_proc = lc_proc;
+						vb_int.template get<lc_proc_>(last) = lc_proc;
 
 						// fill the shift id
-						b_int.shift_id = convertShift(nn_p_box_pos.get(k));
+						vb_int.template get<shift_id_>(last) = convertShift(nn_p_box_pos.get(k));
 
 						//
 						// Updating
@@ -364,12 +411,12 @@ protected:
 
 						// add the box to the near processor sub-domain intersections
 						openfpm::vector< ::Box<dim,T> > & p_box_int = box_nn_processor_int.get(i).get(j).nbx;
-						p_box_int.add(b_int.box);
-						vb_int.add(b_int);
+						p_box_int.add(b_int);
+						vb_int_box.add(b_int);
 
 						// store the box in proc_int_box storing from which sub-domain they come from
 						Box_sub<dim,T> sb;
-						sb.bx = b_int.box;
+						sb.bx = b_int;
 						sb.sub = i;
 						sb.r_sub = r_sub.get(k);
 						sb.cmb = nn_p_box_pos.get(k);
@@ -388,8 +435,8 @@ protected:
 						// update the geo_cell list
 
 						// get the cells this box span
-						const grid_key_dx<dim> p1 = geo_cell.getCellGrid(b_int.box.getP1());
-						const grid_key_dx<dim> p2 = geo_cell.getCellGrid(b_int.box.getP2());
+						const grid_key_dx<dim> p1 = geo_cell.getCellGrid(b_int.getP1());
+						const grid_key_dx<dim> p2 = geo_cell.getCellGrid(b_int.getP2());
 
 						// Get the grid and the sub-iterator
 						auto & gi = geo_cell.getGrid();
@@ -400,6 +447,7 @@ protected:
 						{
 							auto key = g_sub.get();
 							geo_cell.addCell(gi.LinId(key),vb_int.size()-1);
+							geo_cell_proc.addCell(gi.LinId(key),p_id);
 							++g_sub;
 						}
 					}
@@ -415,19 +463,19 @@ public:
 	ie_ghost() {};
 
 	//! Copy constructor
-	ie_ghost(const ie_ghost<dim,T> & ie)
+	ie_ghost(const ie_ghost<dim,T,Memory,layout_base> & ie)
 	{
-		this->operator =(ie);
+		this->operator=(ie);
 	}
 
 	//! Copy constructor
-	ie_ghost(ie_ghost<dim,T> && ie)
+	ie_ghost(ie_ghost<dim,T,Memory,layout_base> && ie)
 	{
 		this->operator=(ie);
 	}
 
 	//! Copy operator
-	inline ie_ghost<dim,T> & operator=(ie_ghost<dim,T> && ie)
+	inline ie_ghost<dim,T,Memory,layout_base> & operator=(ie_ghost<dim,T,Memory,layout_base> && ie)
 	{
 		box_nn_processor_int.swap(ie.box_nn_processor_int);
 		proc_int_box.swap(ie.proc_int_box);
@@ -442,7 +490,7 @@ public:
 	}
 
 	//! Copy operator
-	inline ie_ghost<dim,T> & operator=(const ie_ghost<dim,T> & ie)
+	inline ie_ghost<dim,T,Memory,layout_base> & operator=(const ie_ghost<dim,T,Memory,layout_base> & ie)
 	{
 		box_nn_processor_int = ie.box_nn_processor_int;
 		proc_int_box = ie.proc_int_box;
@@ -682,9 +730,9 @@ public:
 	 * \return the internal ghost box
 	 *
 	 */
-	inline const ::Box<dim,T> & getIGhostBox(size_t b_id) const
+	inline ::Box<dim,T> getIGhostBox(size_t b_id) const
 	{
-		return vb_int.get(b_id).box;
+		return vb_int_box.get(b_id);
 	}
 
 	/*! \brief Given the internal ghost box id, it return the near processor at witch belong
@@ -697,7 +745,7 @@ public:
 	 */
 	inline size_t getIGhostBoxProcessor(size_t b_id) const
 	{
-		return vb_int.get(b_id).proc;
+		return vb_int.template get<proc_>(b_id);
 	}
 
 	/*! \brief Get the number of the calculated external ghost boxes
@@ -759,6 +807,16 @@ public:
 		return geo_cell.getCellIterator(geo_cell.getCell(p));
 	}
 
+	/*! \brief Get the number of processors to which a particle must be sent
+	 *
+	 * \param p position of the particle
+	 *
+	 * \return the number of processors
+	 *
+	 */
+	inline unsigned int ghost_processorID_N(const Point<dim,T> & p)
+	{
+		return ghost_processorID_N_impl(p,geo_cell,vb_int_box,vb_int);
+	}
+
 	/*! \brief Given a position it return if the position belong to any neighborhood processor ghost
 	 * (Internal ghost)
 	 *
@@ -790,7 +848,7 @@ public:
 		{
 			size_t bid = cell_it.get();
 
-			if (vb_int.get(bid).box.isInsideNP(p) == true)
+			if (Box<dim,T>(vb_int_box.get(bid)).isInsideNP(p) == true)
 			{
 				ids_p.add(std::pair<size_t,size_t>(id1::id(vb_int.get(bid),bid),id2::id(vb_int.get(bid),bid)));
 			}
@@ -839,7 +897,7 @@ public:
 		{
 			size_t bid = cell_it.get();
 
-			if (vb_int.get(bid).box.isInsideNP(p) == true)
+			if (Box<dim,T>(vb_int_box.get(bid)).isInsideNP(p) == true)
 			{
 				ids.add(id::id(vb_int.get(bid),bid));
 			}
@@ -883,7 +941,7 @@ public:
 		{
 			size_t bid = cell_it.get();
 
-			if (vb_int.get(bid).box.isInsideNP(p) == true)
+			if (Box<dim,T>(vb_int_box.get(bid)).isInsideNP(p) == true)
 			{
 				ids_p.add(std::pair<size_t,size_t>(id1::id(vb_int.get(bid),bid),id2::id(vb_int.get(bid),bid)));
 			}
@@ -990,7 +1048,7 @@ public:
 	 * \return true if they are equal
 	 *
 	 */
-	bool is_equal(ie_ghost<dim,T> & ig)
+	bool is_equal(ie_ghost<dim,T,Memory,layout_base> & ig)
 	{
 		if (getNEGhostBox() != ig.getNEGhostBox())
 			return false;
@@ -1055,7 +1113,7 @@ public:
 	 * \return true if they are equal
 	 *
 	 */
-	bool is_equal_ng(ie_ghost<dim,T> & ig)
+	bool is_equal_ng(ie_ghost<dim,T,Memory,layout_base> & ig)
 	{
 		return true;
 	}
@@ -1074,6 +1132,27 @@ public:
 		ids_p.clear();
 		ids.clear();
 	}
+
+	/*! \brief Convert this data-structure into a kernel-usable data-structure
+	 *
+	 * \return an ie_ghost_gpu object usable from device code
+	 *
+	 */
+	ie_ghost_gpu<dim,T,Memory,layout_base> toKernel()
+	{
+		if (host_dev_transfer == false)
+		{
+			geo_cell.hostToDevice();
+			vb_int_box.template hostToDevice<0,1>();
+			vb_int.template hostToDevice<0,1,2>();
+
+			host_dev_transfer = true;
+		}
+
+		ie_ghost_gpu<dim,T,Memory,layout_base> igg(geo_cell.toKernel(),vb_int_box.toKernel(),vb_int.toKernel(),create_vcluster().size());
+
+		return igg;
+	}
 };
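A note on the new toKernel(): the host_dev_transfer flag means only the first call pays for the host-to-device copies of geo_cell, vb_int_box and vb_int; as the patch stands, later host-side modifications are not re-synchronized automatically. A host-side usage sketch, with hypothetical variable names:

    // ie is an ie_ghost<3,float,CudaMemory,memory_traits_inte> already filled by the decomposition
    auto ieg  = ie.toKernel();   // first call: copies the internal structures to the device
    auto ieg2 = ie.toKernel();   // subsequent calls only rebuild the lightweight kernel view
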
 
 
diff --git a/src/Makefile.am b/src/Makefile.am
index 70e6b590..01822b3f 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -17,7 +17,7 @@ pdata_LDADD = $(LINKLIBS) -lparmetis -lmetis
 
 actual_test_SOURCES = Vector/cuda/vector_dist_cuda_func_test.cu Vector/cuda/vector_dist_gpu_unit_tests.cu vector_ main_single.cpp  lib/pdata.cpp test_multiple_o.cpp ../openfpm_devices/src/memory/CudaMemory.cu  ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/Memleak_check.cpp
 actual_test_CXXFLAGS = -Wno-unknown-pragmas $(BOOST_CPPFLAGS) $(HDF5_CPPFLAGS) $(OPENMP_CFLAGS) $(AM_CXXFLAGS) $(LIBHILBERT_INCLUDE) $(PETSC_INCLUDE) $(CUDA_CFLAGS) $(INCLUDES_PATH) $(PARMETIS_INCLUDE) $(METIS_INCLUDE) $(H5PART_INCLUDE) -DPARALLEL_IO  -Wno-unused-local-typedefs
-actual_test_CFLAGS = $(CUDA_CFLAGS) -IDIOCANE
+actual_test_CFLAGS = $(CUDA_CFLAGS)
 actual_test_LDADD = $(LINKLIBS) -lparmetis -lmetis
 
 
diff --git a/src/Vector/cuda/vector_dist_cuda_func_test.cu b/src/Vector/cuda/vector_dist_cuda_func_test.cu
index 6b3454e7..6d165d69 100644
--- a/src/Vector/cuda/vector_dist_cuda_func_test.cu
+++ b/src/Vector/cuda/vector_dist_cuda_func_test.cu
@@ -4,9 +4,198 @@
 #include "Vector/map_vector.hpp"
 #include "Vector/cuda/vector_dist_cuda_funcs.cuh"
 #include "Vector/util/vector_dist_funcs.hpp"
+#include "Decomposition/CartDecomposition.hpp"
+#include "util/cuda/scan_cuda.cuh"
+
+#define SUB_UNIT_FACTOR 1024
 
 BOOST_AUTO_TEST_SUITE( vector_dist_gpu_util_func_test )
 
+BOOST_AUTO_TEST_CASE( decomposition_ie_ghost_gpu_test_use )
+{
+	auto & v_cl = create_vcluster();
+
+	// Vcluster
+	Vcluster<> & vcl = create_vcluster();
+
+	CartDecomposition<3, float, CudaMemory, memory_traits_inte> dec(vcl);
+
+	// Physical domain
+	Box<3, float> box( { 0.0, 0.0, 0.0 }, { 1.0, 1.0, 1.0 });
+	size_t div[3];
+
+	// Get the number of processors and calculate the number of sub-domains
+	// for each processor (SUB_UNIT_FACTOR=1024)
+	size_t n_proc = vcl.getProcessingUnits();
+	size_t n_sub = n_proc * SUB_UNIT_FACTOR;
+
+	// Set the number of sub-domains on each dimension (in a scalable way)
+	for (int i = 0; i < 3; i++)
+	{	div[i] = openfpm::math::round_big_2(pow(n_sub,1.0/3));}
+
+	// Define ghost
+	Ghost<3, float> g(0.1);
+
+	// Boundary conditions
+	size_t bc[] = { PERIODIC, PERIODIC, PERIODIC };
+
+	// Decompose
+	dec.setParameters(div,box,bc,g);
+	dec.decompose();
+
+	// Get the local boxes
+
+	int nsub = dec.getNSubDomain();
+	int n_part = 10000 / nsub;
+
+	openfpm::vector_gpu<Point<3,float>> vg;
+	vg.resize(nsub*n_part);
+
+	for (size_t k = 0 ; k < nsub ; k++)
+	{
+		SpaceBox<3,float> sp = dec.getSubDomain(k);
+
+		for (size_t j = 0 ; j < n_part ; j++)
+		{
+			vg.template get<0>(k*n_part+j)[0] = (sp.getHigh(0) - sp.getLow(0))*((float)rand()/RAND_MAX) + sp.getLow(0);
+			vg.template get<0>(k*n_part+j)[1] = (sp.getHigh(1) - sp.getLow(1))*((float)rand()/RAND_MAX) + sp.getLow(1);
+			vg.template get<0>(k*n_part+j)[2] = (sp.getHigh(2) - sp.getLow(2))*((float)rand()/RAND_MAX) + sp.getLow(2);
+		}
+	}
+
+	vg.hostToDevice<0>();
+
+	// process on GPU the processor ID for each particle
+
+	auto ite = vg.getGPUIterator();
+
+	openfpm::vector_gpu<aggregate<unsigned int>> proc_id_out;
+	proc_id_out.resize(vg.size()+1);
+	proc_id_out.template get<0>(proc_id_out.size()-1) = 0;
+	proc_id_out.template hostToDevice(proc_id_out.size()-1,proc_id_out.size()-1);
+
+	num_proc_ghost_each_part<3,float,decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(proc_id_out.toKernel())>
+	<<<ite.wthr,ite.thr>>>
+	(dec.toKernel(),vg.toKernel(),proc_id_out.toKernel());
+
+	proc_id_out.deviceToHost<0>();
+
+	bool match = true;
+	for (size_t i = 0 ; i < vg.size() ; i++)
+	{
+		Point<3,float> xp = vg.template get<0>(i);
+
+		match &= proc_id_out.template get<0>(i) == dec.ghost_processorID_N(xp);
+	}
+
+	BOOST_REQUIRE_EQUAL(match,true);
+
+	////////////////////////// We now create the scan //////////////////////////////////////
+
+    openfpm::vector<aggregate<unsigned int>,
+                    CudaMemory,
+                    typename memory_traits_inte<aggregate<unsigned int>>::type,
+                    memory_traits_inte> starts;
+
+    starts.resize(proc_id_out.size());
+
+	// scan
+	scan<unsigned int,unsigned int>(proc_id_out,starts);
+	starts.deviceToHost<0>(starts.size()-1,starts.size()-1);
+
+	size_t sz = starts.template get<0>(starts.size()-1);
+
+	///////////////////////// We now test //////////////////////////
+
+    openfpm::vector<aggregate<unsigned int,unsigned int>,
+                    CudaMemory,
+                    typename memory_traits_inte<aggregate<unsigned int,unsigned int>>::type,
+                    memory_traits_inte> output;
+
+    output.resize(sz);
+
+	ite = vg.getGPUIterator();
+
+	// we compute processor id for each particle
+	proc_label_id_ghost<3,float,decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(starts.toKernel()),decltype(output.toKernel())>
+	<<<ite.wthr,ite.thr>>>
+	(dec.toKernel(),vg.toKernel(),starts.toKernel(),output.toKernel());
+
+	output.template deviceToHost<0,1>();
+
+	for (size_t i = 0 ; i < output.size() ; i++)
+	{
+		std::cout << output.template get<0>(i) << "   " << output.template get<1>(i) << std::endl;
+	}
+}
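The test above follows a count / scan / fill pattern: num_proc_ghost_each_part writes one count per particle (plus a trailing zero), the exclusive scan turns the counts into write offsets, and its last element is the total size of the label buffer filled by proc_label_id_ghost. A small standalone illustration of the same bookkeeping on the host, using std::exclusive_scan instead of the device scan from scan_cuda.cuh (values are made up):

    #include <iostream>
    #include <numeric>
    #include <vector>

    int main()
    {
        // per-particle label counts, with one trailing 0 so the scan also yields the total
        std::vector<unsigned int> counts = {2, 0, 1, 0, 3, 0};

        std::vector<unsigned int> starts(counts.size());
        std::exclusive_scan(counts.begin(), counts.end(), starts.begin(), 0u);

        // starts = {0, 2, 2, 3, 3, 6}: starts[i] is where particle i writes its
        // (processor id, particle id) pairs, starts.back() is the output size
        std::cout << "output size = " << starts.back() << std::endl;
    }
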
+
+BOOST_AUTO_TEST_CASE( decomposition_to_gpu_test_use )
+{
+	auto & v_cl = create_vcluster();
+
+	// Vcluster
+	Vcluster<> & vcl = create_vcluster();
+
+	CartDecomposition<3, float, CudaMemory, memory_traits_inte> dec(vcl);
+
+	// Physical domain
+	Box<3, float> box( { 0.0, 0.0, 0.0 }, { 1.0, 1.0, 1.0 });
+	size_t div[3];
+
+	// Get the number of processors and calculate the number of sub-domains
+	// for each processor (SUB_UNIT_FACTOR=1024)
+	size_t n_proc = vcl.getProcessingUnits();
+	size_t n_sub = n_proc * SUB_UNIT_FACTOR;
+
+	// Set the number of sub-domains on each dimension (in a scalable way)
+	for (int i = 0; i < 3; i++)
+	{	div[i] = openfpm::math::round_big_2(pow(n_sub,1.0/3));}
+
+	// Define ghost
+	Ghost<3, float> g(0.01);
+
+	// Boundary conditions
+	size_t bc[] = { PERIODIC, PERIODIC, PERIODIC };
+
+	// Decompose
+	dec.setParameters(div,box,bc,g);
+	dec.decompose();
+
+	openfpm::vector_gpu<Point<3,float>> vg;
+	vg.resize(10000);
+
+	for (size_t i = 0 ; i < 10000 ; i++)
+	{
+		vg.template get<0>(i)[0] = (float)rand()/RAND_MAX;
+		vg.template get<0>(i)[1] = (float)rand()/RAND_MAX;
+		vg.template get<0>(i)[2] = (float)rand()/RAND_MAX;
+	}
+
+	vg.hostToDevice<0>();
+
+	// process on GPU the processor ID for each particle
+
+	auto ite = vg.getGPUIterator();
+
+	openfpm::vector_gpu<aggregate<int,int>> proc_id_out;
+	proc_id_out.resize(vg.size());
+
+	process_id_proc_each_part<decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(proc_id_out.toKernel())>
+	<<<ite.wthr,ite.thr>>>
+	(dec.toKernel(),vg.toKernel(),proc_id_out.toKernel(),v_cl.rank());
+
+	proc_id_out.deviceToHost<0>();
+
+	bool match = true;
+	for (size_t i = 0 ; i < proc_id_out.size() ; i++)
+	{
+		Point<3,float> xp = vg.template get<0>(i);
+
+		match &= proc_id_out.template get<0>(i) == dec.processorIDBC(xp);
+	}
+}
+
+
 BOOST_AUTO_TEST_CASE( vector_dist_gpu_find_buffer_offsets_test )
 {
 	openfpm::vector_gpu<aggregate<int,int>> vgp;
diff --git a/src/Vector/cuda/vector_dist_cuda_funcs.cuh b/src/Vector/cuda/vector_dist_cuda_funcs.cuh
index 7c3b3c6d..31dfc6e4 100644
--- a/src/Vector/cuda/vector_dist_cuda_funcs.cuh
+++ b/src/Vector/cuda/vector_dist_cuda_funcs.cuh
@@ -10,6 +10,47 @@
 
 #include "Vector/util/vector_dist_funcs.hpp"
 
+template<unsigned int dim, typename St, typename decomposition_type, typename vector_type, typename start_type, typename output_type>
+__global__ void proc_label_id_ghost(decomposition_type dec,vector_type vd, start_type starts, output_type out)
+{
+	int p = threadIdx.x + blockIdx.x * blockDim.x;
+
+    if (p >= vd.size()) return;
+
+    Point<dim,St> xp = vd.template get<0>(p);
+
+    unsigned int base = starts.template get<0>(p);
+
+    dec.ghost_processor_ID(xp,out,base,p);
+}
+
+template<unsigned int dim, typename St, typename decomposition_type, typename vector_type,  typename output_type>
+__global__ void num_proc_ghost_each_part(decomposition_type dec, vector_type vd,  output_type out)
+{
+	int p = threadIdx.x + blockIdx.x * blockDim.x;
+
+    if (p >= vd.size()) return;
+
+    Point<dim,St> xp = vd.template get<0>(p);
+
+    out.template get<0>(p) = dec.ghost_processorID_N(xp);
+}
+
+template<typename cartdec_gpu, typename particles_type, typename vector_out>
+__global__ void process_id_proc_each_part(cartdec_gpu cdg, particles_type parts, vector_out output , int rank)
+{
+    int p = threadIdx.x + blockIdx.x * blockDim.x;
+
+    if (p >= parts.size()) return;
+
+	Point<3,float> xp = parts.template get<0>(p);
+
+	int pr = cdg.processorIDBC(xp);
+
+	output.template get<1>(p) = (pr == rank)?-1:pr;
+	output.template get<0>(p) = p;
+}
+
 template<typename vector_type,typename vector_type_offs>
 __global__  void find_buffer_offsets(vector_type vd, int * cnt, vector_type_offs offs)
 {
diff --git a/src/Vector/cuda/vector_dist_gpu_unit_tests.cu b/src/Vector/cuda/vector_dist_gpu_unit_tests.cu
index 26dbe011..b3886ee4 100644
--- a/src/Vector/cuda/vector_dist_gpu_unit_tests.cu
+++ b/src/Vector/cuda/vector_dist_gpu_unit_tests.cu
@@ -5,6 +5,8 @@
 #include <Vector/vector_dist.hpp>
 #include "Vector/tests/vector_dist_util_unit_tests.hpp"
 
+#define SUB_UNIT_FACTOR 1024
+
 BOOST_AUTO_TEST_SUITE( vector_dist_gpu_test )
 
 void print_test(std::string test, size_t sz)
@@ -375,7 +377,6 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
 
 }
 
-
 BOOST_AUTO_TEST_CASE( vector_dist_map_on_gpu_test)
 {
 	auto & v_cl = create_vcluster();
@@ -421,7 +422,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_map_on_gpu_test)
 	vd.hostToDeviceProp<0,1,2>();
 
 	// Ok we redistribute the particles (GPU based)
-	vd.map(MAP_ON_DEVICE);
+	vd.map(RUN_ON_DEVICE);
 
 	// Reset the host part
 
@@ -494,6 +495,8 @@ BOOST_AUTO_TEST_CASE( vector_dist_map_on_gpu_test)
 	BOOST_REQUIRE_EQUAL(l_cnt,vd.size_local());
 
 	vd.write("gpu_write_test");
+
+	vd.ghost_get<0,1,2>(RUN_ON_DEVICE);
 }
 
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/Vector/vector_dist.hpp b/src/Vector/vector_dist.hpp
index 05f15a77..bd73bc48 100644
--- a/src/Vector/vector_dist.hpp
+++ b/src/Vector/vector_dist.hpp
@@ -2394,7 +2394,7 @@ public:
 	 * \return Particle iterator
 	 *
 	 */
-	template<typename vrl> openfpm::vector_key_iterator_seq<typename vrl::Mem_type_type::loc_index> getParticleIteratorCRS(vrl & NN)
+	template<typename vrl> openfpm::vector_key_iterator_seq<typename vrl::Mem_type_type::local_index_type> getParticleIteratorCRS(vrl & NN)
 	{
 #ifdef SE_CLASS1
 		if (!(opt & BIND_DEC_TO_GHOST))
@@ -2405,7 +2405,7 @@ public:
 #endif
 
 		// First we check that
-		return openfpm::vector_key_iterator_seq<typename vrl::Mem_type_type::loc_index>(NN.getParticleSeq());
+		return openfpm::vector_key_iterator_seq<typename vrl::Mem_type_type::local_index_type>(NN.getParticleSeq());
 	}
 
 	/*! \brief Return from which cell we have to start in case of CRS interation
diff --git a/src/Vector/vector_dist_comm.hpp b/src/Vector/vector_dist_comm.hpp
index 4419fa2c..bd07859e 100644
--- a/src/Vector/vector_dist_comm.hpp
+++ b/src/Vector/vector_dist_comm.hpp
@@ -11,6 +11,7 @@
 #if defined(CUDA_GPU) && defined(__NVCC__)
 #include "util/cuda/moderngpu/kernel_mergesort.hxx"
 #include "Vector/cuda/vector_dist_cuda_funcs.cuh"
+#include "util/cuda/scan_cuda.cuh"
 #endif
 
 #include "Vector/util/vector_dist_funcs.hpp"
@@ -24,7 +25,7 @@
 
 #define BIND_DEC_TO_GHOST 1
 
-#define MAP_ON_DEVICE 1024
+#define RUN_ON_DEVICE 1024
 #define MAP_LOCAL 2
 
 /*! \brief compute the communication options from the ghost_get/put options
@@ -164,7 +165,7 @@ class vector_dist_comm
 								  openfpm::vector<size_t> & prc_r,
 								  size_t opt)
 	{
-		if (opt & MAP_ON_DEVICE)
+		if (opt & RUN_ON_DEVICE)
 		{
 			size_t prev_off = 0;
 			for (size_t i = 0; i < prc_sz.size() ; i++)
@@ -673,7 +674,7 @@ class vector_dist_comm
 	 * \param m_pos sending buffer for position
 	 * \param m_prp sending buffer for properties
 	 * \param offset from where start the list of the particles that migrate in o_part
-	 *        This parameter is used only in case of MAP_ON_DEVICE option
+	 *        This parameter is used only in case of RUN_ON_DEVICE option
 	 *
 	 */
 	void fill_send_map_buf(openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
@@ -696,7 +697,7 @@ class vector_dist_comm
 			cnt.get(i) = 0;
 		}
 
-		if (opt & MAP_ON_DEVICE)
+		if (opt & RUN_ON_DEVICE)
 		{
 #if defined(CUDA_GPU) && defined(__NVCC__)
 
@@ -736,7 +737,7 @@ class vector_dist_comm
 
 #else
 
-			std::cout << __FILE__ << ":" << __LINE__ << " error MAP_ON_DEVICE require that you compile with NVCC, but it seem compiled with a normal compiler" << std::endl;
+			std::cout << __FILE__ << ":" << __LINE__ << " error: RUN_ON_DEVICE requires that you compile with NVCC, but it seems to be compiled with a normal compiler" << std::endl;
 
 #endif
 		}
@@ -823,7 +824,7 @@ class vector_dist_comm
 			                                           openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & prc_sz,
 			                                           size_t opt)
 	{
-		if (opt == MAP_ON_DEVICE)
+		if (opt == RUN_ON_DEVICE)
 		{
 #ifdef __NVCC__
 
@@ -855,6 +856,7 @@ class vector_dist_comm
 			// get also the last element from lbl_p;
 			lbl_p.template deviceToHost<1>(lbl_p.size()-1,lbl_p.size()-1);
 
+			mem.deviceToHost();
 			int noff = *(int *)mem.getPointer();
 			prc_sz.resize(noff+1);
 			prc_sz.template get<0>(prc_sz.size()-1) = lbl_p.size();
@@ -862,7 +864,7 @@ class vector_dist_comm
 
 #else
 
-			std::cout << __FILE__ << ":" << __LINE__ << " error, it seems you tried to call map with MAP_ON_DEVICE option, this requires to compile the program with NVCC" << std::endl;
+			std::cout << __FILE__ << ":" << __LINE__ << " error, it seems you tried to call map with the RUN_ON_DEVICE option, this requires compiling the program with NVCC" << std::endl;
 
 #endif
 		}
@@ -930,57 +932,98 @@ class vector_dist_comm
 	 * \param v_prp vector of particle properties
 	 * \param prc for each particle it label the processor id (the owner of the particle, or where it should go the particle)
 	 * \param g_m ghost marker
+	 * \param opt ghost_get options
 	 *
 	 */
 	void labelParticlesGhost(openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
 			                 openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
 			                 openfpm::vector<size_t> & prc,
-			                 size_t & g_m)
+			                 size_t & g_m,
+			                 size_t opt)
 	{
 		// Buffer that contain for each processor the id of the particle to send
 		g_opart.clear();
 		g_opart.resize(dec.getNNProcessors());
 		prc_g_opart.clear();
 
-		// Iterate over all particles
-		auto it = v_pos.getIteratorTo(g_m);
-		while (it.isNext())
+		if (opt & RUN_ON_DEVICE)
 		{
-			auto key = it.get();
+#if defined(CUDA_GPU) && defined(__NVCC__)
 
-			// Given a particle, it return which processor require it (first id) and shift id, second id
-			// For an explanation about shifts vectors please consult getShiftVector in ie_ghost
-			const openfpm::vector<std::pair<size_t, size_t>> & vp_id = dec.template ghost_processorID_pair<typename Decomposition::lc_processor_id, typename Decomposition::shift_id>(v_pos.get(key), UNIQUE);
+            openfpm::vector<aggregate<unsigned int>,
+                            Memory,
+                            typename layout_base<aggregate<unsigned int>>::type,
+                            layout_base> proc_id_out;
+			proc_id_out.resize(v_pos.size());
 
-			for (size_t i = 0; i < vp_id.size(); i++)
-			{
-				// processor id
-				size_t p_id = vp_id.get(i).first;
+			auto ite = v_pos.getGPUIterator();
 
-				// add particle to communicate
-				g_opart.get(p_id).add();
-				g_opart.get(p_id).last().template get<0>() = key;
-				g_opart.get(p_id).last().template get<1>() = vp_id.get(i).second;
-			}
+			// First we have to see how many entry each particle produce
+			num_proc_ghost_each_part<3,float,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(proc_id_out.toKernel())>
+			<<<ite.wthr,ite.thr>>>
+			(dec.toKernel(),v_pos.toKernel(),proc_id_out.toKernel());
 
-			++it;
-		}
+            openfpm::vector<aggregate<unsigned int>,
+                            Memory,
+                            typename layout_base<aggregate<unsigned int>>::type,
+                            layout_base> starts;
+
+			// scan
+			scan<unsigned int,unsigned int>(proc_id_out,starts);
+
+			// we compute processor id for each particle
 
-		// remove all zero entry and construct prc (the list of the sending processors)
-		openfpm::vector<openfpm::vector<aggregate<size_t,size_t>>> g_opart_f;
 
-		// count the non zero element
-		for (size_t i = 0 ; i < g_opart.size() ; i++)
+			// we do a sort
+
+#else
+
+			std::cout << __FILE__ << ":" << __LINE__ << " error: to use gpu computation you must compile vector_dist.hpp with NVCC" << std::endl;
+
+#endif
+		}
+		else
 		{
-			if (g_opart.get(i).size() != 0)
+			// Iterate over all particles
+			auto it = v_pos.getIteratorTo(g_m);
+			while (it.isNext())
+			{
+				auto key = it.get();
+
+				// Given a particle, it return which processor require it (first id) and shift id, second id
+				// For an explanation about shifts vectors please consult getShiftVector in ie_ghost
+				const openfpm::vector<std::pair<size_t, size_t>> & vp_id = dec.template ghost_processorID_pair<typename Decomposition::lc_processor_id, typename Decomposition::shift_id>(v_pos.get(key), UNIQUE);
+
+				for (size_t i = 0; i < vp_id.size(); i++)
+				{
+					// processor id
+					size_t p_id = vp_id.get(i).first;
+
+					// add particle to communicate
+					g_opart.get(p_id).add();
+					g_opart.get(p_id).last().template get<0>() = key;
+					g_opart.get(p_id).last().template get<1>() = vp_id.get(i).second;
+				}
+
+				++it;
+			}
+
+			// remove all zero entry and construct prc (the list of the sending processors)
+			openfpm::vector<openfpm::vector<aggregate<size_t,size_t>>> g_opart_f;
+
+			// count the non zero element
+			for (size_t i = 0 ; i < g_opart.size() ; i++)
 			{
-				g_opart_f.add();
-				g_opart.get(i).swap(g_opart_f.last());
-				prc.add(dec.IDtoProc(i));
+				if (g_opart.get(i).size() != 0)
+				{
+					g_opart_f.add();
+					g_opart.get(i).swap(g_opart_f.last());
+					prc.add(dec.IDtoProc(i));
+				}
 			}
-		}
 
-		g_opart.swap(g_opart_f);
+			g_opart.swap(g_opart_f);
+		}
 	}
 
 	/*! \brief Call-back to allocate buffer to receive incoming elements (particles)
@@ -1159,7 +1202,7 @@ public:
 
 		// Label all the particles
 		if ((opt & SKIP_LABELLING) == false)
-		{labelParticlesGhost(v_pos,v_prp,prc_g_opart,g_m);}
+		{labelParticlesGhost(v_pos,v_prp,prc_g_opart,g_m,opt);}
 
 		// Send and receive ghost particle information
 		{
@@ -1344,14 +1387,14 @@ public:
 		fill_send_map_buf(v_pos,v_prp, prc_sz_r, m_pos, m_prp,prc_sz,opt);
 
 		size_t opt_ = 0;
-		if (opt & MAP_ON_DEVICE)
+		if (opt & RUN_ON_DEVICE)
 		{
 #if defined(CUDA_GPU) && defined(__NVCC__)
-			// Before doing the communication on MAP_ON_DEVICE we have to be sure that the previous kernels complete
+			// Before doing the communication on RUN_ON_DEVICE we have to be sure that the previous kernels complete
 			cudaDeviceSynchronize();
 			opt_ |= MPI_GPU_DIRECT;
 #else
-			std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option MAP_ON_DEVICE you must compile with NVCC" << std::endl;
+			std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
 #endif
 		}
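In labelParticlesGhost above, the RUN_ON_DEVICE branch currently stops after the scan ("we compute processor id for each particle" and "we do a sort" are still placeholders). A hedged sketch of what the missing fill step could look like, reusing proc_label_id_ghost from this patch; g_opart_device is a hypothetical device vector of (processor id, particle id) pairs, and proc_id_out is assumed to carry one trailing zero entry as in the unit test, so the last element of starts is the total number of labels. The sort that groups labels by processor and the construction of g_opart/prc are still left out:

    // read back only the last element of the scan to size the label buffer
    starts.template deviceToHost<0>(starts.size()-1,starts.size()-1);
    g_opart_device.resize(starts.template get<0>(starts.size()-1));

    // one (processor id, particle id) pair per ghost label
    proc_label_id_ghost<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),
                        decltype(starts.toKernel()),decltype(g_opart_device.toKernel())>
    <<<ite.wthr,ite.thr>>>
    (dec.toKernel(),v_pos.toKernel(),starts.toKernel(),g_opart_device.toKernel());

    // a device sort by processor id (e.g. mergesort from kernel_mergesort.hxx, already
    // included at the top of this file) would then group the labels per target processor
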
 
diff --git a/src/Vector/vector_dist_kernel.hpp b/src/Vector/vector_dist_kernel.hpp
new file mode 100644
index 00000000..2155a26e
--- /dev/null
+++ b/src/Vector/vector_dist_kernel.hpp
@@ -0,0 +1,114 @@
+/*
+ * vector_dist_kernel.hpp
+ *
+ *  Created on: Jul 28, 2018
+ *      Author: i-bird
+ */
+
+#ifndef VECTOR_DIST_GPU_HPP_
+#define VECTOR_DIST_GPU_HPP_
+
+#ifdef CUDA_GPU
+
+#define POS_PROP -1
+
+#define GET_PARTICLE(vd) blockDim.x*blockIdx.x + threadIdx.x; if (blockDim.x*blockIdx.x + threadIdx.x >= vd.size()) {return;};
+
+template<unsigned int dim,
+         typename St,
+         typename prop>
+class vector_dist_ker
+{
+	//! Ghost marker, all the particles with id > g_m are ghost, all with id < g_m are real particles
+	int g_m = 0;
+
+	//! Particle position vector (it has 2 elements): the first has the real particles assigned to a processor,
+	//! the second element contains the unassigned particles
+	openfpm::vector_gpu_ker<Point<dim,St>,memory_traits_inte> v_pos;
+
+	//! Particle properties vector (it has 2 elements): the first has the real particles assigned to a processor,
+	//! the second element contains the unassigned particles
+	openfpm::vector_gpu_ker<prop,memory_traits_inte> v_prp;
+
+public:
+
+	//! space type
+	typedef St stype;
+
+	//! dimensions of space
+	static const unsigned int dims = dim;
+
+	vector_dist_ker(const openfpm::vector_gpu_ker<Point<dim,St>,memory_traits_inte> & v_pos, const openfpm::vector_gpu_ker<prop,memory_traits_inte> & v_prp)
+	:v_pos(v_pos),v_prp(v_prp)
+	{}
+
+	/*! \brief return the number of particles
+	 *
+	 * \return the number of particles
+	 *
+	 */
+	__device__ int size() {return v_pos.size();}
+
+	/*! \brief Get the position of an element
+	 *
+	 * see the vector_dist iterator usage to get an element key
+	 *
+	 * \param vec_key element
+	 *
+	 * \return the position of the element in space
+	 *
+	 */
+	__device__ inline auto getPos(int vec_key) -> decltype(v_pos.template get<0>(vec_key))
+	{
+		return v_pos.template get<0>(vec_key);
+	}
+
+	/*! \brief Get the position of an element
+	 *
+	 * see the vector_dist iterator usage to get an element key
+	 *
+	 * \param vec_key element
+	 *
+	 * \return the position of the element in space
+	 *
+	 */
+	__device__ inline auto getPos(int vec_key) const -> decltype(v_pos.template get<0>(vec_key))
+	{
+		return v_pos.template get<0>(vec_key);
+	}
+
+	/*! \brief Get the property of an element
+	 *
+	 * see the vector_dist iterator usage to get an element key
+	 *
+	 * \tparam id property id
+	 * \param vec_key vector element
+	 *
+	 * \return the selected property of the vector element
+	 *
+	 */
+	template<unsigned int id> __device__  inline auto getProp(int vec_key) -> decltype(v_prp.template get<id>(vec_key))
+	{
+		return v_prp.template get<id>(vec_key);
+	}
+
+	/*! \brief Get the property of an element
+	 *
+	 * see the vector_dist iterator usage to get an element key
+	 *
+	 * \tparam id property id
+	 * \param vec_key vector element
+	 *
+	 * \return the selected property of the vector element
+	 *
+	 */
+	template<unsigned int id> __device__  inline auto getProp(int vec_key) const -> decltype(v_prp.template get<id>(vec_key))
+	{
+		return v_prp.template get<id>(vec_key);
+	}
+
+};
+
+#endif
+
+#endif /* VECTOR_DIST_GPU_HPP_ */
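vector_dist_ker is the device-side view of a particle set. A minimal kernel sketch using GET_PARTICLE, getPos and getProp; it assumes the host-side vector_dist exposes a toKernel() returning this type (not shown in this patch) and that property 0 is a float scalar:

    // translate every particle along x and record the displacement in property 0
    template<typename vector_dist_ker_type>
    __global__ void translate_x(vector_dist_ker_type vd, float dx)
    {
        // GET_PARTICLE computes the particle index and returns if it is out of range
        auto p = GET_PARTICLE(vd);

        vd.getPos(p)[0] += dx;
        vd.template getProp<0>(p) = dx;
    }
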
diff --git a/src/config/config_cmake.h.in b/src/config/config_cmake.h.in
new file mode 100644
index 00000000..a0d62c9e
--- /dev/null
+++ b/src/config/config_cmake.h.in
@@ -0,0 +1,159 @@
+/* Coverity scan */
+${DEFINE_COVERTY_SCAN}
+
+/* GPU support */
+${DEFINE_CUDA_GPU}
+
+/* Debug */
+${DEFINE_DEBUG} /**/
+
+/* Debug */
+${DEFINE_DEBUG_MODE} /**/
+
+/* Define to dummy `main' function (if any) required to link to the Fortran
+   libraries. */
+${DEFINE_F77_DUMMY_MAIN}
+
+/* Define if F77 and FC dummy `main' functions are identical. */
+${DEFINE_FC_DUMMY_MAIN_EQ_F77}
+
+/* Define if you have a BLAS library. */
+${DEFINE_HAVE_BLAS}
+
+/* define if the Boost library is available */
+${DEFINE_HAVE_BOOST}
+
+/* define if the Boost::IOStreams library is available */
+${DEFINE_HAVE_BOOST_IOSTREAMS} /**/
+
+/* define if the Boost::PROGRAM_OPTIONS library is available */
+${DEFINE_HAVE_BOOST_PROGRAM_OPTIONS} /**/
+
+/* define if the Boost::Unit_Test_Framework library is available */
+${DEFINE_HAVE_BOOST_UNIT_TEST_FRAMEWORK} /**/
+
+/* Have clock time */
+${DEFINE_HAVE_CLOCK_GETTIME} /**/
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+${DEFINE_HAVE_DLFCN_H}
+
+/* Define if you have EIGEN library. */
+${DEFINE_HAVE_EIGEN}
+
+/* Define to 1 if you have the <Eigen/Dense> header file. */
+${DEFINE_HAVE_EIGEN_DENSE}
+
+/* Define to 1 if you have the <Eigen/LU> header file. */
+${DEFINE_HAVE_EIGEN_LU}
+
+/* Defined if you have HDF5 support */
+${DEFINE_HAVE_HDF5}
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+${DEFINE_HAVE_INTTYPES_H}
+
+/* Define if you have LAPACK library */
+${DEFINE_HAVE_LAPACK}
+
+/* Define if you have LIBHILBERT library */
+${DEFINE_HAVE_LIBHILBERT}
+
+/* Have quad math lib */
+${DEFINE_HAVE_LIBQUADMATH}
+
+/* Define to 1 if you have the <memory.h> header file. */
+${DEFINE_HAVE_MEMORY_H}
+
+/* Define if you have METIS library */
+${DEFINE_HAVE_METIS}
+
+/* MPI Enabled */
+${DEFINE_HAVE_MPI}
+
+/* We have OSX */
+${DEFINE_HAVE_OSX}
+
+/* Define if you have PARMETIS library */
+${DEFINE_HAVE_PARMETIS}
+
+/* Define if you have PETSC library */
+${DEFINE_HAVE_PETSC}
+
+/* Define to 1 if you have the <stdint.h> header file. */
+${DEFINE_HAVE_STDINT_H}
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+${DEFINE_HAVE_STDLIB_H}
+
+/* Define to 1 if you have the <strings.h> header file. */
+${DEFINE_HAVE_STRINGS_H}
+
+/* Define to 1 if you have the <string.h> header file. */
+${DEFINE_HAVE_STRING_H}
+
+/* Define if you have SUITESPARSE library. */
+${DEFINE_HAVE_SUITESPARSE}
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+${DEFINE_HAVE_SYS_STAT_H}
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+${DEFINE_HAVE_SYS_TYPES_H}
+
+/* Define to 1 if you have the <unistd.h> header file. */
+${DEFINE_HAVE_UNISTD_H}
+
+/* Define to the sub-directory where libtool stores uninstalled libraries. */
+#define LT_OBJDIR ".libs/"
+
+/* NVCC compiling */
+${DEFINE_NVCC} /**/
+
+/* Name of package */
+#define PACKAGE "openfpm_pdata"
+
+/* Define to the address where bug reports for this package should be sent. */
+#define PACKAGE_BUGREPORT "BUG-REPORT-ADDRESS"
+
+/* Define to the full name of this package. */
+#define PACKAGE_NAME "OpenFPM_pdata"
+
+/* Define to the full name and version of this package. */
+#define PACKAGE_STRING "OpenFPM_pdata 1.0.0"
+
+/* Define to the one symbol short name of this package. */
+#define PACKAGE_TARNAME "openfpm_pdata"
+
+/* Define to the home page for this package. */
+#define PACKAGE_URL ""
+
+/* Define to the version of this package. */
+#define PACKAGE_VERSION "1.0.0"
+
+/* Test performance mode */
+${DEFINE_PERFORMANCE_TEST}
+
+/* Security enhancement class 1 */
+${DEFINE_SE_CLASS1}
+
+/* Security enhancement class 2 */
+${DEFINE_SE_CLASS2}
+
+/* Security enhancement class 3 */
+${DEFINE_SE_CLASS3}
+
+/* Define to 1 if you have the ANSI C header files. */
+${DEFINE_STDC_HEADERS}
+
+/* If an error occur stop the program */
+${DEFINE_STOP_ON_ERROR}
+
+/* Test coverage mode */
+${DEFINE_TEST_COVERAGE_MODE}
+
+/* when an error occurs continue but avoid unsafe operations */
+/* #undef THROW_ON_ERROR */
+
+/* Version number of package */
+#define VERSION "1.0.0"
diff --git a/src/main_single.cpp b/src/main_single.cpp
new file mode 100644
index 00000000..ac34002d
--- /dev/null
+++ b/src/main_single.cpp
@@ -0,0 +1,32 @@
+#include <iostream>
+
+size_t debug_tot_call = 0;
+
+#define PRINT_STACKTRACE
+#define CHECKFOR_POSNAN
+#define CHECKFOR_POSINF
+#define CHECKFOR_PROPNAN
+#define CHECKFOR_PROPINF
+
+#define BOOST_DISABLE_ASSERTS
+
+
+#include "config.h"
+#undef VERSION
+
+#define BOOST_TEST_DYN_LINK
+#include <boost/test/unit_test.hpp>
+
+// initialization function:
+bool init_unit_test()
+{
+  return true;
+}
+
+// entry point:
+int main(int argc, char* argv[])
+{
+  return boost::unit_test::unit_test_main( &init_unit_test, argc, argv );
+}
+
+#include "unit_test_init_cleanup.hpp"
-- 
GitLab