From b45058b1122bae59e88a1ccb07269d94405fb591 Mon Sep 17 00:00:00 2001
From: Serhii Yaskovets <>
Date: Thu, 27 Apr 2023 11:46:03 +0200
Subject: [PATCH] Revert "Revert "Merge remote-tracking branch

This reverts commit 99cc3f75f8a17deba4bc67774e6c0bc9582a5c52.
 src/CMakeLists.txt                            |   30 +-
 src/DCPSE/DCPSE_op/DCPSE_Solver.cuh           |  801 ++++++++++
 src/DCPSE/DCPSE_op/DCPSE_Solver.hpp           |  325 +++-
 src/DCPSE/DCPSE_op/DCPSE_op.hpp               | 1009 ++++++++++---
 src/DCPSE/DCPSE_op/DCPSE_surface_op.hpp       | 1008 +++++++++++++
 src/DCPSE/DCPSE_op/EqnsStruct.hpp             |  738 ++++++++-
 .../DCPSE_op/tests/DCPSE_op_Solver_test.cpp   |  188 ++-
 .../DCPSE_op/tests/    | 1333 +++++++++++++++++
 .../DCPSE_op/tests/DCPSE_op_Surface_tests.cpp | 1112 ++++++++++++++
 .../DCPSE_op/tests/    |  616 ++++++++
 src/DCPSE/DCPSE_op/tests/DCPSE_op_test3d.cpp  |  350 ++++-
 .../tests/DCPSE_op_test_base_tests.cpp        |  290 +++-
 src/DCPSE/Dcpse.cuh                           | 1031 +++++++++++++
 src/DCPSE/Dcpse.hpp                           |  626 ++++++--
 src/DCPSE/DcpseDiagonalScalingMatrix.hpp      |   25 +-
 src/DCPSE/DcpseInterpolation.hpp              |  107 ++
 src/DCPSE/Monomial.cuh                        |  204 +++
 src/DCPSE/Monomial.hpp                        |    1 +
 src/DCPSE/MonomialBasis.hpp                   |  118 +-
 src/DCPSE/Support.hpp                         |   45 +-
 src/DCPSE/SupportBuilder.cuh                  |  146 ++
 src/DCPSE/SupportBuilder.hpp                  |  364 +++--
 src/DCPSE/Vandermonde.hpp                     |   51 +-
 src/DCPSE/VandermondeRowBuilder.hpp           |   20 +-
 src/DCPSE/tests/Support_unit_tests.cpp        |   34 +-
 src/DCPSE/tests/Vandermonde_unit_tests.cpp    |    4 +-
 src/FiniteDifference/FD_expressions.hpp       |  798 +++++++++-
 src/FiniteDifference/FD_op_Tests.cpp          |   99 ++
 src/Matrix/SparseMatrix_petsc.hpp             |    7 +-
 src/OdeIntegrators/OdeIntegrators.hpp         |  205 ++-
 .../tests/OdeIntegrator_grid_tests.cpp        |  576 +++++++
 .../tests/OdeIntegratores_base_tests.cpp      |   19 +-
 .../tests/          |  104 ++
 ...algebra_ofp.hpp => vector_algebra_ofp.hpp} |  792 +++++-----
 src/OdeIntegrators/vector_algebra_ofp_gpu.hpp |  993 ++++++++++++
 .../cuda/vector_dist_operators_cuda.cuh       |    9 +-
 .../Vector/vector_dist_operators.hpp          |  586 +++++---
 src/Solvers/petsc_solver.hpp                  |  199 ++-
 .../interpolation_unit_tests.cpp              |  443 ++++++
 src/interpolation/lambda_kernel.hpp           |   39 +-
 src/level_set/closest_point/closest_point.hpp |  226 ++-
 .../closest_point_unit_tests.cpp              |  185 ++-
 src/util/SphericalHarmonics.hpp               |   19 +-
 43 files changed, 14284 insertions(+), 1591 deletions(-)
 create mode 100644 src/DCPSE/DCPSE_op/DCPSE_Solver.cuh
 create mode 100644 src/DCPSE/DCPSE_op/DCPSE_surface_op.hpp
 create mode 100644 src/DCPSE/DCPSE_op/tests/
 create mode 100644 src/DCPSE/DCPSE_op/tests/DCPSE_op_Surface_tests.cpp
 create mode 100644 src/DCPSE/DCPSE_op/tests/
 create mode 100644 src/DCPSE/Dcpse.cuh
 create mode 100644 src/DCPSE/DcpseInterpolation.hpp
 create mode 100644 src/DCPSE/Monomial.cuh
 create mode 100644 src/DCPSE/SupportBuilder.cuh
 create mode 100644 src/OdeIntegrators/tests/OdeIntegrator_grid_tests.cpp
 create mode 100644 src/OdeIntegrators/tests/
 rename src/OdeIntegrators/{boost_vector_algebra_ofp.hpp => vector_algebra_ofp.hpp} (86%)
 create mode 100644 src/OdeIntegrators/vector_algebra_ofp_gpu.hpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 37f1b92b..c7589304 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -17,6 +17,9 @@ endif()
 	set(CUDA_SOURCES Operators/Vector/
+					 #DCPSE/DCPSE_op/tests/
+			 	 	 OdeIntegrators/tests/
+					 #DCPSE/DCPSE_op/tests/
         hip_add_executable(numerics ${OPENFPM_INIT_FILE} ${CUDA_SOURCES}
-                OdeIntegrators/tests/OdeIntegratores_base_tests.cpp
+				OdeIntegrators/tests/OdeIntegratores_base_tests.cpp
+				OdeIntegrators/tests/OdeIntegrator_grid_tests.cpp
@@ -74,6 +78,7 @@ else()
 	add_executable(numerics ${OPENFPM_INIT_FILE} ${CUDA_SOURCES}
+		OdeIntegrators/tests/OdeIntegrator_grid_tests.cpp
@@ -100,6 +105,7 @@ else()
+			DCPSE/DCPSE_op/tests/DCPSE_op_Surface_tests.cpp
 #			BoundaryConditions/tests/method_of_images_cylinder_unit_test.cpp
 #		    level_set/closest_point/closest_point_unit_tests.cpp
@@ -141,6 +147,7 @@ if(CUDA_FOUND)
         if (TEST_COVERAGE)
                 target_compile_options(numerics PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: -Xcompiler "-fprofile-arcs -ftest-coverage" >)
+    target_link_libraries(numerics -lcublas)
 target_include_directories (numerics PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
@@ -157,6 +164,8 @@ target_include_directories (numerics PUBLIC ${HDF5_ROOT}/include)
 target_include_directories (numerics PUBLIC ${LIBHILBERT_INCLUDE_DIRS})
 target_include_directories (numerics PUBLIC ${Boost_INCLUDE_DIRS})
 target_include_directories (numerics PUBLIC ${Vc_INCLUDE_DIR})
+target_include_directories (numerics PUBLIC ${BLITZ_ROOT}/include)
+target_include_directories (numerics PUBLIC ${ALGOIM_ROOT}/include)
 target_include_directories (numerics PUBLIC ${ALPAKA_ROOT}/include)
 target_include_directories (numerics PUBLIC ${MPI_C_INCLUDE_DIRS})
@@ -243,7 +252,7 @@ install(FILES FiniteDifference/Average.hpp
-      	      FiniteDifference/sum.hpp 
+      	      FiniteDifference/sum.hpp
@@ -271,18 +280,23 @@ install(FILES Operators/Vector/vector_dist_operators_extensions.hpp
 	       COMPONENT OpenFPM)
 install(FILES Operators/Vector/cuda/vector_dist_operators_cuda.cuh
-	      DESTINATION openfpm_numerics/include/Operators/Vector/cuda 
+	      DESTINATION openfpm_numerics/include/Operators/Vector/cuda
 	      COMPONENT OpenFPM)
 install(FILES DCPSE/Dcpse.hpp
+		DCPSE/Dcpse.cuh
+		DCPSE/DCPSE_op/DCPSE_Solver.cuh
+		DCPSE/Monomial.cuh
+		DCPSE/SupportBuilder.cuh
+		DCPSE/DcpseInterpolation.hpp
 		DESTINATION openfpm_numerics/include/DCPSE
@@ -297,11 +311,13 @@ install(FILES util/eq_solve_common.hpp
 install(FILES DCPSE/DCPSE_op/DCPSE_op.hpp
+		DCPSE/DCPSE_op/DCPSE_surface_op.hpp
 		DESTINATION openfpm_numerics/include/DCPSE/DCPSE_op
 install(FILES OdeIntegrators/OdeIntegrators.hpp
-		OdeIntegrators/boost_vector_algebra_ofp.hpp
+        OdeIntegrators/vector_algebra_ofp.hpp
+		OdeIntegrators/vector_algebra_ofp_gpu.hpp
 		DESTINATION openfpm_numerics/include/OdeIntegrators
@@ -310,7 +326,7 @@ install(FILES Draw/DrawParticles.hpp
-      	      DESTINATION openfpm_numerics/include/Draw 
+      	      DESTINATION openfpm_numerics/include/Draw
 	      COMPONENT OpenFPM)
 install(FILES interpolation/interpolation.hpp 
@@ -338,6 +354,10 @@ install(FILES DMatrix/EMatrix.hpp
 	DESTINATION openfpm_numerics/include/DMatrix
+install(FILES level_set/closest_point/closest_point.hpp
+	      DESTINATION openfpm_numerics/include/level_set/closest_point
+	      COMPONENT OpenFPM)
 #  add_executable(particle_test
diff --git a/src/DCPSE/DCPSE_op/DCPSE_Solver.cuh b/src/DCPSE/DCPSE_op/DCPSE_Solver.cuh
new file mode 100644
index 00000000..8045214f
--- /dev/null
+++ b/src/DCPSE/DCPSE_op/DCPSE_Solver.cuh
@@ -0,0 +1,801 @@
+// Created by Serhii
+// #include "DCPSE_op.hpp"
+#include "DCPSE/DCPSE_op/DCPSE_op.hpp"
+#include "Matrix/SparseMatrix.hpp"
+#include "Vector/Vector.hpp"
+#include "NN/CellList/CellDecomposer.hpp"
+#include "Vector/Vector_util.hpp"
+#include "Vector/vector_dist.hpp"
+#include "Solvers/umfpack_solver.hpp"
+#include "Solvers/petsc_solver.hpp"
+#include "util/eq_solve_common.hpp"
+/*enum eq_struct
+//template<unsigned int prp_id> using prop_id = boost::mpl::int_<prp_id>;
+/*! \brief Create a Matrix System for Ax=b
+ *
+ * This Class is for creating a placeholder for the matrix system
+ *
+ * Ax = b
+ *
+ *
+ * \param Sys_eqs Equation Structure which has information about the system. Refer to EqnStruct.cpp for examples
+ * \param parts Particle set
+ *
+ */
+template<typename Sys_eqs, typename particles_type>
+class DCPSE_scheme_gpu {
+    //! type of the sparse matrix
+    typename Sys_eqs::SparseMatrix_type A;
+    //! Vector b
+    typename Sys_eqs::Vector_type b;
+    //! Sparse matrix triplet type
+    typedef typename Sys_eqs::SparseMatrix_type::triplet_type triplet;
+    //! Distributed grid map
+    typedef vector_dist_gpu<Sys_eqs::dims, typename Sys_eqs::stype, aggregate<size_t>> p_map_type;
+    //! mapping grid
+    p_map_type p_map;
+    //! Grid points that has each processor
+    openfpm::vector<size_t> pnt;
+    //! Particles used to impose the system
+    particles_type &parts;
+    //! colums shift map
+    //int col_sm[Sys_eqs::nvar];
+    //! Each point in the grid has a global id, to decompose correctly the Matrix each processor contain a
+    //! contiguos range of global id, example processor 0 can have from 0 to 234 and processor 1 from 235 to 512
+    //! no processors can have holes in the sequence, this number indicate where the sequence start for this
+    //! processor
+    size_t s_pnt;
+    //! row of the matrix
+    size_t row;
+    //! row on b
+    size_t row_b;
+    //! Total number of points
+    size_t tot;
+    //! solver options
+    options_solver opt;
+    size_t offset;
+    /*! \brief Construct the gmap structure
+ *
+ */template<typename options>
+    void construct_pmap(options opt = options_solver::STANDARD) {
+        Vcluster<> &v_cl = create_vcluster();
+        // Calculate the size of the local domain
+        size_t sz = p_map.size_local();
+        // Get the total size of the local grids on each processors
+        v_cl.allGather(sz, pnt);
+        v_cl.execute();
+        s_pnt = 0;
+        // calculate the starting point for this processor
+        for (size_t i = 0; i < v_cl.getProcessUnitID(); i++)
+            s_pnt += pnt.get(i);
+        tot = sz;
+        v_cl.sum(tot);
+        v_cl.execute();
+        // resize b if needed
+        if (opt == options_solver::STANDARD) {
+            b.resize(Sys_eqs::nvar * tot, Sys_eqs::nvar * sz);
+        } else if (opt == options_solver::LAGRANGE_MULTIPLIER) {
+            if (v_cl.rank() == v_cl.size() - 1) {
+                b.resize(Sys_eqs::nvar * tot + 1, Sys_eqs::nvar * sz + 1);
+            } else {
+                b.resize(Sys_eqs::nvar * tot + 1, Sys_eqs::nvar * sz);
+            }
+        }
+            //Use Custom number of constraints using opt as an integer
+        else {
+            if (v_cl.rank() == v_cl.size() - 1) {
+                b.resize(Sys_eqs::nvar * tot - offset, Sys_eqs::nvar * sz - offset);
+            } else {
+                b.resize(Sys_eqs::nvar * tot - offset, Sys_eqs::nvar * sz);
+            }
+        }
+        // Calculate the starting point
+        // Counter
+        size_t cnt = 0;
+        // Create the re-mapping grid
+        auto it = p_map.getDomainIterator();
+        while (it.isNext()) {
+            auto key = it.get();
+            for (int i = 0; i < particles_type::dims; i++) {
+                p_map.getPos(key)[i] = parts.getPos(key)[i];
+            }
+            p_map.template getProp<0>(key) = cnt + s_pnt;
+            ++cnt;
+            ++it;
+        }
+        // sync the ghost
+        p_map.template ghost_get<0>();
+    }
+    //! Encapsulation of the b term as constant
+    struct constant_b {
+        //! scalar
+        typename Sys_eqs::stype scal;
+        /*! \brief Constrictor from a scalar
+         *
+         * \param scal scalar
+         *
+         */
+        constant_b(typename Sys_eqs::stype scal) {
+            this->scal = scal;
+        }
+        /*! \brief Get the b term on a grid point
+         *
+         * \note It does not matter the grid point it is a scalar
+         *
+         * \param  key grid position (unused because it is a constant)
+         *
+         * \return the scalar
+         *
+         */
+        typename Sys_eqs::stype get(size_t key) {
+            return scal;
+        }
+    };
+    //! Encapsulation of the b term as constant
+    template<unsigned int prp_id>
+    struct variable_b {
+        //! scalar
+        typename Sys_eqs::stype scal;
+        particles_type &parts;
+        /*! \brief Constrictor from a scalar
+         *
+         * \param scal scalar
+         *
+         */
+        variable_b(particles_type &parts)
+                : parts(parts) {}
+        /*! \brief Get the b term on a grid point
+         *
+         * \note It does not matter the grid point it is a scalar
+         *
+         * \param  key grid position (unused because it is a constant)
+         *
+         * \return the scalar
+         *
+         */
+        inline typename Sys_eqs::stype get(size_t key) {
+            return parts.template getProp<prp_id>(key);
+        }
+    };
+    /*! \brief Check if the Matrix is consistent
+ *
+ */
+    void consistency() {
+        openfpm::vector<triplet> &trpl = A.getMatrixTriplets();
+        // A and B must have the same rows
+        if (row != row_b) {
+            std::cerr << "Error " << __FILE__ << ":" << __LINE__
+                      << " the term B and the Matrix A for Ax=B must contain the same number of rows " << row << "!=" << row_b << "\n";
+            return;
+        }
+        if (row_b != p_map.size_local() * Sys_eqs::nvar) {
+            std::cerr << "Error " << __FILE__ << ":" << __LINE__ << " your system is underdetermined you set "
+                      << row_b << " conditions " << " but i am expecting " << p_map.size_local() * Sys_eqs::nvar
+                      << std::endl;
+            return;
+        }
+        // Indicate all the non zero rows
+        openfpm::vector<unsigned char> nz_rows;
+        nz_rows.resize(row_b);
+        for (size_t i = 0; i < trpl.size(); i++) {
+            if (trpl.get(i).row() - s_pnt * Sys_eqs::nvar >= nz_rows.size()) {
+                std::cerr << "Error " << __FILE__ << ":" << __LINE__
+                          << " It seems that you are setting colums that does not exist \n";
+            }
+            if (trpl.get(i).value() != 0) { nz_rows.get(trpl.get(i).row() - s_pnt * Sys_eqs::nvar) = true; }
+        }
+        // Indicate all the non zero colums
+        // This check can be done only on single processor
+        Vcluster<> &v_cl = create_vcluster();
+        if (v_cl.getProcessingUnits() == 1) {
+            openfpm::vector<unsigned> nz_cols;
+            nz_cols.resize(row_b);
+            for (size_t i = 0; i < trpl.size(); i++) {
+                if (trpl.get(i).value() != 0) { nz_cols.get(trpl.get(i).col()) = true; }
+            }
+            // all the rows must have a non zero element
+            for (size_t i = 0; i < nz_rows.size(); i++) {
+                if (nz_rows.get(i) == false) {
+                    std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " Ill posed matrix row " << i
+                              << " is not filled " << " equation: " << "\n";
+                }
+            }
+            // all the colums must have a non zero element
+            for (size_t i = 0; i < nz_cols.size(); i++) {
+                if (nz_cols.get(i) == false)
+                    std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " Ill posed matrix colum " << i
+                              << " is not filled\n";
+            }
+        }
+    }
+    /*! \brief Solve an equation
+    *
+    *  \warning exp must be a scalar type
+    *
+    * \param exp where to store the result
+    *
+    */
+    template<typename solType, typename expr_type>
+    void copy_impl(solType & x, expr_type exp, unsigned int comp)
+    {
+        auto & parts = exp.getVector();
+        auto it = parts.getDomainIterator();
+        while (it.isNext()) {
+            auto p = it.get();
+            exp.value(p) = x(p.getKey() * Sys_eqs::nvar + comp + s_pnt * Sys_eqs::nvar);
+            ++it;
+        }
+    }
+    template<typename solType, typename exp1, typename ... othersExp>
+    void copy_nested(solType &x, unsigned int &comp, exp1 exp, othersExp ... exps) {
+        copy_impl(x, exp, comp);
+        comp++;
+        copy_nested(x, comp, exps ...);
+    }
+    template<typename solType, typename exp1>
+    void copy_nested(solType &x, unsigned int &comp, exp1 exp) {
+        copy_impl(x, exp, comp);
+        comp++;
+    }
+    /*! \brief Set the structure of the system of equation
+     *
+     * For example for stokes-flow where you are solving for V = velocity (Vector) and P = pressure (scalar)
+     *
+     * you should call this function with
+     *
+     * setEquationStructure({eq_struct::VECTOR,eq_struct::SCALAR})
+     *
+     */
+/*    void setEquationStructure(std::initializer_list<eq_struct> l)
+    {
+    	int i = 0;
+    	for (eq_struct e : l)
+    	{
+    		if (e == eq_struct::VECTOR)
+    		{
+    			for (int j = 0 ; j < Sys_eqs::dims ; j++)
+    			{
+    				col_sm[i+j] = i;
+    			}
+    			i += Sys_eqs::dims;
+    		}
+    		else
+    		{
+    			col_sm[i] = i;
+    		}
+    	}
+    }*/
+    /*! \brief Solve an equation
+     *
+     *  \warning exp must be a scalar type
+     *
+     * \param exp where to store the result
+     *
+     */
+    template<typename ... expr_type>
+    void solve(expr_type ... exps) {
+        if (sizeof...(exps) != Sys_eqs::nvar) {
+            std::cerr << __FILE__ << ":" << __LINE__ << " Error the number of properties you gave does not match the solution in\
+    													dimensionality, I am expecting " << Sys_eqs::nvar <<
+                      " properties " << std::endl;
+        };
+        typename Sys_eqs::solver_type solver;
+//        umfpack_solver<double> solver;
+        auto x = solver.solve(getA(opt), getB(opt));
+        unsigned int comp = 0;
+        copy_nested(x, comp, exps ...);
+    }
+    /*! \brief Solve an equation
+     *
+     *  \warning exp must be a scalar type
+     *
+     * \param Solver Manually created Solver instead from the Equation structure
+     * \param exp where to store the result
+     *
+     */
+    template<typename SolverType, typename ... expr_type>
+    void solve_with_solver(SolverType &solver, expr_type ... exps) {
+#ifdef SE_CLASS1
+        if (sizeof...(exps) != Sys_eqs::nvar) {
+            std::cerr << __FILE__ << ":" << __LINE__ << " Error the number of properties you gave does not match the solution in\
+    													dimensionality, I am expecting " << Sys_eqs::nvar <<
+                      " properties " << std::endl;
+        };
+        auto x = solver.solve(getA(opt), getB(opt));
+        unsigned int comp = 0;
+        copy_nested(x, comp, exps ...);
+    }
+    /*! \brief Solve an equation
+     *
+     *  \warning exp must be a scalar type
+     *
+     * \param exp where to store the result
+     *
+     */
+    template<typename SolverType, typename ... expr_type>
+    void try_solve_with_solver(SolverType &solver, expr_type ... exps) {
+        if (sizeof...(exps) != Sys_eqs::nvar) {
+            std::cerr << __FILE__ << ":" << __LINE__ << " Error the number of properties you gave does not match the solution in\
+    													dimensionality, I am expecting " << Sys_eqs::nvar <<
+                      " properties " << std::endl;
+        };
+        auto x = solver.try_solve(getA(opt), getB(opt));
+        unsigned int comp = 0;
+        copy_nested(x, comp, exps ...);
+    }
+    void reset_b()
+    {
+    	row_b = 0;
+    }
+    void reset(particles_type &part, options_solver opt = options_solver::STANDARD)
+    {
+    	row = 0;
+    	row_b = 0;
+    	p_map.clear();
+    	p_map.resize(part.size_local());
+    	A.getMatrixTriplets().clear();
+    	construct_pmap(opt);
+    }
+    void reset_nodec()
+    {
+    	row = 0;
+    	row_b = 0;
+    	A.getMatrixTriplets().clear();
+    }
+    /*! \brief Constructor for the solver
+     *
+     *
+     * \param parts Particle set
+     * \param option_solver opt=options_solver::LAGRANGE_MULTIPLIER can be used for purely Neumann system
+     *
+     */
+    DCPSE_scheme_gpu(particles_type &part, options_solver opt = options_solver::STANDARD)
+            : parts(part), p_map(part.getDecomposition(), 0), row(0), row_b(0), opt(opt) {
+        p_map.resize(part.size_local());
+        construct_pmap(opt);
+    }
+    /*DCPSE_scheme_gpu(particles_type &part, int option_num)
+            : parts(part), p_map(part.getDecomposition(), 0), row(0), row_b(0),opt(options_solver::CUSTOM),offset(option_num) {
+        p_map.resize(part.size_local());
+        construct_pmap(option_num);
+    }*/
+    /*! \brief Impose an operator in the Matrix System
+    *
+    * This function impose an operator on a particular particle region to produce the system
+    *
+    * Ax = b
+    *
+    *
+    * \param op Operator to impose (A term)
+    * \param subset Vector with indices of particles where the operator has to be imposed
+    * \param prp_id<>() Property number in the aggregate (Scalar only) for imposing on the RHS b.
+    * \param id Equation id in the system that we are imposing given by ed_id type
+    *
+    */
+    template<typename T, typename index_type, unsigned int prp_id>
+    void impose(const T &op, openfpm::vector<index_type> &subset,
+                const prop_id<prp_id> &num,
+                eq_id id = eq_id()) {
+        auto itd = subset.template getIteratorElements<0>();
+        variable_b<prp_id> vb(parts);
+        impose_git(op, vb, id.getId(), itd);
+    }
+    /*! \brief Impose b part only in the Matrix System Ax=b
+    *
+    * This function impose RHS of an existing Ax=b system.
+    *
+    *
+    * \param subset Vector with indices of particles where the operator has to be imposed
+    * \param num right hand side of the term (b term) Constant in this case
+    * \param id Equation id in the system that we are imposing given by ed_id type
+    *
+    */
+    template<typename index_type, unsigned int prp_id>
+    void impose_b(openfpm::vector<index_type> &subset,
+                const prop_id<prp_id> &num,
+                eq_id id = eq_id()) {
+        auto itd = subset.template getIteratorElements<0>();
+        variable_b<prp_id> vb(parts);
+        impose_git_b(vb, id.getId(), itd);
+    }
+    /*! \brief Impose an operator in the Matrix System
+     *
+     * This function impose an operator on a particular particle region to produce the system
+     *
+     * Ax = b
+     *
+     *
+     * \param op Operator to impose (A term)
+     * \param subset Vector with indices of particles where the operator has to be imposed
+     * \param RHS Expression of the Vector to be imposed
+     * \param id Equation id in the system that we are imposing given by ed_id type
+     *
+     */
+    template<typename T, typename index_type, typename RHS_type, typename sfinae = typename std::enable_if<!std::is_fundamental<RHS_type>::type::value>::type>
+    void impose(const T &op, openfpm::vector<index_type> &subset,
+                const RHS_type &rhs,
+                eq_id id = eq_id()) {
+        auto itd = subset.template getIteratorElements<0>();
+        impose_git(op, rhs, id.getId(), itd);
+    }
+    /*! \brief Impose b part only in the Matrix System Ax=b
+    *
+    * This function impose RHS of an existing Ax=b system.
+    *
+    *
+    *
+    * \param subset Vector with indices of particles where the operator has to be imposed
+    * \param Expression of the Vector to be imposed
+    * \param id Equation id in the system that we are imposing given by ed_id type
+    *
+    */
+    template<typename index_type, typename RHS_type, typename sfinae = typename std::enable_if<!std::is_fundamental<RHS_type>::type::value>::type>
+    void impose_b(openfpm::vector<index_type> &subset,
+                const RHS_type &rhs,
+                eq_id id = eq_id()) {
+        auto itd = subset.template getIteratorElements<0>();
+        impose_git_b(rhs, id.getId(), itd);
+    }
+    /*! \brief Impose an operator in the Matrix System
+     *
+     * This function impose an operator on a particular particle region to produce the system
+     *
+     * Ax = b
+     *
+     *
+     * \param op Operator to impose (A term)
+     * \param subset Vector with indices of particles where the operator has to be imposed
+     * \param num Constant for all the particles
+     * \param id Equation id in the system that we are imposing given by ed_id type
+     *
+     */
+    template<typename T, typename index_type>
+    void impose(const T &op,
+                openfpm::vector<index_type> &subset,
+                const typename Sys_eqs::stype num,
+                eq_id id = eq_id()) {
+        auto itd = subset.template getIteratorElements<0>();
+        constant_b b(num);
+        impose_git(op, b, id.getId(), itd);
+    }
+    /*! \brief Impose b part only in the Matrix System Ax=b
+    *
+    * This function impose RHS of an existing Ax=b system.
+    *
+    *
+    *
+    * \param subset Vector with indices of particles where the operator has to be imposed
+    * \param num Constant for all the particles
+    * \param id Equation id in the system that we are imposing given by ed_id type
+    *
+    */
+    template< typename index_type>
+    void impose_b(openfpm::vector<index_type> &subset,
+                const typename Sys_eqs::stype num,
+                eq_id id = eq_id()) {
+        auto itd = subset.template getIteratorElements<0>();
+        constant_b b(num);
+        impose_git_b(b, id.getId(), itd);
+    }
+    /*! \brief produce the Matrix
+ *
+ *  \return the Sparse matrix produced
+ *
+ */
+    template<typename options>
+    typename Sys_eqs::SparseMatrix_type &getA(options opt) {
+#ifdef SE_CLASS1
+        consistency();
+        if (opt == options_solver::STANDARD) {
+            A.resize(tot * Sys_eqs::nvar, tot * Sys_eqs::nvar,
+                     p_map.size_local() * Sys_eqs::nvar,
+                     p_map.size_local() * Sys_eqs::nvar);
+        }
+        else if (opt == options_solver::LAGRANGE_MULTIPLIER) {
+            auto &v_cl = create_vcluster();
+            openfpm::vector<triplet> &trpl = A.getMatrixTriplets();
+            if (v_cl.rank() == v_cl.size() - 1) {
+                A.resize(tot * Sys_eqs::nvar + 1, tot * Sys_eqs::nvar + 1,
+                         p_map.size_local() * Sys_eqs::nvar + 1,
+                         p_map.size_local() * Sys_eqs::nvar + 1);
+                for (int i = 0; i < tot * Sys_eqs::nvar; i++) {
+                    triplet t1;
+                    t1.row() = tot * Sys_eqs::nvar;
+                    t1.col() = i;
+                    t1.value() = 1;
+                    trpl.add(t1);
+                }
+                for (int i = 0; i < p_map.size_local() * Sys_eqs::nvar; i++) {
+                    triplet t2;
+                    t2.row() = i + s_pnt * Sys_eqs::nvar;
+                    t2.col() = tot * Sys_eqs::nvar;
+                    t2.value() = 1;
+                    trpl.add(t2);
+                }
+                triplet t3;
+                t3.col() = tot * Sys_eqs::nvar;
+                t3.row() = tot * Sys_eqs::nvar;
+                t3.value() = 0;
+                trpl.add(t3);
+                row_b++;
+                row++;
+            }
+            else {
+                A.resize(tot * Sys_eqs::nvar + 1, tot * Sys_eqs::nvar + 1,
+                         p_map.size_local() * Sys_eqs::nvar,
+                         p_map.size_local() * Sys_eqs::nvar);
+                for (int i = 0; i < p_map.size_local() * Sys_eqs::nvar; i++) {
+                    triplet t2;
+                    t2.row() = i + s_pnt * Sys_eqs::nvar;
+                    t2.col() = tot * Sys_eqs::nvar;
+                    t2.value() = 1;
+                    trpl.add(t2);
+                }
+            }
+        }
+        else{
+            auto &v_cl = create_vcluster();
+            if (v_cl.rank() == v_cl.size() - 1) {
+                A.resize(tot * Sys_eqs::nvar - offset, tot * Sys_eqs::nvar - offset,
+                         p_map.size_local() * Sys_eqs::nvar - offset,
+                         p_map.size_local() * Sys_eqs::nvar - offset);
+            }
+            else {
+                A.resize(tot * Sys_eqs::nvar - offset, tot * Sys_eqs::nvar - offset,
+                         p_map.size_local() * Sys_eqs::nvar,
+                         p_map.size_local() * Sys_eqs::nvar);
+                }
+            }
+        return A;
+    }
+    /*! \brief produce the B vector
+     *
+     *  \return the vector produced
+     *
+     */
+    typename Sys_eqs::Vector_type &getB(options_solver opt = options_solver::STANDARD) {
+#ifdef SE_CLASS1
+        consistency();
+        if (opt == options_solver::LAGRANGE_MULTIPLIER) {
+            auto &v_cl = create_vcluster();
+            if (v_cl.rank() == v_cl.size() - 1) {
+                b(tot * Sys_eqs::nvar) = 0;
+            }
+        }
+        return b;
+    }
+    template<typename bop, typename iterator>
+    void impose_git_b(bop num,
+                      long int id,
+                      const iterator &it_d) {
+        auto it = it_d;
+        // iterate all particles points
+        while (it.isNext()) {
+            // get the particle
+            auto key = it.get();
+            // Calculate the non-zero colums
+            b(p_map.template getProp<0>(key) * Sys_eqs::nvar + id) = num.get(key);
+//       std::cout << "b=(" << p_map.template getProp<0>(key)*Sys_eqs::nvar + id << "," << num.get(key)<<")" <<"\n";
+            // if SE_CLASS1 is defined check the position
+#ifdef SE_CLASS1
+            //			T::position(key,gs,s_pos);
+            ++row_b;
+            ++it;
+        }
+    }
+    /*! \brief Impose an operator
+     *
+     * This function impose an operator on a particular grid region to produce the system
+     *
+     * Ax = b
+     *
+     * ## Stokes equation 2D, lid driven cavity with one splipping wall
+     * \snippet eq_unit_test.hpp Copy the solution to grid
+     *
+     * \param op Operator to impose (A term)
+     * \param num right hand side of the term (b term)
+     * \param id Equation id in the system that we are imposing
+     * \param it_d iterator that define where you want to impose
+     *
+     */
+    template<typename T, typename bop, typename iterator>
+    void impose_git(const T &op,
+                    bop num,
+                    long int id,
+                    const iterator &it_d) {
+        openfpm::vector<triplet> &trpl = A.getMatrixTriplets();
+        auto it = it_d;
+        //std::unordered_map<long int, typename particles_type::stype> cols;
+        tsl::hopscotch_map<long int, typename particles_type::stype> cols;
+        // iterate all particles points
+        while (it.isNext()) {
+            // get the particle
+            auto key = it.get();
+            if (key == 298 && create_vcluster().rank() == 1)
+            {
+            	int debug = 0;
+            	debug++;
+            }
+            // Calculate the non-zero colums
+            typename Sys_eqs::stype coeff = 1.0;
+            op.template value_nz<Sys_eqs>(p_map, key, cols, coeff, 0);
+            // indicate if the diagonal has been set
+            bool is_diag = false;
+            // create the triplet
+            for (auto it = cols.begin(); it != cols.end(); ++it) {
+                trpl.add();
+                trpl.last().row() = p_map.template getProp<0>(key) * Sys_eqs::nvar + id;
+                trpl.last().col() = it->first;
+                trpl.last().value() = it->second;
+                if (trpl.last().row() == trpl.last().col())
+                {is_diag = true;}
+            }
+            // If does not have a diagonal entry put it to zero
+            if (is_diag == false)
+            {
+                trpl.add();
+                trpl.last().row() = p_map.template getProp<0>(key) * Sys_eqs::nvar + id;
+                trpl.last().col() = p_map.template getProp<0>(key) * Sys_eqs::nvar + id;
+                trpl.last().value() = 0.0;
+            }
+            b(p_map.template getProp<0>(key) * Sys_eqs::nvar + id) = num.get(key);
+            cols.clear();
+            // if SE_CLASS1 is defined check the position
+#ifdef SE_CLASS1
+            //			T::position(key,gs,s_pos);
+            ++row;
+            ++row_b;
+            ++it;
+        }
+    }
diff --git a/src/DCPSE/DCPSE_op/DCPSE_Solver.hpp b/src/DCPSE/DCPSE_op/DCPSE_Solver.hpp
index a9143de5..63f80d28 100644
--- a/src/DCPSE/DCPSE_op/DCPSE_Solver.hpp
+++ b/src/DCPSE/DCPSE_op/DCPSE_Solver.hpp
@@ -44,6 +44,9 @@ class DCPSE_scheme {
     //! Vector b
     typename Sys_eqs::Vector_type b;
+    //! Vector x_ig (initial guess)
+    typename Sys_eqs::Vector_type x_ig;
     //! Sparse matrix triplet type
     typedef typename Sys_eqs::SparseMatrix_type::triplet_type triplet;
@@ -74,6 +77,9 @@ class DCPSE_scheme {
     //! row on b
     size_t row_b;
+    //! row on x_ig
+    size_t row_x_ig;
     //! Total number of points
     size_t tot;
@@ -108,19 +114,25 @@ class DCPSE_scheme {
         // resize b if needed
         if (opt == options_solver::STANDARD) {
             b.resize(Sys_eqs::nvar * tot, Sys_eqs::nvar * sz);
+            x_ig.resize(Sys_eqs::nvar * tot, Sys_eqs::nvar * sz);
         } else if (opt == options_solver::LAGRANGE_MULTIPLIER) {
             if (v_cl.rank() == v_cl.size() - 1) {
-                b.resize(Sys_eqs::nvar * tot + 1, Sys_eqs::nvar * sz + 1);
+                b.resize(Sys_eqs::nvar * (tot + 1), Sys_eqs::nvar * (sz + 1));
+                x_ig.resize(Sys_eqs::nvar * (tot + 1), Sys_eqs::nvar * (sz + 1));
             } else {
                 b.resize(Sys_eqs::nvar * tot + 1, Sys_eqs::nvar * sz);
+                x_ig.resize(Sys_eqs::nvar * tot + 1, Sys_eqs::nvar * sz);
             //Use Custom number of constraints using opt as an integer
         else {
             if (v_cl.rank() == v_cl.size() - 1) {
                 b.resize(Sys_eqs::nvar * tot - offset, Sys_eqs::nvar * sz - offset);
+                x_ig.resize(Sys_eqs::nvar * tot - offset, Sys_eqs::nvar * sz - offset);
             } else {
                 b.resize(Sys_eqs::nvar * tot - offset, Sys_eqs::nvar * sz);
+                x_ig.resize(Sys_eqs::nvar * tot - offset, Sys_eqs::nvar * sz);
@@ -211,25 +223,35 @@ class DCPSE_scheme {
     /*! \brief Check if the Matrix is consistent
-    void consistency() {
+    void consistency(options_solver opt)
+    {
         openfpm::vector<triplet> &trpl = A.getMatrixTriplets();
+        Vcluster<> &v_cl = create_vcluster();
         // A and B must have the same rows
         if (row != row_b) {
             std::cerr << "Error " << __FILE__ << ":" << __LINE__
-                      << " the term B and the Matrix A for Ax=B must contain the same number of rows " << row << "!=" << row_b << "\n";
+                      << " the term B and the Matrix A for Ax=B must contain the same number of rows " << row
+                      << "!=" << row_b << "\n";
         if (row_b != p_map.size_local() * Sys_eqs::nvar) {
             std::cerr << "Error " << __FILE__ << ":" << __LINE__ << " your system is underdetermined you set "
                       << row_b << " conditions " << " but i am expecting " << p_map.size_local() * Sys_eqs::nvar
                       << std::endl;
         // Indicate all the non zero rows
         openfpm::vector<unsigned char> nz_rows;
-        nz_rows.resize(row_b);
+        if (v_cl.rank() == v_cl.size()-1 && opt == options_solver::LAGRANGE_MULTIPLIER) {
+            nz_rows.resize(row_b+Sys_eqs::nvar);
+            }
+        else{
+            nz_rows.resize(row_b);
+        };
         for (size_t i = 0; i < trpl.size(); i++) {
             if (trpl.get(i).row() - s_pnt * Sys_eqs::nvar >= nz_rows.size()) {
@@ -242,10 +264,14 @@ class DCPSE_scheme {
         // Indicate all the non zero colums
         // This check can be done only on single processor
-        Vcluster<> &v_cl = create_vcluster();
         if (v_cl.getProcessingUnits() == 1) {
             openfpm::vector<unsigned> nz_cols;
-            nz_cols.resize(row_b);
+            if (v_cl.rank() == v_cl.size()-1 && opt == options_solver::LAGRANGE_MULTIPLIER) {
+                nz_cols.resize(row_b+Sys_eqs::nvar);
+            }
+            else{
+                nz_cols.resize(row_b);
+            };
             for (size_t i = 0; i < trpl.size(); i++) {
                 if (trpl.get(i).value() != 0) { nz_cols.get(trpl.get(i).col()) = true; }
@@ -382,6 +408,78 @@ public:
         copy_nested(x, comp, exps ...);
+    /*! \brief Solve an equation
+ *
+ *  \warning exp must be a scalar type
+ *
+ * \param Solver Manually created Solver instead from the Equation structure
+ * \param exp where to store the result
+ *
+ */
+    template<typename SolverType, typename ... expr_type>
+    void solve_with_solver_ig(SolverType &solver,expr_type ... exps) {
+#ifdef SE_CLASS1
+        if (sizeof...(exps) != Sys_eqs::nvar) {
+            std::cerr << __FILE__ << ":" << __LINE__ << " Error the number of properties you gave does not match the solution in\
+    													dimensionality, I am expecting " << Sys_eqs::nvar <<
+                      " properties " << std::endl;
+        };
+        auto x = solver.solve(getA(opt),get_x_ig(opt),getB(opt));
+        unsigned int comp = 0;
+        copy_nested(x, comp, exps ...);
+    }
+        /*! \brief Successive Solve an equation
+ *
+ *  \warning exp must be a scalar type
+ *
+ * \param Solver Manually created Solver instead from the Equation structure
+ * \param exp where to store the result
+ *
+ */
+    template<typename SolverType, typename ... expr_type>
+    void solve_with_solver_successive(SolverType &solver,expr_type ... exps) {
+#ifdef SE_CLASS1
+        if (sizeof...(exps) != Sys_eqs::nvar) {
+            std::cerr << __FILE__ << ":" << __LINE__ << " Error the number of properties you gave does not match the solution in\
+    													dimensionality, I am expecting " << Sys_eqs::nvar <<
+                      " properties " << std::endl;
+        };
+        auto x = solver.solve_successive(getB(opt));
+        unsigned int comp = 0;
+        copy_nested(x, comp, exps ...);
+    }
+    /*! \brief Successive Solve an equation with inital guess
+ *
+ *  \warning exp must be a scalar type
+ *
+ * \param Solver Manually created Solver instead from the Equation structure
+ * \param exp where to store the result
+ *
+ */
+    template<typename SolverType, typename ... expr_type>
+    void solve_with_solver_ig_successive(SolverType &solver,expr_type ... exps) {
+#ifdef SE_CLASS1
+        if (sizeof...(exps) != Sys_eqs::nvar) {
+            std::cerr << __FILE__ << ":" << __LINE__ << " Error the number of properties you gave does not match the solution in\
+    													dimensionality, I am expecting " << Sys_eqs::nvar <<
+                      " properties " << std::endl;
+        };
+        auto x = solver.solve_successive(get_x_ig(opt),getB(opt));
+        unsigned int comp = 0;
+        copy_nested(x, comp, exps ...);
+    }
     /*! \brief Solve an equation with a given Nullspace
      *  \warning exp must be a scalar type
@@ -424,7 +522,7 @@ public:
                       " properties " << std::endl;
-        auto x = solver.with_constant_nullspace_solve(getA(opt), getB(opt));
+        auto x = solver.with_nullspace_solve(getA(opt), getB(opt));
         unsigned int comp = 0;
         copy_nested(x, comp, exps ...);
@@ -455,13 +553,19 @@ public:
     	row_b = 0;
+    void reset_x_ig()
+    {
+        row_x_ig = 0;
+    }
     void reset(particles_type &part, options_solver opt = options_solver::STANDARD)
     	row = 0;
     	row_b = 0;
+        row_x_ig = 0;
-    	p_map.clear();
+        p_map.clear();
@@ -473,6 +577,7 @@ public:
     	row = 0;
     	row_b = 0;
+        row_x_ig = 0;
@@ -543,6 +648,27 @@ public:
         impose_git_b(vb, id.getId(), itd);
+    /*! \brief Impose x as initial guess for the Matrix System Ax=b
+    *
+    * This function impose an initial guess for the matrix solver Ax=b
+    *
+    *
+    * \param subset Vector with indices of particles where the operator has to be imposed
+    * \param the constant guess num.
+    * \param id Equation id in the system that we are imposing given by ed_id type
+    *
+    */
+    template<typename index_type, unsigned int prp_id>
+    void impose_x_ig(openfpm::vector<index_type> &subset,
+                  const prop_id<prp_id> &num,
+                  eq_id id = eq_id()) {
+        auto itd = subset.template getIteratorElements<0>();
+        variable_b<prp_id> vx(parts);
+        impose_git_x(vx, id.getId(), itd);
+    }
     /*! \brief Impose an operator in the Matrix System
      * This function impose an operator on a particular particle region to produce the system
@@ -582,6 +708,22 @@ public:
         auto itd = subset.template getIteratorElements<0>();
         impose_git_b(rhs, id.getId(), itd);
+    /*! \brief Impose initial guess x in the Matrix System Ax=b
+    *
+    * This function impose initial guess x of an existing Ax=b system.
+    *
+    * \param subset Vector with indices of particles where the operator has to be imposed as a guess
+    * \param num Constant for all the particles
+    * \param id Equation id in the system that we are imposing given by ed_id type
+    *
+    */
+    template<typename index_type, typename RHS_type, typename sfinae = typename std::enable_if<!std::is_fundamental<RHS_type>::type::value>::type>
+    void impose_x_ig(openfpm::vector<index_type> &subset,
+                  const RHS_type &rhs,
+                  eq_id id = eq_id()) {
+        auto itd = subset.template getIteratorElements<0>();
+        impose_git_x(rhs, id.getId(), itd);
+    }
     /*! \brief Impose an operator in the Matrix System
@@ -629,6 +771,28 @@ public:
         impose_git_b(b, id.getId(), itd);
+    /*! \brief Impose initial guess x in the Matrix System Ax=b
+* This function impose RHS of an existing Ax=b system.
+* \param subset Vector with indices of particles where the operator has to be imposed as a guess
+* \param num Constant for all the particles
+* \param id Equation id in the system that we are imposing given by ed_id type
+    template< typename index_type>
+    void impose_x_ig(openfpm::vector<index_type> &subset,
+                  const typename Sys_eqs::stype num,
+                  eq_id id = eq_id()) {
+        auto itd = subset.template getIteratorElements<0>();
+        constant_b x_ig(num);
+        impose_git_x(x_ig, id.getId(), itd);
+    }
     /*! \brief produce the Matrix
  *  \return the Sparse matrix produced
@@ -636,9 +800,6 @@ public:
     template<typename options>
     typename Sys_eqs::SparseMatrix_type &getA(options opt) {
-#ifdef SE_CLASS1
-        consistency();
         if (opt == options_solver::STANDARD) {
             A.resize(tot * Sys_eqs::nvar, tot * Sys_eqs::nvar,
                      p_map.size_local() * Sys_eqs::nvar,
@@ -649,60 +810,45 @@ public:
             openfpm::vector<triplet> &trpl = A.getMatrixTriplets();
             if (v_cl.rank() == v_cl.size() - 1) {
-                A.resize(tot * Sys_eqs::nvar + 1, tot * Sys_eqs::nvar + 1,
-                         p_map.size_local() * Sys_eqs::nvar + 1,
-                         p_map.size_local() * Sys_eqs::nvar + 1);
-                for (int i = 0; i < tot * Sys_eqs::nvar; i++) {
-                    triplet t1;
-                    t1.row() = tot * Sys_eqs::nvar;
-                    t1.col() = i;
-                    t1.value() = 1;
-                    trpl.add(t1);
-                }
-                for (int i = 0; i < p_map.size_local() * Sys_eqs::nvar; i++) {
-                    triplet t2;
-                    t2.row() = i + s_pnt * Sys_eqs::nvar;
-                    t2.col() = tot * Sys_eqs::nvar;
-                    t2.value() = 1;
-                    trpl.add(t2);
+                A.resize(Sys_eqs::nvar * (tot + 1), Sys_eqs::nvar * (tot + 1),
+                         Sys_eqs::nvar * (p_map.size_local() + 1),
+                         Sys_eqs::nvar * (p_map.size_local() + 1));
+                for (int j = 0; j < Sys_eqs::nvar; j++) {
+                    for (int i = 0; i < tot; i++) {
+                        triplet t1;
+                        t1.row() = tot * Sys_eqs::nvar + j;
+                        t1.col() = i * Sys_eqs::nvar + j;
+                        t1.value() = 1;
+                        trpl.add(t1);
+                    }
+                    for (int i = 0; i < p_map.size_local(); i++) {
+                        triplet t2;
+                        t2.row() = s_pnt + i * Sys_eqs::nvar + j;
+                        t2.col() = tot * Sys_eqs::nvar + j;
+                        t2.value() = 1;
+                        trpl.add(t2);
+                    }
+                    triplet t3;
+                    t3.col() = tot * Sys_eqs::nvar + j;
+                    t3.row() = tot * Sys_eqs::nvar + j;
+                    t3.value() = 0;
+                    trpl.add(t3);
-                triplet t3;
-                t3.col() = tot * Sys_eqs::nvar;
-                t3.row() = tot * Sys_eqs::nvar;
-                t3.value() = 0;
-                trpl.add(t3);
-                row_b++;
-                row++;
-            }
-            else {
-                A.resize(tot * Sys_eqs::nvar + 1, tot * Sys_eqs::nvar + 1,
+            } else {
+                A.resize(Sys_eqs::nvar * (tot + 1), Sys_eqs::nvar * (tot + 1),
                          p_map.size_local() * Sys_eqs::nvar,
                          p_map.size_local() * Sys_eqs::nvar);
-                for (int i = 0; i < p_map.size_local() * Sys_eqs::nvar; i++) {
-                    triplet t2;
-                    t2.row() = i + s_pnt * Sys_eqs::nvar;
-                    t2.col() = tot * Sys_eqs::nvar;
-                    t2.value() = 1;
-                    trpl.add(t2);
+                for (int j = 0; j < Sys_eqs::nvar; j++) {
+                    for (int i = 0; i < p_map.size_local(); i++) {
+                        triplet t2;
+                        t2.row() = s_pnt + i * Sys_eqs::nvar + j;
+                        t2.col() = tot * Sys_eqs::nvar + j;
+                        t2.value() = 1;
+                        trpl.add(t2);
+                    }
             auto &v_cl = create_vcluster();
             if (v_cl.rank() == v_cl.size() - 1) {
@@ -714,9 +860,11 @@ public:
                 A.resize(tot * Sys_eqs::nvar - offset, tot * Sys_eqs::nvar - offset,
                          p_map.size_local() * Sys_eqs::nvar,
                          p_map.size_local() * Sys_eqs::nvar);
-                }
+        }
+#ifdef SE_CLASS1
+        consistency(opt);
         return A;
@@ -727,19 +875,38 @@ public:
     typename Sys_eqs::Vector_type &getB(options_solver opt = options_solver::STANDARD) {
-#ifdef SE_CLASS1
-        consistency();
+/*#ifdef SE_CLASS1
+        consistency(opt);
         if (opt == options_solver::LAGRANGE_MULTIPLIER) {
             auto &v_cl = create_vcluster();
             if (v_cl.rank() == v_cl.size() - 1) {
-                b(tot * Sys_eqs::nvar) = 0;
+                for(int j=0;j<Sys_eqs::nvar;j++)
+                {b(tot * Sys_eqs::nvar+j) = 0;}
         return b;
+    /*! \brief produce the B vector
+     *
+     *  \return the vector produced
+     *
+     */
+    typename Sys_eqs::Vector_type &get_x_ig(options_solver opt = options_solver::STANDARD) {
+/*#ifdef SE_CLASS1
+        consistency(opt);
+        if (opt == options_solver::LAGRANGE_MULTIPLIER) {
+            auto &v_cl = create_vcluster();
+            if (v_cl.rank() == v_cl.size() - 1) {
+                for(int j=0;j<Sys_eqs::nvar;j++)
+                    {x_ig(tot * Sys_eqs::nvar+j) = 0;}
+            }
+        }
+        return x_ig;
+    }
     template<typename bop, typename iterator>
     void impose_git_b(bop num,
@@ -763,6 +930,29 @@ public:
+    template<typename xop, typename iterator>
+    void impose_git_x(xop num,
+                      long int id,
+                      const iterator &it_d) {
+        auto it = it_d;
+        // iterate all particles points
+        while (it.isNext()) {
+            // get the particle
+            auto key = it.get();
+            // Calculate the non-zero colums
+            x_ig(p_map.template getProp<0>(key) * Sys_eqs::nvar + id) = num.get(key);
+//       std::cout << "b=(" << p_map.template getProp<0>(key)*Sys_eqs::nvar + id << "," << num.get(key)<<")" <<"\n";
+            // if SE_CLASS1 is defined check the position
+#ifdef SE_CLASS1
+            //			T::position(key,gs,s_pos);
+            ++row_x_ig;
+            ++it;
+        }
+    }
     /*! \brief Impose an operator
      * This function impose an operator on a particular grid region to produce the system
@@ -839,6 +1029,7 @@ public:
+            ++row_x_ig;
diff --git a/src/DCPSE/DCPSE_op/DCPSE_op.hpp b/src/DCPSE/DCPSE_op/DCPSE_op.hpp
index b8a9a5c8..a3e4e596 100644
--- a/src/DCPSE/DCPSE_op/DCPSE_op.hpp
+++ b/src/DCPSE/DCPSE_op/DCPSE_op.hpp
@@ -12,6 +12,10 @@
 #include "Decomposition/CartDecomposition.hpp"
 #include "DCPSE/Dcpse.hpp"
 #include "Operators/Vector/vector_dist_operators.hpp"
+#if defined(__NVCC__)
+#include "DCPSE/Dcpse.cuh"
 const double dcpse_oversampling_factor = 1.9;
 const double rcut_verlet = 3.1;
@@ -646,6 +650,8 @@ public:
 /*! \brief Class for Creating the DCPSE Operator Dx and objects and computes DCPSE Kernels.
@@ -658,7 +664,8 @@ public:
  * \return Operator Dx which is a function on Vector_dist_Expressions
-class Derivative_x {
+template<template<unsigned int, typename, typename...> class Dcpse_type = Dcpse>
+class Derivative_x_T {
     void *dcpse;
@@ -676,51 +683,70 @@ public:
     template<typename particles_type>
-    Derivative_x(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
+    Derivative_x_T(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
                  double oversampling_factor = dcpse_oversampling_factor,
                  support_options opt = support_options::RADIUS) {
         Point<particles_type::dims, unsigned int> p;;
         p.get(0) = 1;
-        dcpse = new Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
+        dcpse = new Dcpse_type<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
     template<typename particles_type>
     void deallocate(particles_type &parts) {
-        delete (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        delete (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
     template<typename operand_type>
-    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    vector_dist_expression_op<operand_type, Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
     operator()(operand_type arg) {
-        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        typedef Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
         return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
     template<unsigned int prp, typename particles_type>
     void DrawKernel(particles_type &particles, int k) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
         dcpse_temp->template DrawKernel<prp>(particles, k);
     template<unsigned int prp, typename particles_type>
     void DrawKernelNN(particles_type &particles, int k) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
         dcpse_temp->template DrawKernelNN<prp>(particles, k);
     template<typename particles_type>
     void checkMomenta(particles_type &particles) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+    /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
     /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
@@ -728,7 +754,7 @@ public:
     template<typename particles_type>
     void update(particles_type &particles) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
@@ -747,7 +773,8 @@ public:
  * \return Operator Dy which is a function on Vector_dist_Expressions
-class Derivative_y {
+template<template<unsigned int, typename, typename...> class Dcpse_type = Dcpse>
+class Derivative_y_T {
     void *dcpse;
@@ -766,50 +793,64 @@ public:
     template<typename particles_type>
-    Derivative_y(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
+    Derivative_y_T(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
                  double oversampling_factor = dcpse_oversampling_factor,
                  support_options opt = support_options::RADIUS) {
         Point<particles_type::dims, unsigned int> p;;
         p.get(1) = 1;
-        dcpse = new Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
-        Dcpse<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse<particles_type::dims, particles_type> *) dcpse;
-        new(dcpse_ptr) Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
-        dcpse_ptr++;
+        dcpse = new Dcpse_type<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
     template<typename particles_type>
     void deallocate(particles_type &parts) {
-        delete (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        delete (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
     template<typename operand_type>
-    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    vector_dist_expression_op<operand_type, Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
     operator()(operand_type arg) {
-        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        typedef Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
         return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
     template<unsigned int prp, typename particles_type>
     void DrawKernel(particles_type &particles, int k) {
-        auto dcpse2 = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse2 = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
         dcpse2->template DrawKernel<prp>(particles, k);
     template<typename particles_type>
     void checkMomenta(particles_type &particles) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+    /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
     /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
@@ -817,7 +858,7 @@ public:
     template<typename particles_type>
     void update(particles_type &particles) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
@@ -835,7 +876,8 @@ public:
  * \return Operator Dz which is a function on Vector_dist_Expressions
-class Derivative_z {
+template<template<unsigned int, typename, typename...> class Dcpse_type = Dcpse>
+class Derivative_z_T {
     void *dcpse;
@@ -853,29 +895,49 @@ public:
     template<typename particles_type>
-    Derivative_z(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
+    Derivative_z_T(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
                  double oversampling_factor = dcpse_oversampling_factor,
                  support_options opt = support_options::RADIUS) {
         Point<particles_type::dims, unsigned int> p;;
         p.get(2) = 1;
-        dcpse = new Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
+        dcpse = new Dcpse_type<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
     template<typename particles_type>
     void deallocate(particles_type &parts) {
-        delete (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        delete (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
     template<typename operand_type>
-    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    vector_dist_expression_op<operand_type, Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
     operator()(operand_type arg) {
-        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        typedef Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
         return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
+    /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
     /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
@@ -883,14 +945,14 @@ public:
     template<typename particles_type>
     void update(particles_type &particles) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
     template<typename particles_type>
     void checkMomenta(particles_type &particles) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
@@ -911,7 +973,8 @@ public:
      * \return Operator Grad which is a function on Vector_dist_Expressions
-class Gradient {
+template<template<unsigned int, typename, typename...> class Dcpse_type = Dcpse>
+class Gradient_T {
     void *dcpse;
@@ -931,35 +994,38 @@ public:
     template<typename particles_type>
-    Gradient(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
+    Gradient_T(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
              double oversampling_factor = dcpse_oversampling_factor,
              support_options opt = support_options::RADIUS) {
-        typedef Dcpse<particles_type::dims, particles_type> DCPSE_type;
+        typedef Dcpse_type<particles_type::dims, particles_type> DCPSE_type;
         dcpse = new unsigned char[particles_type::dims * sizeof(DCPSE_type)];
-        Dcpse<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        Dcpse_type<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
         for (int i = 0; i < particles_type::dims; i++) {
             Point<particles_type::dims, unsigned int> p;
             p.get(i) = 1;
-            new(dcpse_ptr) Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
-            dcpse_ptr++;
+            if (i)
+                new(&dcpse_ptr[i]) Dcpse_type<particles_type::dims, particles_type>(parts, dcpse_ptr[0], p, ord, rCut, oversampling_factor, opt);
+            else
+                new(&dcpse_ptr[i]) Dcpse_type<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
     template<typename particles_type>
     void deallocate(particles_type &parts) {
         for (int i = 0; i < particles_type::dims; i++) {
-            delete &(((Dcpse<particles_type::dims, particles_type> *) dcpse)[i]);
+            delete &(((Dcpse_type<particles_type::dims, particles_type> *) dcpse)[i]);
     template<typename operand_type>
-    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE_V>
+    vector_dist_expression_op<operand_type, Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE_V>
     operator()(operand_type arg) {
-        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        typedef Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
         return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE_V>(arg,
                                                                                  *(dcpse_type(*)[operand_type::vtype::dims]) dcpse);
@@ -967,7 +1033,7 @@ public:
     template<unsigned int prp, typename particles_type>
     void DrawKernel(particles_type &particles, int k) {
-        Dcpse<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        Dcpse_type<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
         for (int i = 0; i < particles_type::dims; i++) {
             dcpse_ptr[i].template DrawKernel<prp>(particles, i, k);
@@ -982,7 +1048,7 @@ public:
     template<typename particles_type>
     void update(particles_type &particles) {
-        Dcpse<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        Dcpse_type<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
         for (int i = 0; i < particles_type::dims; i++) {
@@ -1005,7 +1071,8 @@ public:
      * \return Operator which is a function on Vector_dist_Expressions
-class Curl2D {
+template<template<unsigned int, typename, typename...> class Dcpse_type = Dcpse>
+class Curl2D_T {
     void *dcpse;
@@ -1024,29 +1091,29 @@ public:
     template<typename particles_type>
-    Curl2D(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
+    Curl2D_T(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
            double oversampling_factor = dcpse_oversampling_factor, support_options opt = support_options::RADIUS) {
-        typedef Dcpse<particles_type::dims, particles_type> DCPSE_type;
+        typedef Dcpse_type<particles_type::dims, particles_type> DCPSE_type;
         dcpse = new unsigned char[particles_type::dims * sizeof(DCPSE_type)];
-        Dcpse<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        Dcpse_type<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
         Point<particles_type::dims, unsigned int> p;
         p.get(1) = 1;
-        new(dcpse_ptr) Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
-        dcpse_ptr++;
+        new(dcpse_ptr) Dcpse_type<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
         p.get(0) = 1;
-        new(dcpse_ptr) Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
-        dcpse_ptr++;
+        new(dcpse_ptr+1) Dcpse_type<particles_type::dims, particles_type>(parts, dcpse_ptr[0], p, ord, rCut, oversampling_factor, opt);
     template<typename operand_type>
-    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE_V_CURL2D>
+    vector_dist_expression_op<operand_type, Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE_V_CURL2D>
     operator()(operand_type arg) {
-        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        typedef Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
         return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE_V_CURL2D>(arg,
                                                                                         *(dcpse_type(*)[operand_type::vtype::dims]) dcpse);
@@ -1066,7 +1133,8 @@ public:
      * \return Operator which is a function on Vector_dist_Expressions
-class Laplacian {
+template<template<unsigned int, typename, typename...> class Dcpse_type = Dcpse>
+class Laplacian_T {
     void *dcpse;
@@ -1086,28 +1154,30 @@ public:
     template<typename particles_type>
-    Laplacian(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
+    Laplacian_T(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
               double oversampling_factor = dcpse_oversampling_factor,
               support_options opt = support_options::RADIUS) {
-        typedef Dcpse<particles_type::dims, particles_type> DCPSE_type;
+        typedef Dcpse_type<particles_type::dims, particles_type> DCPSE_type;
         dcpse = new unsigned char[particles_type::dims * sizeof(DCPSE_type)];
-        Dcpse<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        Dcpse_type<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
         for (int i = 0; i < particles_type::dims; i++) {
             Point<particles_type::dims, unsigned int> p;
             p.get(i) = 2;
-            new(dcpse_ptr) Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
-            dcpse_ptr++;
+            if (i)
+                new(&dcpse_ptr[i]) Dcpse_type<particles_type::dims, particles_type>(parts, dcpse_ptr[0], p, ord, rCut, oversampling_factor, opt);
+            else
+                new(&dcpse_ptr[i]) Dcpse_type<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
     template<typename operand_type>
-    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE_V_SUM>
+    vector_dist_expression_op<operand_type, Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE_V_SUM>
     operator()(operand_type arg) {
-        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        typedef Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
         return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE_V_SUM>(arg,
                                                                                      *(dcpse_type(*)[operand_type::vtype::dims]) dcpse);
@@ -1116,7 +1186,7 @@ public:
     template<typename particles_type>
     void checkMomenta(particles_type &particles) {
-        Dcpse<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        Dcpse_type<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
         for (int i = 0; i < particles_type::dims; i++) {
@@ -1126,7 +1196,7 @@ public:
     template<unsigned int prp, typename particles_type>
     void DrawKernel(particles_type &particles, int k) {
-        Dcpse<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        Dcpse_type<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
         for (int i = 0; i < particles_type::dims; i++) {
             dcpse_ptr[i].template DrawKernel<prp>(particles, k);
@@ -1135,7 +1205,7 @@ public:
     template<typename particles_type>
     void deallocate(particles_type &parts) {
-        delete (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        delete (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
     /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
@@ -1144,7 +1214,7 @@ public:
     template<typename particles_type>
     void update(particles_type &particles) {
-        Dcpse<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        Dcpse_type<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
         for (int i = 0; i < particles_type::dims; i++) {
@@ -1168,7 +1238,8 @@ public:
      * \return Operator which is a function on Vector_dist_Expressions. Computes Divergence of Vectors
-class Divergence {
+template<template<unsigned int, typename, typename...> class Dcpse_type = Dcpse>
+class Divergence_T {
     void *dcpse;
@@ -1188,28 +1259,31 @@ public:
     template<typename particles_type>
-    Divergence(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
+    Divergence_T(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
                double oversampling_factor = dcpse_oversampling_factor,
                support_options opt = support_options::RADIUS) {
-        typedef Dcpse<particles_type::dims, particles_type> DCPSE_type;
+        typedef Dcpse_type<particles_type::dims, particles_type> DCPSE_type;
         dcpse = new unsigned char[particles_type::dims * sizeof(DCPSE_type)];
-        Dcpse<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        Dcpse_type<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
         for (int i = 0; i < particles_type::dims; i++) {
             Point<particles_type::dims, unsigned int> p;
             p.get(i) = 1;
-            new(dcpse_ptr) Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
-            dcpse_ptr++;
+            if (i)
+                new(&dcpse_ptr[i]) Dcpse_type<particles_type::dims, particles_type>(parts, dcpse_ptr[0], p, ord, rCut, oversampling_factor, opt);
+            else
+                new(&dcpse_ptr[i]) Dcpse_type<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
     template<typename operand_type>
-    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE_V_DIV>
+    vector_dist_expression_op<operand_type, Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE_V_DIV>
     operator()(operand_type arg) {
-        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        typedef Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
         return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE_V_DIV>(arg,
                                                                                      *(dcpse_type(*)[operand_type::vtype::dims]) dcpse);
@@ -1222,7 +1296,7 @@ public:
     template<typename particles_type>
     void update(particles_type &particles) {
-        Dcpse<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        Dcpse_type<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
         for (int i = 0; i < particles_type::dims; i++) {
@@ -1245,7 +1319,8 @@ public:
      * \return Operator which is a function on Vector_dist_Expressions. Computes Advection of Vectors Adv(v,u) = v.Grad(u)
-class Advection {
+template<template<unsigned int, typename, typename...> class Dcpse_type = Dcpse>
+class Advection_T {
     void *dcpse;
@@ -1265,30 +1340,33 @@ public:
     template<typename particles_type>
-    Advection(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
+    Advection_T(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
               double oversampling_factor = dcpse_oversampling_factor,
               support_options opt = support_options::RADIUS) {
-        typedef Dcpse<particles_type::dims, particles_type> DCPSE_type;
+        typedef Dcpse_type<particles_type::dims, particles_type> DCPSE_type;
         dcpse = new unsigned char[particles_type::dims * sizeof(DCPSE_type)];
-        Dcpse<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        Dcpse_type<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
         for (int i = 0; i < particles_type::dims; i++) {
             Point<particles_type::dims, unsigned int> p;
             p.get(i) = 1;
-            new(dcpse_ptr) Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
-            dcpse_ptr++;
+            if (i)
+                new(&dcpse_ptr[i]) Dcpse_type<particles_type::dims, particles_type>(parts, dcpse_ptr[0], p, ord, rCut, oversampling_factor, opt);
+            else
+                new(&dcpse_ptr[i]) Dcpse_type<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
     template<typename operand_type1, typename operand_type2>
-    vector_dist_expression_op<operand_type1, std::pair<operand_type2, Dcpse<operand_type2::vtype::dims, typename operand_type2::vtype>>, VECT_DCPSE_V_DOT>
+    vector_dist_expression_op<operand_type1, std::pair<operand_type2, Dcpse_type<operand_type2::vtype::dims, typename operand_type2::vtype>>, VECT_DCPSE_V_DOT>
     operator()(operand_type1 arg, operand_type2 arg2) {
-        typedef Dcpse<operand_type2::vtype::dims, typename operand_type2::vtype> dcpse_type;
+        typedef Dcpse_type<operand_type2::vtype::dims, typename operand_type2::vtype> dcpse_type;
         return vector_dist_expression_op<operand_type1, std::pair<operand_type2, dcpse_type>, VECT_DCPSE_V_DOT>(arg,
@@ -1297,7 +1375,7 @@ public:
     template<typename particles_type>
     void checkMomenta(particles_type &particles) {
-        Dcpse<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        Dcpse_type<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
         for (int i = 0; i < particles_type::dims; i++) {
@@ -1307,7 +1385,7 @@ public:
     template<unsigned int prp, typename particles_type>
     void DrawKernel(particles_type &particles, int k) {
-        Dcpse<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        Dcpse_type<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
         for (int i = 0; i < particles_type::dims; i++) {
             dcpse_ptr[i].template DrawKernel<prp>(particles, i, k);
@@ -1322,7 +1400,7 @@ public:
     template<typename particles_type>
     void update(particles_type &particles) {
-        Dcpse<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        Dcpse_type<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
         for (int i = 0; i < particles_type::dims; i++) {
@@ -1344,7 +1422,8 @@ public:
      * \return Operator Dxy which is a function on Vector_dist_Expressions
-class Derivative_xy {
+template<template<unsigned int, typename, typename...> class Dcpse_type = Dcpse>
+class Derivative_xy_T {
     void *dcpse;
@@ -1362,7 +1441,7 @@ public:
     template<typename particles_type>
-    Derivative_xy(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
+    Derivative_xy_T(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
                   double oversampling_factor = dcpse_oversampling_factor,
                   support_options opt = support_options::RADIUS) {
         Point<particles_type::dims, unsigned int> p;
@@ -1370,31 +1449,25 @@ public:
         p.get(0) = 1;
         p.get(1) = 1;
-        dcpse = new Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
-        Dcpse<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse<particles_type::dims, particles_type> *) dcpse;
-        new(dcpse_ptr) Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
-        dcpse_ptr++;
+        dcpse = new Dcpse_type<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
     template<typename particles_type>
     void deallocate(particles_type &parts) {
-        delete (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        delete (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
     template<typename operand_type>
-    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    vector_dist_expression_op<operand_type, Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
     operator()(operand_type arg) {
-        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        typedef Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
         return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
     template<unsigned int prp, typename particles_type>
     void DrawKernel(particles_type &particles, int k) {
-        Dcpse<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        Dcpse_type<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
         dcpse_ptr[0].template DrawKernel<prp>(particles, k);
@@ -1402,11 +1475,31 @@ public:
     template<typename particles_type>
     void checkMomenta(particles_type &particles) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+    /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
     /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
@@ -1414,7 +1507,7 @@ public:
     template<typename particles_type>
     void update(particles_type &particles) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
@@ -1431,7 +1524,8 @@ public:
      * \return Operator Dyz which is a function on Vector_dist_Expressions
-class Derivative_yz {
+template<template<unsigned int, typename, typename...> class Dcpse_type = Dcpse>
+class Derivative_yz_T {
     void *dcpse;
@@ -1449,7 +1543,7 @@ public:
     template<typename particles_type>
-    Derivative_yz(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
+    Derivative_yz_T(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
                   double oversampling_factor = dcpse_oversampling_factor,
                   support_options opt = support_options::RADIUS) {
         Point<particles_type::dims, unsigned int> p;
@@ -1457,31 +1551,25 @@ public:
         p.get(1) = 1;
         p.get(2) = 1;
-        dcpse = new Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
-        Dcpse<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse<particles_type::dims, particles_type> *) dcpse;
-        new(dcpse_ptr) Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
-        dcpse_ptr++;
+        dcpse = new Dcpse_type<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
     template<typename particles_type>
     void deallocate(particles_type &parts) {
-        delete (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        delete (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
     template<typename operand_type>
-    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    vector_dist_expression_op<operand_type, Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
     operator()(operand_type arg) {
-        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        typedef Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
         return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
     template<unsigned int prp, typename particles_type>
     void DrawKernel(particles_type &particles, int k) {
-        Dcpse<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        Dcpse_type<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
         dcpse_ptr[0].template DrawKernel<prp>(particles, k);
@@ -1489,11 +1577,31 @@ public:
     template<typename particles_type>
     void checkMomenta(particles_type &particles) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+    /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
     /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
@@ -1501,7 +1609,7 @@ public:
     template<typename particles_type>
     void update(particles_type &particles) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
@@ -1518,7 +1626,8 @@ public:
      * \return Operator Dxz which is a function on Vector_dist_Expressions
-class Derivative_xz {
+template<template<unsigned int, typename, typename...> class Dcpse_type = Dcpse>
+class Derivative_xz_T {
     void *dcpse;
@@ -1536,7 +1645,7 @@ public:
     template<typename particles_type>
-    Derivative_xz(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
+    Derivative_xz_T(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
                   double oversampling_factor = dcpse_oversampling_factor,
                   support_options opt = support_options::RADIUS) {
         Point<particles_type::dims, unsigned int> p;
@@ -1544,31 +1653,25 @@ public:
         p.get(0) = 1;
         p.get(2) = 1;
-        dcpse = new Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
-        Dcpse<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse<particles_type::dims, particles_type> *) dcpse;
-        new(dcpse_ptr) Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
-        dcpse_ptr++;
+        dcpse = new Dcpse_type<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
     template<typename particles_type>
     void deallocate(particles_type &parts) {
-        delete (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        delete (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
     template<typename operand_type>
-    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    vector_dist_expression_op<operand_type, Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
     operator()(operand_type arg) {
-        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        typedef Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
         return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
     template<unsigned int prp, typename particles_type>
     void DrawKernel(particles_type &particles, int k) {
-        Dcpse<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        Dcpse_type<particles_type::dims, particles_type> *dcpse_ptr = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
         dcpse_ptr[0].template DrawKernel<prp>(particles, k);
@@ -1576,11 +1679,31 @@ public:
     template<typename particles_type>
     void checkMomenta(particles_type &particles) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+    /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
     /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
@@ -1588,7 +1711,7 @@ public:
     template<typename particles_type>
     void update(particles_type &particles) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
@@ -1606,7 +1729,8 @@ public:
      * \return Operator Dxx which is a function on Vector_dist_Expressions
-class Derivative_xx {
+template<template<unsigned int, typename, typename...> class Dcpse_type = Dcpse>
+class Derivative_xx_T {
     void *dcpse;
@@ -1624,7 +1748,7 @@ public:
     template<typename particles_type>
-    Derivative_xx(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
+    Derivative_xx_T(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
                   double oversampling_factor = dcpse_oversampling_factor,
                   support_options opt = support_options::RADIUS) {
         Point<particles_type::dims, unsigned int> p;
@@ -1632,36 +1756,56 @@ public:
         p.get(0) = 2;
         p.get(1) = 0;
-        dcpse = new Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
+        dcpse = new Dcpse_type<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
     template<typename particles_type>
     void deallocate(particles_type &parts) {
-        delete (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        delete (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
     template<typename operand_type>
-    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    vector_dist_expression_op<operand_type, Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
     operator()(operand_type arg) {
-        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        typedef Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
         return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
     template<typename particles_type>
     void checkMomenta(particles_type &particles) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
     template<unsigned int prp, typename particles_type>
     void DrawKernel(particles_type &particles, int k) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
         dcpse_temp->template DrawKernel<prp>(particles, k);
+    /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
     /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
@@ -1669,7 +1813,7 @@ public:
     template<typename particles_type>
     void update(particles_type &particles) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
@@ -1687,7 +1831,8 @@ public:
  * \return Operator Dyy which is a function on Vector_dist_Expressions
-class Derivative_yy {
+template<template<unsigned int, typename, typename...> class Dcpse_type = Dcpse>
+class Derivative_yy_T {
     void *dcpse;
@@ -1705,7 +1850,7 @@ public:
     template<typename particles_type>
-    Derivative_yy(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
+    Derivative_yy_T(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
                   double oversampling_factor = dcpse_oversampling_factor,
                   support_options opt = support_options::RADIUS) {
         Point<particles_type::dims, unsigned int> p;
@@ -1713,36 +1858,56 @@ public:
         p.get(0) = 0;
         p.get(1) = 2;
-        dcpse = new Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
+        dcpse = new Dcpse_type<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
     template<typename particles_type>
     void deallocate(particles_type &parts) {
-        delete (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        delete (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
     template<typename operand_type>
-    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    vector_dist_expression_op<operand_type, Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
     operator()(operand_type arg) {
-        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        typedef Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
         return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
     template<typename particles_type>
     void checkMomenta(particles_type &particles) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
     template<unsigned int prp, typename particles_type>
     void DrawKernel(particles_type &particles, int k) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
         dcpse_temp->template DrawKernel<prp>(particles, k);
+    /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
     /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
@@ -1750,7 +1915,7 @@ public:
     template<typename particles_type>
     void update(particles_type &particles) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
@@ -1767,7 +1932,8 @@ public:
      * \return Operator Dzz which is a function on Vector_dist_Expressions
-class Derivative_zz {
+template<template<unsigned int, typename, typename...> class Dcpse_type = Dcpse>
+class Derivative_zz_T {
     void *dcpse;
@@ -1785,43 +1951,63 @@ public:
     template<typename particles_type>
-    Derivative_zz(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
+    Derivative_zz_T(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
                   double oversampling_factor = dcpse_oversampling_factor,
                   support_options opt = support_options::RADIUS) {
         Point<particles_type::dims, unsigned int> p;;
         p.get(2) = 2;
-        dcpse = new Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
+        dcpse = new Dcpse_type<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
     template<typename particles_type>
     void deallocate(particles_type &parts) {
-        delete (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        delete (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
     template<typename operand_type>
-    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    vector_dist_expression_op<operand_type, Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
     operator()(operand_type arg) {
-        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        typedef Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
         return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
     template<typename particles_type>
     void checkMomenta(particles_type &particles) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
     template<unsigned int prp, typename particles_type>
     void DrawKernel(particles_type &particles, int k) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
         dcpse_temp->template DrawKernel<prp>(particles, k);
+    /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
     /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
@@ -1829,21 +2015,21 @@ public:
     template<typename particles_type>
     void update(particles_type &particles) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
-class Derivative_xxx {
+template<template<unsigned int, typename, typename...> class Dcpse_type = Dcpse>
+class Derivative_xxx_T {
     void *dcpse;
     template<typename particles_type>
-    Derivative_xxx(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
+    Derivative_xxx_T(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
                    double oversampling_factor = dcpse_oversampling_factor,
                    support_options opt = support_options::RADIUS) {
         Point<particles_type::dims, unsigned int> p;
@@ -1851,32 +2037,52 @@ public:
         p.get(0) = 3;
         p.get(1) = 0;
-        dcpse = new Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
+        dcpse = new Dcpse_type<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
     template<typename operand_type>
-    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    vector_dist_expression_op<operand_type, Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
     operator()(operand_type arg) {
-        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        typedef Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
         return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
     template<typename particles_type>
     void checkMomenta(particles_type &particles) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
     template<unsigned int prp, typename particles_type>
     void DrawKernel(particles_type &particles, int k) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
         dcpse_temp->template DrawKernel<prp>(particles, k);
+    /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
     /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
@@ -1884,21 +2090,21 @@ public:
     template<typename particles_type>
     void update(particles_type &particles) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
-class Derivative_xxy {
+template<template<unsigned int, typename, typename...> class Dcpse_type = Dcpse>
+class Derivative_xxy_T {
     void *dcpse;
     template<typename particles_type>
-    Derivative_xxy(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
+    Derivative_xxy_T(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
                    double oversampling_factor = dcpse_oversampling_factor,
                    support_options opt = support_options::RADIUS) {
         Point<particles_type::dims, unsigned int> p;
@@ -1906,32 +2112,52 @@ public:
         p.get(0) = 2;
         p.get(1) = 1;
-        dcpse = new Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
+        dcpse = new Dcpse_type<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
     template<typename operand_type>
-    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    vector_dist_expression_op<operand_type, Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
     operator()(operand_type arg) {
-        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        typedef Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
         return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
     template<typename particles_type>
     void checkMomenta(particles_type &particles) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
     template<unsigned int prp, typename particles_type>
     void DrawKernel(particles_type &particles, int k) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
         dcpse_temp->template DrawKernel<prp>(particles, k);
+    /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
     /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
@@ -1939,21 +2165,21 @@ public:
     template<typename particles_type>
     void update(particles_type &particles) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
-class Derivative_yyx {
+template<template<unsigned int, typename, typename...> class Dcpse_type = Dcpse>
+class Derivative_yyx_T {
     void *dcpse;
     template<typename particles_type>
-    Derivative_yyx(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
+    Derivative_yyx_T(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
                    double oversampling_factor = dcpse_oversampling_factor,
                    support_options opt = support_options::RADIUS) {
         Point<particles_type::dims, unsigned int> p;
@@ -1961,32 +2187,52 @@ public:
         p.get(0) = 1;
         p.get(1) = 2;
-        dcpse = new Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
+        dcpse = new Dcpse_type<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
     template<typename operand_type>
-    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    vector_dist_expression_op<operand_type, Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
     operator()(operand_type arg) {
-        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        typedef Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
         return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
     template<typename particles_type>
     void checkMomenta(particles_type &particles) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
     template<unsigned int prp, typename particles_type>
     void DrawKernel(particles_type &particles, int k) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
         dcpse_temp->template DrawKernel<prp>(particles, k);
+    /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
     /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
@@ -1994,21 +2240,21 @@ public:
     template<typename particles_type>
     void update(particles_type &particles) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
-class Derivative_yyy {
+template<template<unsigned int, typename, typename...> class Dcpse_type = Dcpse>
+class Derivative_yyy_T {
     void *dcpse;
     template<typename particles_type>
-    Derivative_yyy(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
+    Derivative_yyy_T(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
                    double oversampling_factor = dcpse_oversampling_factor,
                    support_options opt = support_options::RADIUS) {
         Point<particles_type::dims, unsigned int> p;
@@ -2016,32 +2262,52 @@ public:
         p.get(0) = 0;
         p.get(1) = 3;
-        dcpse = new Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
+        dcpse = new Dcpse_type<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
     template<typename operand_type>
-    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    vector_dist_expression_op<operand_type, Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
     operator()(operand_type arg) {
-        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        typedef Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
         return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
     template<typename particles_type>
     void checkMomenta(particles_type &particles) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
     template<unsigned int prp, typename particles_type>
     void DrawKernel(particles_type &particles, int k) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
         dcpse_temp->template DrawKernel<prp>(particles, k);
+    /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
     /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
@@ -2049,20 +2315,355 @@ public:
     template<typename particles_type>
     void update(particles_type &particles) {
-        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+template<template<unsigned int, typename, typename...> class Dcpse_type = Dcpse>
+class Derivative_xxxx_T {
+    void *dcpse;
+    template<typename particles_type>
+    Derivative_xxxx_T(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
+                   double oversampling_factor = dcpse_oversampling_factor,
+                   support_options opt = support_options::RADIUS) {
+        Point<particles_type::dims, unsigned int> p;
+        p.get(0) = 4;
+        p.get(1) = 0;
+        dcpse = new Dcpse_type<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
+    }
+    template<typename operand_type>
+    vector_dist_expression_op<operand_type, Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    operator()(operand_type arg) {
+        typedef Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
+    }
+    template<typename particles_type>
+    void checkMomenta(particles_type &particles) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->checkMomenta(particles);
+    }
+    template<unsigned int prp, typename particles_type>
+    void DrawKernel(particles_type &particles, int k) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->template DrawKernel<prp>(particles, k);
+    }
+    /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
+    /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     */
+    template<typename particles_type>
+    void update(particles_type &particles) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->initializeUpdate(particles);
+    }
+template<template<unsigned int, typename, typename...> class Dcpse_type = Dcpse>
+class Derivative_yyyy_T {
+    void *dcpse;
+    template<typename particles_type>
+    Derivative_yyyy_T(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
+                   double oversampling_factor = dcpse_oversampling_factor,
+                   support_options opt = support_options::RADIUS) {
+        Point<particles_type::dims, unsigned int> p;
+        p.get(0) = 0;
+        p.get(1) = 4;
+        dcpse = new Dcpse_type<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
+    }
+    template<typename operand_type>
+    vector_dist_expression_op<operand_type, Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    operator()(operand_type arg) {
+        typedef Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
+    }
+    template<typename particles_type>
+    void checkMomenta(particles_type &particles) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->checkMomenta(particles);
+    }
+    template<unsigned int prp, typename particles_type>
+    void DrawKernel(particles_type &particles, int k) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->template DrawKernel<prp>(particles, k);
+    }
+    /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
+    /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     */
+    template<typename particles_type>
+    void update(particles_type &particles) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->initializeUpdate(particles);
+    }
+template<template<unsigned int, typename, typename...> class Dcpse_type = Dcpse>
+class Derivative_xxyy_T {
+    void *dcpse;
+    template<typename particles_type>
+    Derivative_xxyy_T(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
+                   double oversampling_factor = dcpse_oversampling_factor,
+                   support_options opt = support_options::RADIUS) {
+        Point<particles_type::dims, unsigned int> p;
+        p.get(0) = 2;
+        p.get(1) = 2;
+        dcpse = new Dcpse_type<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
+    }
+    template<typename operand_type>
+    vector_dist_expression_op<operand_type, Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    operator()(operand_type arg) {
+        typedef Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
+    }
+    template<typename particles_type>
+    void checkMomenta(particles_type &particles) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->checkMomenta(particles);
+    }
+    template<unsigned int prp, typename particles_type>
+    void DrawKernel(particles_type &particles, int k) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->template DrawKernel<prp>(particles, k);
+    }
+    /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
+    /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     */
+    template<typename particles_type>
+    void update(particles_type &particles) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->initializeUpdate(particles);
+    }
+template<template<unsigned int, typename, typename...> class Dcpse_type = Dcpse>
+class Derivative_G_T {
+    void *dcpse;
+    template<typename particles_type>
+    Derivative_G_T(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,
+                   const Point<particles_type::dims, unsigned int> &p,double oversampling_factor = dcpse_oversampling_factor,
+                   support_options opt = support_options::RADIUS) {
+        dcpse = new Dcpse_type<particles_type::dims, particles_type>(parts, p, ord, rCut, oversampling_factor, opt);
+    }
+    template<typename operand_type>
+    vector_dist_expression_op<operand_type, Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    operator()(operand_type arg) {
+        typedef Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
+    }
+    template<typename particles_type>
+    void checkMomenta(particles_type &particles) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->checkMomenta(particles);
+    }
+    template<unsigned int prp, typename particles_type>
+    void DrawKernel(particles_type &particles, int k) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->template DrawKernel<prp>(particles, k);
+    }
+    /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
+    /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     */
+    template<typename particles_type>
+    void update(particles_type &particles) {
+        auto dcpse_temp = (Dcpse_type<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->initializeUpdate(particles);
+    }
-//template<typename operand_type1, typename operand_type2/*, typename sfinae=typename std::enable_if<
-//																						std::is_same<typename operand_type1::it_is_a_node,int>::value
-//																						>::type*/ >
-//plus<operand_type1,operand_type2> operator+(const operand_type1 & op1, const operand_type2 & op2)
-//	return plus<operand_type1,operand_type2>(op1,op2);
-#endif /* Eigen */
+//typedef PPInterpolation_T<Dcpse> PPInterpolation;
+typedef Derivative_x_T<Dcpse> Derivative_x;
+typedef Derivative_y_T<Dcpse> Derivative_y;
+typedef Derivative_z_T<Dcpse> Derivative_z;
+typedef Gradient_T<Dcpse> Gradient;
+typedef Curl2D_T<Dcpse> Curl2D;
+typedef Laplacian_T<Dcpse> Laplacian;
+typedef Divergence_T<Dcpse> Divergence;
+typedef Advection_T<Dcpse> Advection;
+typedef Derivative_xy_T<Dcpse> Derivative_xy;
+typedef Derivative_yz_T<Dcpse> Derivative_yz;
+typedef Derivative_xz_T<Dcpse> Derivative_xz;
+typedef Derivative_xx_T<Dcpse> Derivative_xx;
+typedef Derivative_yy_T<Dcpse> Derivative_yy;
+typedef Derivative_zz_T<Dcpse> Derivative_zz;
+typedef Derivative_xxx_T<Dcpse> Derivative_xxx;
+typedef Derivative_xxy_T<Dcpse> Derivative_xxy;
+typedef Derivative_yyx_T<Dcpse> Derivative_yyx;
+typedef Derivative_yyy_T<Dcpse> Derivative_yyy;
+typedef Derivative_xxxx_T<Dcpse> Derivative_xxxx;
+typedef Derivative_yyyy_T<Dcpse> Derivative_yyyy;
+typedef Derivative_xxyy_T<Dcpse> Derivative_xxyy;
+typedef Derivative_G_T<Dcpse> Derivative_G;
+#if defined(__NVCC__)
+typedef Derivative_x_T<Dcpse_gpu> Derivative_x_gpu;
+typedef Derivative_y_T<Dcpse_gpu> Derivative_y_gpu;
+typedef Derivative_z_T<Dcpse_gpu> Derivative_z_gpu;
+typedef Gradient_T<Dcpse_gpu> Gradient_gpu;
+typedef Curl2D_T<Dcpse_gpu> Curl2D_gpu;
+typedef Laplacian_T<Dcpse_gpu> Laplacian_gpu;
+typedef Divergence_T<Dcpse_gpu> Divergence_gpu;
+typedef Advection_T<Dcpse_gpu> Advection_gpu;
+typedef Derivative_xy_T<Dcpse_gpu> Derivative_xy_gpu;
+typedef Derivative_yz_T<Dcpse_gpu> Derivative_yz_gpu;
+typedef Derivative_xz_T<Dcpse_gpu> Derivative_xz_gpu;
+typedef Derivative_xx_T<Dcpse_gpu> Derivative_xx_gpu;
+typedef Derivative_yy_T<Dcpse_gpu> Derivative_yy_gpu;
+typedef Derivative_zz_T<Dcpse_gpu> Derivative_zz_gpu;
+typedef Derivative_xxx_T<Dcpse_gpu> Derivative_xxx_gpu;
+typedef Derivative_xxy_T<Dcpse_gpu> Derivative_xxy_gpu;
+typedef Derivative_yyx_T<Dcpse_gpu> Derivative_yyx_gpu;
+typedef Derivative_yyy_T<Dcpse_gpu> Derivative_yyy_gpu;
+typedef Derivative_G_T<Dcpse_gpu> Derivative_G_gpu;
+#endif /*EIGEN */
 #endif /* DCPSE_OP_HPP_ */
diff --git a/src/DCPSE/DCPSE_op/DCPSE_surface_op.hpp b/src/DCPSE/DCPSE_op/DCPSE_surface_op.hpp
new file mode 100644
index 00000000..c2a11d4e
--- /dev/null
+++ b/src/DCPSE/DCPSE_op/DCPSE_surface_op.hpp
@@ -0,0 +1,1008 @@
+// Created by Abhinav Singh on 15.11.21.
+#ifdef HAVE_EIGEN
+#include "DCPSE/DCPSE_op/DCPSE_op.hpp"
+template<unsigned int NORMAL_ID>
+class SurfaceDerivative_x {
+    void *dcpse;
+    /*! \brief Class for Creating the DCPSE Operator Dxx and objects and computs DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     * \param ord order of convergence of the operator
+     * \param rCut Argument for cell list construction
+     * \param oversampling_factor multiplier to the minimum no. of particles required by the operator in support
+     * \param support_options default:N_particles, Radius can be used to select all particles inside rCut. Overrides oversampling.
+     *
+     * \return Operator Dxx which is a function on Vector_dist_Expressions
+     *
+     */
+    template<typename particles_type>
+    SurfaceDerivative_x(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,typename particles_type::stype nSpacing,
+                    support_options opt = support_options::RADIUS) {
+        Point<particles_type::dims, unsigned int> p;
+        p.get(0) = 1;
+        dcpse = new Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut,nSpacing,value_t<NORMAL_ID>(), opt);
+    }
+    template<typename particles_type>
+    void deallocate(particles_type &parts) {
+        delete (Dcpse<particles_type::dims, particles_type> *) dcpse;
+    }
+    template<typename operand_type>
+    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    operator()(operand_type arg) {
+        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
+    }
+    template<typename particles_type>
+    void checkMomenta(particles_type &particles) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->checkMomenta(particles);
+    }
+    template<unsigned int prp, typename particles_type>
+    void DrawKernel(particles_type &particles, int k) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->template DrawKernel<prp>(particles, k);
+    }
+        /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
+    /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     */
+    template<typename particles_type>
+    void update(particles_type &particles) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->template createNormalParticles<NORMAL_ID>(particles);
+        dcpse_temp->initializeUpdate(particles);
+        dcpse_temp->accumulateAndDeleteNormalParticles(particles);
+    }
+template<unsigned int NORMAL_ID>
+class SurfaceDerivative_y {
+    void *dcpse;
+    /*! \brief Class for Creating the DCPSE Operator Dxx and objects and computs DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     * \param ord order of convergence of the operator
+     * \param rCut Argument for cell list construction
+     * \param oversampling_factor multiplier to the minimum no. of particles required by the operator in support
+     * \param support_options default:N_particles, Radius can be used to select all particles inside rCut. Overrides oversampling.
+     *
+     * \return Operator Dxx which is a function on Vector_dist_Expressions
+     *
+     */
+    template<typename particles_type>
+    SurfaceDerivative_y(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,typename particles_type::stype nSpacing,
+                    support_options opt = support_options::RADIUS) {
+        Point<particles_type::dims, unsigned int> p;
+        p.get(1) = 1;
+        dcpse = new Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut,nSpacing,value_t<NORMAL_ID>(), opt);
+    }
+    template<typename particles_type>
+    void deallocate(particles_type &parts) {
+        delete (Dcpse<particles_type::dims, particles_type> *) dcpse;
+    }
+    template<typename operand_type>
+    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    operator()(operand_type arg) {
+        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
+    }
+    template<typename particles_type>
+    void checkMomenta(particles_type &particles) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->checkMomenta(particles);
+    }
+    template<unsigned int prp, typename particles_type>
+    void DrawKernel(particles_type &particles, int k) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->template DrawKernel<prp>(particles, k);
+    }
+    /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
+    /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     */
+    template<typename particles_type>
+    void update(particles_type &particles) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->template createNormalParticles<NORMAL_ID>(particles);
+        dcpse_temp->initializeUpdate(particles);
+        dcpse_temp->accumulateAndDeleteNormalParticles(particles);
+    }
+template<unsigned int NORMAL_ID>
+class SurfaceDerivative_z {
+    void *dcpse;
+    /*! \brief Class for Creating the DCPSE Operator Dxx and objects and computs DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     * \param ord order of convergence of the operator
+     * \param rCut Argument for cell list construction
+     * \param oversampling_factor multiplier to the minimum no. of particles required by the operator in support
+     * \param support_options default:N_particles, Radius can be used to select all particles inside rCut. Overrides oversampling.
+     *
+     * \return Operator Dxx which is a function on Vector_dist_Expressions
+     *
+     */
+    template<typename particles_type>
+    SurfaceDerivative_z(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,typename particles_type::stype nSpacing,
+                    support_options opt = support_options::RADIUS) {
+        Point<particles_type::dims, unsigned int> p;
+        p.get(2) = 1;
+        dcpse = new Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut,nSpacing,value_t<NORMAL_ID>(), opt);
+    }
+    template<typename particles_type>
+    void deallocate(particles_type &parts) {
+        delete (Dcpse<particles_type::dims, particles_type> *) dcpse;
+    }
+    template<typename operand_type>
+    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    operator()(operand_type arg) {
+        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
+    }
+    template<typename particles_type>
+    void checkMomenta(particles_type &particles) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->checkMomenta(particles);
+    }
+    template<unsigned int prp, typename particles_type>
+    void DrawKernel(particles_type &particles, int k) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->template DrawKernel<prp>(particles, k);
+    }
+    /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
+    /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     */
+    template<typename particles_type>
+    void update(particles_type &particles) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->template createNormalParticles<NORMAL_ID>(particles);
+        dcpse_temp->initializeUpdate(particles);
+        dcpse_temp->accumulateAndDeleteNormalParticles(particles);
+    }
+template<unsigned int NORMAL_ID>
+class Laplace_Beltrami {
+    void *dcpse;
+    /*! \brief Class for Creating the DCPSE Operator Dxx and objects and computs DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     * \param ord order of convergence of the operator
+     * \param rCut Argument for cell list construction
+     * \param oversampling_factor multiplier to the minimum no. of particles required by the operator in support
+     * \param support_options default:N_particles, Radius can be used to select all particles inside rCut. Overrides oversampling.
+     *
+     * \return Operator Dxx which is a function on Vector_dist_Expressions
+     *
+     */
+    template<typename particles_type>
+    Laplace_Beltrami(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,typename particles_type::stype nSpacing,
+                    support_options opt = support_options::RADIUS) {
+        Point<particles_type::dims, unsigned int> p;
+        p.get(0) = 2;
+        p.get(1) = 2;
+        p.get(2) = 2;
+        dcpse = new Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut,nSpacing,value_t<NORMAL_ID>(), opt);
+    }
+    template<typename particles_type>
+    void deallocate(particles_type &parts) {
+        delete (Dcpse<particles_type::dims, particles_type> *) dcpse;
+    }
+    template<typename operand_type>
+    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    operator()(operand_type arg) {
+        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
+    }
+    template<typename particles_type>
+    void checkMomenta(particles_type &particles) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->checkMomenta(particles);
+    }
+    template<unsigned int prp, typename particles_type>
+    void DrawKernel(particles_type &particles, int k) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->template DrawKernel<prp>(particles, k);
+    }
+    /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
+    /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     */
+    template<typename particles_type>
+    void update(particles_type &particles) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->template createNormalParticles<NORMAL_ID>(particles);
+        dcpse_temp->template createNormalParticles<NORMAL_ID>(particles);
+        dcpse_temp->initializeUpdate(particles);
+        dcpse_temp->accumulateAndDeleteNormalParticles(particles);
+    }
+template<unsigned int NORMAL_ID>
+class SurfaceDerivative_xx {
+    void *dcpse;
+    /*! \brief Class for Creating the DCPSE Operator Dxx and objects and computs DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     * \param ord order of convergence of the operator
+     * \param rCut Argument for cell list construction
+     * \param oversampling_factor multiplier to the minimum no. of particles required by the operator in support
+     * \param support_options default:N_particles, Radius can be used to select all particles inside rCut. Overrides oversampling.
+     *
+     * \return Operator Dxx which is a function on Vector_dist_Expressions
+     *
+     */
+    template<typename particles_type>
+    SurfaceDerivative_xx(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,typename particles_type::stype nSpacing,
+                    support_options opt = support_options::RADIUS) {
+        Point<particles_type::dims, unsigned int> p;
+        p.get(0) = 2;
+        dcpse = new Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut,nSpacing,value_t<NORMAL_ID>(), opt);
+    }
+    template<typename particles_type>
+    void deallocate(particles_type &parts) {
+        delete (Dcpse<particles_type::dims, particles_type> *) dcpse;
+    }
+    template<typename operand_type>
+    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    operator()(operand_type arg) {
+        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
+    }
+    template<typename particles_type>
+    void checkMomenta(particles_type &particles) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->checkMomenta(particles);
+    }
+    template<unsigned int prp, typename particles_type>
+    void DrawKernel(particles_type &particles, int k) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->template DrawKernel<prp>(particles, k);
+    }
+    /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
+    /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     */
+    template<typename particles_type>
+    void update(particles_type &particles) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->template createNormalParticles<NORMAL_ID>(particles);
+        dcpse_temp->template createNormalParticles<NORMAL_ID>(particles);
+        dcpse_temp->initializeUpdate(particles);
+        dcpse_temp->accumulateAndDeleteNormalParticles(particles);
+    }
+template<unsigned int NORMAL_ID>
+class SurfaceDerivative_yy {
+    void *dcpse;
+    /*! \brief Class for Creating the DCPSE Operator Dxx and objects and computs DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     * \param ord order of convergence of the operator
+     * \param rCut Argument for cell list construction
+     * \param oversampling_factor multiplier to the minimum no. of particles required by the operator in support
+     * \param support_options default:N_particles, Radius can be used to select all particles inside rCut. Overrides oversampling.
+     *
+     * \return Operator Dxx which is a function on Vector_dist_Expressions
+     *
+     */
+    template<typename particles_type>
+    SurfaceDerivative_yy(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,typename particles_type::stype nSpacing,
+                    support_options opt = support_options::RADIUS) {
+        Point<particles_type::dims, unsigned int> p;
+        p.get(1) = 2;
+        dcpse = new Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut,nSpacing,value_t<NORMAL_ID>(), opt);
+    }
+    template<typename particles_type>
+    void deallocate(particles_type &parts) {
+        delete (Dcpse<particles_type::dims, particles_type> *) dcpse;
+    }
+    template<typename operand_type>
+    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    operator()(operand_type arg) {
+        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
+    }
+    template<typename particles_type>
+    void checkMomenta(particles_type &particles) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->checkMomenta(particles);
+    }
+    template<unsigned int prp, typename particles_type>
+    void DrawKernel(particles_type &particles, int k) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->template DrawKernel<prp>(particles, k);
+    }
+    /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
+    /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     */
+    template<typename particles_type>
+    void update(particles_type &particles) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->template createNormalParticles<NORMAL_ID>(particles);
+        dcpse_temp->initializeUpdate(particles);
+        dcpse_temp->accumulateAndDeleteNormalParticles(particles);
+    }
+template<unsigned int NORMAL_ID>
+class SurfaceDerivative_zz {
+    void *dcpse;
+    /*! \brief Class for Creating the DCPSE Operator Dxx and objects and computs DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     * \param ord order of convergence of the operator
+     * \param rCut Argument for cell list construction
+     * \param oversampling_factor multiplier to the minimum no. of particles required by the operator in support
+     * \param support_options default:N_particles, Radius can be used to select all particles inside rCut. Overrides oversampling.
+     *
+     * \return Operator Dxx which is a function on Vector_dist_Expressions
+     *
+     */
+    template<typename particles_type>
+    SurfaceDerivative_zz(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,typename particles_type::stype nSpacing,
+                    support_options opt = support_options::RADIUS) {
+        Point<particles_type::dims, unsigned int> p;
+        p.get(2) = 2;
+        dcpse = new Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut,nSpacing,value_t<NORMAL_ID>(), opt);
+    }
+    template<typename particles_type>
+    void deallocate(particles_type &parts) {
+        delete (Dcpse<particles_type::dims, particles_type> *) dcpse;
+    }
+    template<typename operand_type>
+    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    operator()(operand_type arg) {
+        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
+    }
+    template<typename particles_type>
+    void checkMomenta(particles_type &particles) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->checkMomenta(particles);
+    }
+    template<unsigned int prp, typename particles_type>
+    void DrawKernel(particles_type &particles, int k) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->template DrawKernel<prp>(particles, k);
+    }
+    /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
+    /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     */
+    template<typename particles_type>
+    void update(particles_type &particles) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->template createNormalParticles<NORMAL_ID>(particles);
+        particles.write("With Normal");
+        dcpse_temp->initializeUpdate(particles);
+        dcpse_temp->accumulateAndDeleteNormalParticles(particles);
+    }
+template<unsigned int NORMAL_ID>
+class SurfaceDerivative_xy {
+    void *dcpse;
+    /*! \brief Class for Creating the DCPSE Operator Dxx and objects and computs DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     * \param ord order of convergence of the operator
+     * \param rCut Argument for cell list construction
+     * \param oversampling_factor multiplier to the minimum no. of particles required by the operator in support
+     * \param support_options default:N_particles, Radius can be used to select all particles inside rCut. Overrides oversampling.
+     *
+     * \return Operator Dxx which is a function on Vector_dist_Expressions
+     *
+     */
+    template<typename particles_type>
+    SurfaceDerivative_xy(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,typename particles_type::stype nSpacing,
+                    support_options opt = support_options::RADIUS) {
+        Point<particles_type::dims, unsigned int> p;
+        p.get(0) = 1;
+        p.get(1) = 1;
+        dcpse = new Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut,nSpacing,value_t<NORMAL_ID>(), opt);
+    }
+    template<typename particles_type>
+    void deallocate(particles_type &parts) {
+        delete (Dcpse<particles_type::dims, particles_type> *) dcpse;
+    }
+    template<typename operand_type>
+    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    operator()(operand_type arg) {
+        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
+    }
+    template<typename particles_type>
+    void checkMomenta(particles_type &particles) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->checkMomenta(particles);
+    }
+    template<unsigned int prp, typename particles_type>
+    void DrawKernel(particles_type &particles, int k) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->template DrawKernel<prp>(particles, k);
+    }
+    /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
+    /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     */
+    template<typename particles_type>
+    void update(particles_type &particles) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->template createNormalParticles<NORMAL_ID>(particles);
+        dcpse_temp->initializeUpdate(particles);
+        dcpse_temp->accumulateAndDeleteNormalParticles(particles);
+    }
+template<unsigned int NORMAL_ID>
+class SurfaceDerivative_yz {
+    void *dcpse;
+    /*! \brief Class for Creating the DCPSE Operator Dxx and objects and computs DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     * \param ord order of convergence of the operator
+     * \param rCut Argument for cell list construction
+     * \param oversampling_factor multiplier to the minimum no. of particles required by the operator in support
+     * \param support_options default:N_particles, Radius can be used to select all particles inside rCut. Overrides oversampling.
+     *
+     * \return Operator Dxx which is a function on Vector_dist_Expressions
+     *
+     */
+    template<typename particles_type>
+    SurfaceDerivative_yz(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,typename particles_type::stype nSpacing,
+                    support_options opt = support_options::RADIUS) {
+        Point<particles_type::dims, unsigned int> p;
+        p.get(1) = 1;
+        p.get(2) = 1;
+        dcpse = new Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut,nSpacing,value_t<NORMAL_ID>(), opt);
+    }
+    template<typename particles_type>
+    void deallocate(particles_type &parts) {
+        delete (Dcpse<particles_type::dims, particles_type> *) dcpse;
+    }
+    template<typename operand_type>
+    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    operator()(operand_type arg) {
+        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
+    }
+    template<typename particles_type>
+    void checkMomenta(particles_type &particles) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->checkMomenta(particles);
+    }
+    template<unsigned int prp, typename particles_type>
+    void DrawKernel(particles_type &particles, int k) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->template DrawKernel<prp>(particles, k);
+    }
+    /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
+    /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     */
+    template<typename particles_type>
+    void update(particles_type &particles) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->template createNormalParticles<NORMAL_ID>(particles);
+        dcpse_temp->initializeUpdate(particles);
+        dcpse_temp->accumulateAndDeleteNormalParticles(particles);
+    }
+template<unsigned int NORMAL_ID>
+class SurfaceDerivative_xz {
+    void *dcpse;
+    /*! \brief Class for Creating the DCPSE Operator Dxx and objects and computs DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     * \param ord order of convergence of the operator
+     * \param rCut Argument for cell list construction
+     * \param oversampling_factor multiplier to the minimum no. of particles required by the operator in support
+     * \param support_options default:N_particles, Radius can be used to select all particles inside rCut. Overrides oversampling.
+     *
+     * \return Operator Dxx which is a function on Vector_dist_Expressions
+     *
+     */
+    template<typename particles_type>
+    SurfaceDerivative_xz(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,typename particles_type::stype nSpacing,
+                    support_options opt = support_options::RADIUS) {
+        Point<particles_type::dims, unsigned int> p;
+        p.get(0) = 1;
+        p.get(2) = 1;
+        dcpse = new Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut,nSpacing,value_t<NORMAL_ID>(), opt);
+    }
+    template<typename particles_type>
+    void deallocate(particles_type &parts) {
+        delete (Dcpse<particles_type::dims, particles_type> *) dcpse;
+    }
+    template<typename operand_type>
+    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    operator()(operand_type arg) {
+        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
+    }
+    template<typename particles_type>
+    void checkMomenta(particles_type &particles) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->checkMomenta(particles);
+    }
+    template<unsigned int prp, typename particles_type>
+    void DrawKernel(particles_type &particles, int k) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->template DrawKernel<prp>(particles, k);
+    }
+    /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
+    /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     */
+    template<typename particles_type>
+    void update(particles_type &particles) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->template createNormalParticles<NORMAL_ID>(particles);
+        dcpse_temp->initializeUpdate(particles);
+        dcpse_temp->accumulateAndDeleteNormalParticles(particles);
+    }
+template<unsigned int NORMAL_ID>
+class SurfaceDerivative_G {
+    void *dcpse;
+    /*! \brief Class for Creating the DCPSE Operator Dxx and objects and computs DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     * \param ord order of convergence of the operator
+     * \param rCut Argument for cell list construction
+     * \param oversampling_factor multiplier to the minimum no. of particles required by the operator in support
+     * \param support_options default:N_particles, Radius can be used to select all particles inside rCut. Overrides oversampling.
+     *
+     * \return Operator Dxx which is a function on Vector_dist_Expressions
+     *
+     */
+    template<typename particles_type>
+    SurfaceDerivative_G(particles_type &parts, unsigned int ord, typename particles_type::stype rCut,typename particles_type::stype nSpacing,
+                    const Point<particles_type::dims, unsigned int> &p,support_options opt = support_options::RADIUS) {
+        dcpse = new Dcpse<particles_type::dims, particles_type>(parts, p, ord, rCut,nSpacing,value_t<NORMAL_ID>(), opt);
+    }
+    template<typename particles_type>
+    void deallocate(particles_type &parts) {
+        delete (Dcpse<particles_type::dims, particles_type> *) dcpse;
+    }
+    template<typename operand_type>
+    vector_dist_expression_op<operand_type, Dcpse<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    operator()(operand_type arg) {
+        typedef Dcpse<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
+    }
+    template<typename particles_type>
+    void checkMomenta(particles_type &particles) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->checkMomenta(particles);
+    }
+    template<unsigned int prp, typename particles_type>
+    void DrawKernel(particles_type &particles, int k) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->template DrawKernel<prp>(particles, k);
+    }
+    /*! \brief Method for Saving the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be saved.
+     */
+    template<typename particles_type>
+    void save(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->save(file);
+    }
+    /*! \brief Method for Loading the DCPSE Operator.
+     *
+     * \param parts particle set
+     * \param file name for data to be loaded from.
+     */
+    template<typename particles_type>
+    void load(particles_type &particles, const std::string &file) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->load(file);
+    }
+    /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     */
+    template<typename particles_type>
+    void update(particles_type &particles) {
+        auto dcpse_temp = (Dcpse<particles_type::dims, particles_type> *) dcpse;
+        dcpse_temp->template createNormalParticles<NORMAL_ID>(particles);
+        dcpse_temp->initializeUpdate(particles);
+        dcpse_temp->accumulateAndDeleteNormalParticles(particles);
+    }
+#endif //Eigen
diff --git a/src/DCPSE/DCPSE_op/EqnsStruct.hpp b/src/DCPSE/DCPSE_op/EqnsStruct.hpp
index 417216ac..6d08198f 100644
--- a/src/DCPSE/DCPSE_op/EqnsStruct.hpp
+++ b/src/DCPSE/DCPSE_op/EqnsStruct.hpp
@@ -9,6 +9,7 @@
 #include "Solvers/umfpack_solver.hpp"
 #include "Solvers/petsc_solver.hpp"
+#ifdef HAVE_PETSC
 //! Specify the general characteristic of system to solve
 struct equations2d1 {
@@ -59,7 +60,6 @@ struct equations2d2 {
     typedef petsc_solver<double> solver_type;
 struct equations2d1p {
     //! dimensionaly of the equation ( 3D problem ...)
     static const unsigned int dims = 2;
@@ -181,7 +181,6 @@ struct equations2d4 {
     typedef petsc_solver<double> solver_type;
 struct equations3d3 {
     //! dimensionaly of the equation ( 3D problem ...)
     static const unsigned int dims = 3;
@@ -278,7 +277,7 @@ struct equations3d3Pyz {
     typedef petsc_solver<double> solver_type;
-struct equations3d3EPxz {
+struct equations3d3Pxz {
     //! dimensionaly of the equation ( 3D problem ...)
     static const unsigned int dims = 3;
     //! number of fields in the system
@@ -287,29 +286,29 @@ struct equations3d3EPxz {
     //! boundary at X and Y
     static constexpr bool boundary[]={PERIODIC, NON_PERIODIC,PERIODIC};
-    //! type of space float, double, ...
+    //! type of space float, double, ..
     typedef double stype;
     //! type of base particles
     typedef vector_dist<dims, double, aggregate<double>> b_part;
     //! type of SparseMatrix for the linear solver
-    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
     //! type of Vector for the linear solver
-    typedef Vector<double> Vector_type;
+    typedef Vector<double, PETSC_BASE> Vector_type;
-    typedef umfpack_solver<double> solver_type;
+    typedef petsc_solver<double> solver_type;
-struct equations3d3EPz {
+struct equations3d1Pz {
     //! dimensionaly of the equation ( 3D problem ...)
     static const unsigned int dims = 3;
     //! number of fields in the system
-    static const unsigned int nvar = 3;
+    static const unsigned int nvar = 1;
     //! boundary at X and Y
-    static constexpr bool boundary[]={PERIODIC, PERIODIC,PERIODIC};
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC,PERIODIC};
     //! type of space float, double, ...
     typedef double stype;
@@ -318,28 +317,31 @@ struct equations3d3EPz {
     typedef vector_dist<dims, double, aggregate<double>> b_part;
     //! type of SparseMatrix for the linear solver
-    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
     //! type of Vector for the linear solver
-    typedef Vector<double> Vector_type;
+    typedef Vector<double, PETSC_BASE> Vector_type;
-    typedef umfpack_solver<double> solver_type;
+    typedef petsc_solver<double> solver_type;
-struct equations3d3Pxz {
+#ifdef __NVCC__
+struct equations2d1_gpu {
     //! dimensionaly of the equation ( 3D problem ...)
-    static const unsigned int dims = 3;
+    static const unsigned int dims=2;
     //! number of fields in the system
-    static const unsigned int nvar = 3;
+    static const unsigned int nvar=1;
     //! boundary at X and Y
-    static constexpr bool boundary[]={PERIODIC, NON_PERIODIC,PERIODIC};
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC};
-    //! type of space float, double, ..
+    //! type of space float, double, ...
     typedef double stype;
     //! type of base particles
-    typedef vector_dist<dims, double, aggregate<double>> b_part;
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
     //! type of SparseMatrix for the linear solver
     typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
@@ -350,20 +352,20 @@ struct equations3d3Pxz {
     typedef petsc_solver<double> solver_type;
-struct equations3d1Pz {
+struct equations2d2_gpu {
     //! dimensionaly of the equation ( 3D problem ...)
-    static const unsigned int dims = 3;
+    static const unsigned int dims = 2;
     //! number of fields in the system
-    static const unsigned int nvar = 1;
+    static const unsigned int nvar = 2;
     //! boundary at X and Y
-    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC,PERIODIC};
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC};
     //! type of space float, double, ...
     typedef double stype;
     //! type of base particles
-    typedef vector_dist<dims, double, aggregate<double>> b_part;
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
     //! type of SparseMatrix for the linear solver
     typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
@@ -374,58 +376,79 @@ struct equations3d1Pz {
     typedef petsc_solver<double> solver_type;
+struct equations2d1p_gpu {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 2;
+    //! number of fields in the system
+    static const unsigned int nvar = 1;
+    //! boundary at X and Y
+    static constexpr bool boundary[]={PERIODIC, PERIODIC};
-//! Specify the general characteristic of system to solve
-struct equations2d1E {
+    //! type of space float, double, ...
+    typedef double stype;
+    //! type of base particles
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+    //! type of Vector for the linear solver
+    typedef Vector<double, PETSC_BASE> Vector_type;
+    typedef petsc_solver<double> solver_type;
+struct equations2d2p_gpu {
     //! dimensionaly of the equation ( 3D problem ...)
-    static const unsigned int dims=2;
+    static const unsigned int dims = 2;
     //! number of fields in the system
-    static const unsigned int nvar=1;
+    static const unsigned int nvar = 2;
     //! boundary at X and Y
-    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC};
+    static constexpr bool boundary[]={PERIODIC, PERIODIC};
     //! type of space float, double, ...
     typedef double stype;
     //! type of base particles
-    typedef vector_dist<dims, double, aggregate<double>> b_part;
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
     //! type of SparseMatrix for the linear solver
-    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
     //! type of Vector for the linear solver
-    typedef Vector<double> Vector_type;
+    typedef Vector<double, PETSC_BASE> Vector_type;
-    typedef umfpack_solver<double> solver_type;
+    typedef petsc_solver<double> solver_type;
-struct equations2d2E {
+struct equations2d3p_gpu {
     //! dimensionaly of the equation ( 3D problem ...)
     static const unsigned int dims = 2;
     //! number of fields in the system
-    static const unsigned int nvar = 2;
+    static const unsigned int nvar = 3;
     //! boundary at X and Y
-    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC};
+    static constexpr bool boundary[]={PERIODIC, PERIODIC};
     //! type of space float, double, ...
     typedef double stype;
     //! type of base particles
-    typedef vector_dist<dims, double, aggregate<double>> b_part;
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
     //! type of SparseMatrix for the linear solver
-    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
     //! type of Vector for the linear solver
-    typedef Vector<double> Vector_type;
+    typedef Vector<double, PETSC_BASE> Vector_type;
-    typedef umfpack_solver<double> solver_type;
+    typedef petsc_solver<double> solver_type;
-struct equations2d3E {
+struct equations2d3_gpu {
     //! dimensionaly of the equation ( 3D problem ...)
     static const unsigned int dims = 2;
     //! number of fields in the system
@@ -438,18 +461,18 @@ struct equations2d3E {
     typedef double stype;
     //! type of base particles
-    typedef vector_dist<dims, double, aggregate<double>> b_part;
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
     //! type of SparseMatrix for the linear solver
-    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
     //! type of Vector for the linear solver
-    typedef Vector<double> Vector_type;
+    typedef Vector<double, PETSC_BASE> Vector_type;
-    typedef umfpack_solver<double> solver_type;
+    typedef petsc_solver<double> solver_type;
-struct equations2d4E {
+struct equations2d4_gpu {
     //! dimensionaly of the equation ( 3D problem ...)
     static const unsigned int dims = 2;
     //! number of fields in the system
@@ -462,50 +485,145 @@ struct equations2d4E {
     typedef double stype;
     //! type of base particles
-    typedef vector_dist<dims, double, aggregate<double>> b_part;
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
     //! type of SparseMatrix for the linear solver
-    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
     //! type of Vector for the linear solver
-    typedef Vector<double> Vector_type;
+    typedef Vector<double, PETSC_BASE> Vector_type;
-    typedef umfpack_solver<double> solver_type;
+    typedef petsc_solver<double> solver_type;
+struct equations3d3_gpu {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 3;
+    //! number of fields in the system
+    static const unsigned int nvar = 3;
-struct equations2d1pE {
+    //! boundary at X and Y
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC,NON_PERIODIC};
+    //! type of space float, double, ..
+    typedef double stype;
+    //! type of base particles
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+    //! type of Vector for the linear solver
+    typedef Vector<double, PETSC_BASE> Vector_type;
+    typedef petsc_solver<double> solver_type;
+struct equations3d1_gpu {
     //! dimensionaly of the equation ( 3D problem ...)
-    static const unsigned int dims = 2;
+    static const unsigned int dims = 3;
     //! number of fields in the system
     static const unsigned int nvar = 1;
     //! boundary at X and Y
-    static constexpr bool boundary[]={PERIODIC, PERIODIC};
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC,NON_PERIODIC};
     //! type of space float, double, ...
     typedef double stype;
+    //! type of base particles
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+    //! type of Vector for the linear solver
+    typedef Vector<double, PETSC_BASE> Vector_type;
+    typedef petsc_solver<double> solver_type;
+struct equations3d3Pz_gpu {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 3;
+    //! number of fields in the system
+    static const unsigned int nvar = 3;
+    //! boundary at X and Y
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC,PERIODIC};
+    //! type of space float, double, ..
+    typedef double stype;
+    //! type of base particles
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+    //! type of Vector for the linear solver
+    typedef Vector<double, PETSC_BASE> Vector_type;
+    typedef petsc_solver<double> solver_type;
+struct equations3d3Pyz_gpu {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 3;
+    //! number of fields in the system
+    static const unsigned int nvar = 3;
+    //! boundary at X and Y
+    static constexpr bool boundary[]={NON_PERIODIC, PERIODIC,PERIODIC};
+    //! type of space float, double, ..
+    typedef double stype;
+    //! type of base particles
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+    //! type of Vector for the linear solver
+    typedef Vector<double, PETSC_BASE> Vector_type;
+    typedef petsc_solver<double> solver_type;
+struct equations3d3Pxz_gpu {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 3;
+    //! number of fields in the system
+    static const unsigned int nvar = 3;
+    //! boundary at X and Y
+    static constexpr bool boundary[]={PERIODIC, NON_PERIODIC,PERIODIC};
+    //! type of space float, double, ..
+    typedef double stype;
     //! type of base particles
     typedef vector_dist<dims, double, aggregate<double>> b_part;
     //! type of SparseMatrix for the linear solver
-    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
     //! type of Vector for the linear solver
-    typedef Vector<double> Vector_type;
+    typedef Vector<double, PETSC_BASE> Vector_type;
-    typedef umfpack_solver<double> solver_type;
+    typedef petsc_solver<double> solver_type;
-struct equations2d2pE {
+struct equations3d1Pz_gpu {
     //! dimensionaly of the equation ( 3D problem ...)
-    static const unsigned int dims = 2;
+    static const unsigned int dims = 3;
     //! number of fields in the system
-    static const unsigned int nvar = 2;
+    static const unsigned int nvar = 1;
     //! boundary at X and Y
-    static constexpr bool boundary[]={PERIODIC, PERIODIC};
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC,PERIODIC};
     //! type of space float, double, ...
     typedef double stype;
@@ -514,22 +632,28 @@ struct equations2d2pE {
     typedef vector_dist<dims, double, aggregate<double>> b_part;
     //! type of SparseMatrix for the linear solver
-    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
     //! type of Vector for the linear solver
-    typedef Vector<double> Vector_type;
+    typedef Vector<double, PETSC_BASE> Vector_type;
-    typedef umfpack_solver<double> solver_type;
+    typedef petsc_solver<double> solver_type;
+#endif //__NVCC__
+#endif //HAVE_PETSC
+//! Specify the general characteristic of system to solve
+struct equations2d1E {
-struct equations2d3pE {
     //! dimensionaly of the equation ( 3D problem ...)
-    static const unsigned int dims = 2;
+    static const unsigned int dims=2;
     //! number of fields in the system
-    static const unsigned int nvar = 3;
+    static const unsigned int nvar=1;
     //! boundary at X and Y
-    static constexpr bool boundary[]={PERIODIC, PERIODIC};
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC};
     //! type of space float, double, ...
     typedef double stype;
@@ -546,16 +670,16 @@ struct equations2d3pE {
     typedef umfpack_solver<double> solver_type;
-struct equations3d3E {
+struct equations2d2E {
     //! dimensionaly of the equation ( 3D problem ...)
-    static const unsigned int dims = 3;
+    static const unsigned int dims = 2;
     //! number of fields in the system
-    static const unsigned int nvar = 3;
+    static const unsigned int nvar = 2;
     //! boundary at X and Y
-    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC,NON_PERIODIC};
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC};
-    //! type of space float, double, ..
+    //! type of space float, double, ...
     typedef double stype;
     //! type of base particles
@@ -570,14 +694,14 @@ struct equations3d3E {
     typedef umfpack_solver<double> solver_type;
-struct equations3d1E {
+struct equations2d3E {
     //! dimensionaly of the equation ( 3D problem ...)
-    static const unsigned int dims = 3;
+    static const unsigned int dims = 2;
     //! number of fields in the system
-    static const unsigned int nvar = 1;
+    static const unsigned int nvar = 3;
     //! boundary at X and Y
-    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC,NON_PERIODIC};
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC};
     //! type of space float, double, ...
     typedef double stype;
@@ -594,5 +718,467 @@ struct equations3d1E {
     typedef umfpack_solver<double> solver_type;
+struct equations2d4E {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 2;
+    //! number of fields in the system
+    static const unsigned int nvar = 4;
+    //! boundary at X and Y
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC};
+    //! type of space float, double, ...
+    typedef double stype;
+    //! type of base particles
+    typedef vector_dist<dims, double, aggregate<double>> b_part;
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+    //! type of Vector for the linear solver
+    typedef Vector<double> Vector_type;
+    typedef umfpack_solver<double> solver_type;
+struct equations2d1pE {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 2;
+    //! number of fields in the system
+    static const unsigned int nvar = 1;
+    //! boundary at X and Y
+    static constexpr bool boundary[]={PERIODIC, PERIODIC};
+    //! type of space float, double, ...
+    typedef double stype;
+    //! type of base particles
+    typedef vector_dist<dims, double, aggregate<double>> b_part;
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+    //! type of Vector for the linear solver
+    typedef Vector<double> Vector_type;
+    typedef umfpack_solver<double> solver_type;
+struct equations2d2pE {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 2;
+    //! number of fields in the system
+    static const unsigned int nvar = 2;
+    //! boundary at X and Y
+    static constexpr bool boundary[]={PERIODIC, PERIODIC};
+    //! type of space float, double, ...
+    typedef double stype;
+    //! type of base particles
+    typedef vector_dist<dims, double, aggregate<double>> b_part;
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+    //! type of Vector for the linear solver
+    typedef Vector<double> Vector_type;
+    typedef umfpack_solver<double> solver_type;
+struct equations2d3pE {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 2;
+    //! number of fields in the system
+    static const unsigned int nvar = 3;
+    //! boundary at X and Y
+    static constexpr bool boundary[]={PERIODIC, PERIODIC};
+    //! type of space float, double, ...
+    typedef double stype;
+    //! type of base particles
+    typedef vector_dist<dims, double, aggregate<double>> b_part;
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+    //! type of Vector for the linear solver
+    typedef Vector<double> Vector_type;
+    typedef umfpack_solver<double> solver_type;
+struct equations3d3E {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 3;
+    //! number of fields in the system
+    static const unsigned int nvar = 3;
+    //! boundary at X and Y
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC,NON_PERIODIC};
+    //! type of space float, double, ..
+    typedef double stype;
+    //! type of base particles
+    typedef vector_dist<dims, double, aggregate<double>> b_part;
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+    //! type of Vector for the linear solver
+    typedef Vector<double> Vector_type;
+    typedef umfpack_solver<double> solver_type;
+struct equations3d1E {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 3;
+    //! number of fields in the system
+    static const unsigned int nvar = 1;
+    //! boundary at X and Y
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC,NON_PERIODIC};
+    //! type of space float, double, ...
+    typedef double stype;
+    //! type of base particles
+    typedef vector_dist<dims, double, aggregate<double>> b_part;
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+    //! type of Vector for the linear solver
+    typedef Vector<double> Vector_type;
+    typedef umfpack_solver<double> solver_type;
+struct equations3d3EPxz {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 3;
+    //! number of fields in the system
+    static const unsigned int nvar = 3;
+    //! boundary at X and Y
+    static constexpr bool boundary[]={PERIODIC, NON_PERIODIC,PERIODIC};
+    //! type of space float, double, ...
+    typedef double stype;
+    //! type of base particles
+    typedef vector_dist<dims, double, aggregate<double>> b_part;
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+    //! type of Vector for the linear solver
+    typedef Vector<double> Vector_type;
+    typedef umfpack_solver<double> solver_type;
+struct equations3d3EPz {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 3;
+    //! number of fields in the system
+    static const unsigned int nvar = 3;
+    //! boundary at X and Y
+    static constexpr bool boundary[]={PERIODIC, PERIODIC,PERIODIC};
+    //! type of space float, double, ...
+    typedef double stype;
+    //! type of base particles
+    typedef vector_dist<dims, double, aggregate<double>> b_part;
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+    //! type of Vector for the linear solver
+    typedef Vector<double> Vector_type;
+    typedef umfpack_solver<double> solver_type;
+#ifdef __NVCC__
+struct equations2d1E_gpu {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims=2;
+    //! number of fields in the system
+    static const unsigned int nvar=1;
+    //! boundary at X and Y
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC};
+    //! type of space float, double, ...
+    typedef double stype;
+    //! type of base particles
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+    //! type of Vector for the linear solver
+    typedef Vector<double> Vector_type;
+    typedef umfpack_solver<double> solver_type;
+struct equations2d2E_gpu {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 2;
+    //! number of fields in the system
+    static const unsigned int nvar = 2;
+    //! boundary at X and Y
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC};
+    //! type of space float, double, ...
+    typedef double stype;
+    //! type of base particles
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+    //! type of Vector for the linear solver
+    typedef Vector<double> Vector_type;
+    typedef umfpack_solver<double> solver_type;
+struct equations2d3E_gpu {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 2;
+    //! number of fields in the system
+    static const unsigned int nvar = 3;
+    //! boundary at X and Y
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC};
+    //! type of space float, double, ...
+    typedef double stype;
+    //! type of base particles
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+    //! type of Vector for the linear solver
+    typedef Vector<double> Vector_type;
+    typedef umfpack_solver<double> solver_type;
+struct equations2d4E_gpu {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 2;
+    //! number of fields in the system
+    static const unsigned int nvar = 4;
+    //! boundary at X and Y
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC};
+    //! type of space float, double, ...
+    typedef double stype;
+    //! type of base particles
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+    //! type of Vector for the linear solver
+    typedef Vector<double> Vector_type;
+    typedef umfpack_solver<double> solver_type;
+struct equations2d1pE_gpu {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 2;
+    //! number of fields in the system
+    static const unsigned int nvar = 1;
+    //! boundary at X and Y
+    static constexpr bool boundary[]={PERIODIC, PERIODIC};
+    //! type of space float, double, ...
+    typedef double stype;
+    //! type of base particles
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+    //! type of Vector for the linear solver
+    typedef Vector<double> Vector_type;
+    typedef umfpack_solver<double> solver_type;
+struct equations2d2pE_gpu {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 2;
+    //! number of fields in the system
+    static const unsigned int nvar = 2;
+    //! boundary at X and Y
+    static constexpr bool boundary[]={PERIODIC, PERIODIC};
+    //! type of space float, double, ...
+    typedef double stype;
+    //! type of base particles
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+    //! type of Vector for the linear solver
+    typedef Vector<double> Vector_type;
+    typedef umfpack_solver<double> solver_type;
+struct equations2d3pE_gpu {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 2;
+    //! number of fields in the system
+    static const unsigned int nvar = 3;
+    //! boundary at X and Y
+    static constexpr bool boundary[]={PERIODIC, PERIODIC};
+    //! type of space float, double, ...
+    typedef double stype;
+    //! type of base particles
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+    //! type of Vector for the linear solver
+    typedef Vector<double> Vector_type;
+    typedef umfpack_solver<double> solver_type;
+struct equations3d3E_gpu {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 3;
+    //! number of fields in the system
+    static const unsigned int nvar = 3;
+    //! boundary at X and Y
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC,NON_PERIODIC};
+    //! type of space float, double, ..
+    typedef double stype;
+    //! type of base particles
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+    //! type of Vector for the linear solver
+    typedef Vector<double> Vector_type;
+    typedef umfpack_solver<double> solver_type;
+struct equations3d1E_gpu {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 3;
+    //! number of fields in the system
+    static const unsigned int nvar = 1;
+    //! boundary at X and Y
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC,NON_PERIODIC};
+    //! type of space float, double, ...
+    typedef double stype;
+    //! type of base particles
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+    //! type of Vector for the linear solver
+    typedef Vector<double> Vector_type;
+    typedef umfpack_solver<double> solver_type;
+struct equations3d3EPxz_gpu {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 3;
+    //! number of fields in the system
+    static const unsigned int nvar = 3;
+    //! boundary at X and Y
+    static constexpr bool boundary[]={PERIODIC, NON_PERIODIC,PERIODIC};
+    //! type of space float, double, ...
+    typedef double stype;
+    //! type of base particles
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+    //! type of Vector for the linear solver
+    typedef Vector<double> Vector_type;
+    typedef umfpack_solver<double> solver_type;
+struct equations3d3EPz_gpu {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 3;
+    //! number of fields in the system
+    static const unsigned int nvar = 3;
+    //! boundary at X and Y
+    static constexpr bool boundary[]={PERIODIC, PERIODIC,PERIODIC};
+    //! type of space float, double, ...
+    typedef double stype;
+    //! type of base particles
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+    //! type of Vector for the linear solver
+    typedef Vector<double> Vector_type;
+    typedef umfpack_solver<double> solver_type;
+#endif //__NVCC__
diff --git a/src/DCPSE/DCPSE_op/tests/DCPSE_op_Solver_test.cpp b/src/DCPSE/DCPSE_op/tests/DCPSE_op_Solver_test.cpp
index a41ce710..133849d9 100644
--- a/src/DCPSE/DCPSE_op/tests/DCPSE_op_Solver_test.cpp
+++ b/src/DCPSE/DCPSE_op/tests/DCPSE_op_Solver_test.cpp
@@ -1001,6 +1001,24 @@ BOOST_AUTO_TEST_SUITE(dcpse_op_suite_tests)
         Solver.impose(-D_y, dw_p, prop_id<1>());
         Solver.impose(-D_x, l_p, prop_id<1>());
         Solver.impose(D_x, r_p, prop_id<1>());
+        Solver.reset_b();
+        Solver.impose_b(bulk, prop_id<1>());
+        Solver.impose_b(up_p, prop_id<1>());
+        Solver.impose_b(dw_p, prop_id<1>());
+        Solver.impose_b(l_p, prop_id<1>());
+        Solver.impose_b(r_p, prop_id<1>());
+        Solver.solve_with_solver(solver,sol);
+        Solver.reset_b();
+        Solver.impose_b(bulk, prop_id<1>());
+        Solver.impose_b(up_p, prop_id<1>());
+        Solver.impose_b(dw_p, prop_id<1>());
+        Solver.impose_b(l_p, prop_id<1>());
+        Solver.impose_b(r_p, prop_id<1>());
 //       Solver.solve(sol);
@@ -1022,6 +1040,174 @@ BOOST_AUTO_TEST_SUITE(dcpse_op_suite_tests)
+        BOOST_AUTO_TEST_CASE(dcpse_poisson_Neumann2d) {
+        const size_t sz[2] = {31,31};
+        Box<2, double> box({0, 0}, {1.0, 1.0});
+        size_t bc[2] = {NON_PERIODIC, NON_PERIODIC};
+        double spacing = box.getHigh(0) / (sz[0] - 1);
+        double rCut = 3.1 * spacing;
+        Ghost<2, double> ghost(spacing * 3.1);
+        BOOST_TEST_MESSAGE("Init vector_dist...");
+        vector_dist<2, double, aggregate<double[2],double[2],double[2],double[2],double[2]>> domain(0, box, bc, ghost);
+        //Init_DCPSE(domain)
+        BOOST_TEST_MESSAGE("Init domain...");
+        auto it = domain.getGridIterator(sz);
+        while (it.isNext()) {
+            domain.add();
+            auto key = it.get();
+            double x = key.get(0) * it.getSpacing(0);
+            domain.getLastPos()[0] = x;
+            double y = key.get(1) * it.getSpacing(1);
+            domain.getLastPos()[1] = y;
+            ++it;
+        }
+        BOOST_TEST_MESSAGE("Sync domain across processors...");
+        domain.ghost_get<0>();
+        Derivative_x Dx(domain, 2, rCut,1.9,support_options::N_PARTICLES);
+        Derivative_y Dy(domain, 2, rCut,1.9,support_options::N_PARTICLES);
+        Laplacian Lap(domain, 2, rCut, 1.9,support_options::N_PARTICLES);
+        petsc_solver<double> solver;
+        solver.setRestart(500);
+        solver.setSolver(KSPGMRES);
+        solver.setPreconditioner(PCSVD);
+        openfpm::vector<aggregate<int>> bulk;
+        openfpm::vector<aggregate<int>> up_p;
+        openfpm::vector<aggregate<int>> dw_p;
+        openfpm::vector<aggregate<int>> l_p;
+        openfpm::vector<aggregate<int>> r_p;
+        auto v = getV<0>(domain);
+        auto RHS=getV<1>(domain);
+        auto sol = getV<2>(domain);
+        auto anasol = getV<3>(domain);
+        auto err = getV<4>(domain);
+        // Here fill me
+        Box<2, double> up({box.getLow(0) - spacing / 2.0, box.getHigh(1) - spacing / 2.0},
+                          {box.getHigh(0) + spacing / 2.0, box.getHigh(1) + spacing / 2.0});
+        Box<2, double> down({box.getLow(0) - spacing / 2.0, box.getLow(1) - spacing / 2.0},
+                            {box.getHigh(0) + spacing / 2.0, box.getLow(1) + spacing / 2.0});
+        Box<2, double> left({box.getLow(0) - spacing / 2.0, box.getLow(1) + spacing / 2.0},
+                            {box.getLow(0) + spacing / 2.0, box.getHigh(1) - spacing / 2.0});
+        Box<2, double> right({box.getHigh(0) - spacing / 2.0, box.getLow(1) + spacing / 2.0},
+                             {box.getHigh(0) + spacing / 2.0, box.getHigh(1) - spacing / 2.0});
+        openfpm::vector<Box<2, double>> boxes;
+        boxes.add(up);
+        boxes.add(down);
+        boxes.add(left);
+        boxes.add(right);
+        // Create a writer and write
+        VTKWriter<openfpm::vector<Box<2, double>>, VECTOR_BOX> vtk_box;
+        vtk_box.add(boxes);
+        //vtk_box.write("vtk_box.vtk");
+        auto it2 = domain.getDomainIterator();
+        while (it2.isNext()) {
+            auto p = it2.get();
+            Point<2, double> xp = domain.getPos(p);
+            //domain.getProp<3>(p)=1+xp[0]*xp[0]+2*xp[1]*xp[1];
+            if (up.isInside(xp) == true) {
+                up_p.add();
+                up_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p)[0] =  sin(5*xp.get(0));
+                domain.getProp<1>(p)[1] =  sin(5*xp.get(0));
+            } else if (down.isInside(xp) == true) {
+                dw_p.add();
+                dw_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p)[0] =  sin(5*xp.get(0));
+                domain.getProp<1>(p)[1] =  sin(5*xp.get(0));
+            } else if (left.isInside(xp) == true) {
+                l_p.add();
+                l_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p)[0] =  sin(5*xp.get(0));
+                domain.getProp<1>(p)[1] =  sin(5*xp.get(0));
+            } else if (right.isInside(xp) == true) {
+                r_p.add();
+                r_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p)[0] =  sin(5*xp.get(0));
+                domain.getProp<1>(p)[1] =  sin(5*xp.get(0));
+            } else {
+                bulk.add();
+                bulk.last().get<0>() = p.getKey();
+                domain.getProp<1>(p)[0] =  -10*exp(-((xp.get(0)-0.5)*(xp.get(0)-0.5)+(xp.get(1)-0.5)*(xp.get(1)-0.5))/0.02);
+                domain.getProp<1>(p)[1] =  -10*exp(-((xp.get(0)-0.5)*(xp.get(0)-0.5)+(xp.get(1)-0.5)*(xp.get(1)-0.5))/0.02);
+            }
+            ++it2;
+        }
+        int i=0;
+        while(i==0)
+            {sleep(400);}
+        DCPSE_scheme<equations2d2,decltype(domain)> Solver(domain,options_solver::LAGRANGE_MULTIPLIER);
+        eq_id vx,vy;
+        vx.setId(0);
+        vy.setId(1);
+        auto Poisson0 = -Lap(v[0]);
+        auto D_x0 = Dx(v[0]);
+        auto D_y0 = Dy(v[0]);
+        auto Poisson1 = -Lap(v[1]);
+        auto D_x1 = Dx(v[1]);
+        auto D_y1 = Dy(v[1]);
+        Solver.impose(Poisson0, bulk, RHS[0],vx);
+        Solver.impose(Poisson1, bulk, RHS[1],vy);
+        Solver.impose(D_y0, up_p, RHS[0],vx);
+        Solver.impose(-D_y0, dw_p, RHS[0],vx);
+        Solver.impose(-D_x0, l_p, RHS[0],vx);
+        Solver.impose(D_x0, r_p, RHS[0],vx);
+        Solver.impose(D_y1, up_p, RHS[1],vy);
+        Solver.impose(-D_y1, dw_p, RHS[1],vy);
+        Solver.impose(-D_x1, l_p, RHS[1],vy);
+        Solver.impose(D_x1, r_p, RHS[1],vy);
+        Solver.solve_with_solver(solver,sol[0],sol[1]);
+//       Solver.solve(sol);
+        domain.ghost_get<2>();
+        anasol[0]=-Lap(sol[0]);
+        anasol[1]=-Lap(sol[1]);
+        double worst1 = 0.0,worst2 = 0.0;
+        for(int j=0;j<bulk.size();j++)
+        {   auto p=bulk.get<0>(j);
+            if (fabs(domain.getProp<3>(p)[0]- domain.getProp<1>(p)[0]) >= worst1) {
+                worst1 = fabs(domain.getProp<3>(p)[0] - domain.getProp<1>(p)[0]);
+            }
+            if (fabs(domain.getProp<3>(p)[1]- domain.getProp<1>(p)[1]) >= worst2) {
+                worst2 = fabs(domain.getProp<3>(p)[1] - domain.getProp<1>(p)[1]);
+            }
+            domain.getProp<4>(p)[0] = fabs(domain.getProp<1>(p)[0] - domain.getProp<3>(p)[0]);
+            domain.getProp<4>(p)[1] = fabs(domain.getProp<1>(p)[1] - domain.getProp<3>(p)[1]);
+        }
+        //Auto Error
+        BOOST_REQUIRE(worst1 < 1.0);
+        BOOST_REQUIRE(worst2 < 1.0);
+        domain.write("Neumann2d");
+    }
     BOOST_AUTO_TEST_CASE(dcpse_slice_solver) {
 //  int rank;
@@ -1162,7 +1348,7 @@ BOOST_AUTO_TEST_SUITE(dcpse_op_suite_tests)
-        DCPSE_scheme<equations2d2,decltype(domain)> Solver( domain);
+        DCPSE_scheme<equations2d2,decltype(domain)> Solver(domain);
         auto Poisson0 = Lap(v[0]);
         auto Poisson1 = Lap(v[1]);
         //auto D_x = Dx(v[1]);
diff --git a/src/DCPSE/DCPSE_op/tests/ b/src/DCPSE/DCPSE_op/tests/
new file mode 100644
index 00000000..7bcc95b7
--- /dev/null
+++ b/src/DCPSE/DCPSE_op/tests/
@@ -0,0 +1,1333 @@
+ *
+ *
+ *  Created on: Jan 7, 2020
+ *      Author: Abhinav Singh, Pietro Incardona, Serhii
+ *
+ */
+#include "config.h"
+#include "util/util_debug.hpp"
+#include <boost/test/unit_test.hpp>
+#include <iostream>
+#include "../DCPSE_op.hpp"
+#include "../DCPSE_Solver.hpp"
+#include "../DCPSE_Solver.cuh"
+#include "Operators/Vector/vector_dist_operators.hpp"
+#include "Vector/vector_dist_subset.hpp"
+#include "../EqnsStruct.hpp"
+#include "Decomposition/Distribution/SpaceDistribution.hpp"
+BOOST_AUTO_TEST_CASE(dcpse_op_vec3d_gpu) {
+//  int rank;
+//  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+        size_t edgeSemiSize = 257;
+        const size_t sz[3] = {edgeSemiSize,  edgeSemiSize,edgeSemiSize};
+        Box<3, double> box({0, 0,0}, {1,1,1});
+        size_t bc[3] = {NON_PERIODIC, NON_PERIODIC, NON_PERIODIC};
+        double spacing = box.getHigh(0) / (sz[0] - 1);
+        double rCut = 3.1 * spacing;
+        Ghost<3, double> ghost(rCut);
+        BOOST_TEST_MESSAGE("Init vector_dist...");
+        double sigma2 = spacing * spacing/ (2 * 4);
+        vector_dist_gpu<3, double, aggregate<double, VectorS<3, double>, VectorS<3, double>, VectorS<3, double>, VectorS<3, double>,double,double>> domain(
+        0, box, bc, ghost);
+        //Init_DCPSE(domain)
+        BOOST_TEST_MESSAGE("Init domain...");
+        auto it = domain.getGridIterator(sz);
+        size_t pointId = 0;
+        size_t counter = 0;
+        double minNormOne = 999;
+        while (it.isNext()) {
+            domain.add();
+            auto key = it.get();
+            mem_id k0 = key.get(0);
+            double x = k0 * spacing;
+            domain.getLastPos()[0] = x;//+ gaussian(rng);
+            mem_id k1 = key.get(1);
+            double y = k1 * spacing;
+            domain.getLastPos()[1] = y;//+gaussian(rng);
+            mem_id k2 = key.get(2);
+            double z = k2 * spacing;
+            domain.getLastPos()[2] = z;//+gaussian(rng);
+            // Here fill the function value
+            domain.template getLastProp<0>()    = sin(domain.getLastPos()[0]) + sin(domain.getLastPos()[1]) + sin(domain.getLastPos()[2]) ;
+            domain.template getLastProp<1>()[0] = cos(domain.getLastPos()[0]);
+            domain.template getLastProp<1>()[1] = cos(domain.getLastPos()[1]) ;
+            domain.template getLastProp<1>()[2] = cos(domain.getLastPos()[2]);
+            // Here fill the validation value for Df/Dx
+            domain.template getLastProp<2>()[0] = 0;//cos(domain.getLastPos()[0]);//+cos(domain.getLastPos()[1]);
+            domain.template getLastProp<2>()[1] = 0;//-sin(domain.getLastPos()[0]);//+cos(domain.getLastPos()[1]);
+            domain.template getLastProp<3>()[0] = 0;//cos(domain.getLastPos()[0]);//+cos(domain.getLastPos()[1]);
+            domain.template getLastProp<3>()[1] = 0;//-sin(domain.getLastPos()[0]);//+cos(domain.getLastPos()[1]);
+            domain.template getLastProp<3>()[2] = 0;
+            domain.template getLastProp<4>()[0] = -cos(domain.getLastPos()[0]) * sin(domain.getLastPos()[0]);
+            domain.template getLastProp<4>()[1] = -cos(domain.getLastPos()[1]) * sin(domain.getLastPos()[1]);
+            domain.template getLastProp<4>()[2] = -cos(domain.getLastPos()[2]) * sin(domain.getLastPos()[2]);
+            /*  domain.template getLastProp<4>()[0] = cos(domain.getLastPos()[0]) * (sin(domain.getLastPos()[0]) + sin(domain.getLastPos()[1])) +
+                                                    cos(domain.getLastPos()[1]) * (cos(domain.getLastPos()[0]) + cos(domain.getLastPos()[1]));
+              domain.template getLastProp<4>()[1] = -sin(domain.getLastPos()[0]) * (sin(domain.getLastPos()[0]) + sin(domain.getLastPos()[1])) -
+                                                    sin(domain.getLastPos()[1]) * (cos(domain.getLastPos()[0]) + cos(domain.getLastPos()[1]));
+              domain.template getLastProp<4>()[2] = -sin(domain.getLastPos()[0]) * (sin(domain.getLastPos()[0]) + sin(domain.getLastPos()[1])) -
+                                                    sin(domain.getLastPos()[1]) * (cos(domain.getLastPos()[0]) + cos(domain.getLastPos()[1]));*/
+            domain.template getLastProp<5>()    = cos(domain.getLastPos()[0]) * cos(domain.getLastPos()[0])+cos(domain.getLastPos()[1]) * cos(domain.getLastPos()[1])+cos(domain.getLastPos()[2]) * cos(domain.getLastPos()[2]) ;
+            ++counter;
+            ++it;
+        }
+        BOOST_TEST_MESSAGE("Sync domain across processors...");
+        domain.ghost_get<0>();
+        Advection_gpu Adv(domain, 2, rCut, 1.9,support_options::RADIUS);
+        auto v = getV<1>(domain);
+        auto P = getV<0>(domain);
+        auto dv = getV<3>(domain);
+        auto dP = getV<6>(domain);
+//        typedef boost::mpl::int_<std::is_fundamental<point_expression_op<Point<2U, double>, point_expression<double>, Point<2U, double>, 3>>::value>::blabla blabla;
+//        std::is_fundamental<decltype(o1.value(key))>
+        domain.ghost_get<1>();
+        dv = Adv(v, v);
+        auto it2 = domain.getDomainIterator();
+        double worst1 = 0.0;
+        while (it2.isNext()) {
+            auto p = it2.get();
+            if (fabs(domain.getProp<3>(p)[1] - domain.getProp<4>(p)[1]) > worst1) {
+                worst1 = fabs(domain.getProp<3>(p)[1] - domain.getProp<4>(p)[1]);
+            }
+            ++it2;
+        }
+        //std::cout << "Maximum Error in component 2: " << worst1 << std::endl;
+        BOOST_REQUIRE(worst1 < 0.03);
+        //Adv.checkMomenta(domain);
+        //Adv.DrawKernel<2>(domain,0);
+        //domain.deleteGhost();
+        dP = Adv(v, P);//+Dy(P);
+        auto it3 = domain.getDomainIterator();
+        double worst2 = 0.0;
+        while (it3.isNext()) {
+            auto p = it3.get();
+            if (fabs(domain.getProp<6>(p) - domain.getProp<5>(p)) > worst2) {
+                worst2 = fabs(domain.getProp<6>(p) - domain.getProp<5>(p));
+            }
+            ++it3;
+        }
+        domain.deleteGhost();
+        BOOST_REQUIRE(worst2 < 0.03);
+    BOOST_AUTO_TEST_CASE(dcpse_op_solver) {
+//  int rank;
+//  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+        const size_t sz[2] = {31, 31};
+        Box<2, double> box({0, 0}, {1.0, 1.0});
+        size_t bc[2] = {NON_PERIODIC, NON_PERIODIC};
+        double spacing = box.getHigh(0) / (sz[0] - 1);
+        Ghost<2, double> ghost(spacing * 3);
+        double rCut = 3.1 * spacing;
+        BOOST_TEST_MESSAGE("Init vector_dist...");
+        vector_dist_gpu<2, double, aggregate<double,double,double,double>> domain(0, box, bc, ghost);
+        //Init_DCPSE(domain)
+        BOOST_TEST_MESSAGE("Init domain...");
+        auto it = domain.getGridIterator(sz);
+        while (it.isNext()) {
+            domain.add();
+            auto key = it.get();
+            double x = key.get(0) * it.getSpacing(0);
+            domain.getLastPos()[0] = x;
+            double y = key.get(1) * it.getSpacing(1);
+            domain.getLastPos()[1] = y;
+            ++it;
+        }
+        BOOST_TEST_MESSAGE("Sync domain across processors...");
+        domain.ghost_get<0>();
+        Laplacian_gpu Lap(domain, 2, rCut);
+        DCPSE_scheme_gpu<equations2d1_gpu,decltype(domain)> Solver( domain);
+        openfpm::vector<aggregate<int>> bulk;
+        openfpm::vector<aggregate<int>> up_p;
+        openfpm::vector<aggregate<int>> dw_p;
+        openfpm::vector<aggregate<int>> l_p;
+        openfpm::vector<aggregate<int>> r_p;
+        auto v = getV<0>(domain);
+        auto RHS = getV<1>(domain);
+        auto sol = getV<2>(domain);
+        auto anasol = getV<3>(domain);
+        // Here fill me
+        Box<2, double> up({box.getLow(0) - spacing / 2.0, box.getHigh(1) - spacing / 2.0},
+                          {box.getHigh(0) + spacing / 2.0, box.getHigh(1) + spacing / 2.0});
+        Box<2, double> up_d({box.getLow(0) - spacing / 2.0, box.getHigh(1) - 8*spacing / 2.0},
+                            {box.getHigh(0) + spacing / 2.0, box.getHigh(1) - 6*spacing / 2.0});
+        Box<2, double> down({box.getLow(0) - spacing / 2.0, box.getLow(1) - spacing / 2.0},
+                            {box.getHigh(0) + spacing / 2.0, box.getLow(1) + spacing / 2.0});
+        Box<2, double> down_u({box.getLow(0) - spacing / 2.0, box.getLow(1) + 3*spacing / 2.0},
+                              {box.getHigh(0) + spacing / 2.0, box.getLow(1) + 4*spacing / 2.0});
+        Box<2, double> left({box.getLow(0) - spacing / 2.0, box.getLow(1) + spacing / 2.0},
+                            {box.getLow(0) + spacing / 2.0, box.getHigh(1) - spacing / 2.0});
+        Box<2, double> left_r({box.getLow(0) - spacing / 2.0, box.getLow(1) + spacing / 2.0},
+                              {box.getLow(0) + spacing / 2.0, box.getHigh(1) - spacing / 2.0});
+        Box<2, double> right({box.getHigh(0) - spacing / 2.0, box.getLow(1) + spacing / 2.0},
+                             {box.getHigh(0) + spacing / 2.0, box.getHigh(1) - spacing / 2.0});
+        Box<2, double> right_l({box.getHigh(0) - spacing / 2.0, box.getLow(1) + spacing / 2.0},
+                               {box.getHigh(0) + spacing / 2.0, box.getHigh(1) - spacing / 2.0});
+        openfpm::vector<Box<2, double>> boxes;
+        boxes.add(up);
+        boxes.add(up_d);
+        boxes.add(down);
+        boxes.add(down_u);
+        boxes.add(left);
+        boxes.add(left_r);
+        boxes.add(right);
+        boxes.add(right_l);
+        // Create a writer and write
+        VTKWriter<openfpm::vector<Box<2, double>>, VECTOR_BOX> vtk_box;
+        vtk_box.add(boxes);
+        vtk_box.write("vtk_box.vtk");
+        auto it2 = domain.getDomainIterator();
+        while (it2.isNext()) {
+            auto p = it2.get();
+            Point<2, double> xp = domain.getPos(p);
+            domain.getProp<2>(p)=1+xp[0]*xp[0]+2*xp[1]*xp[1];
+            if (up.isInside(xp) == true) {
+                up_p.add();
+                up_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  3 + xp.get(0)*xp.get(0);
+            } else if (down.isInside(xp) == true) {
+                dw_p.add();
+                dw_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  1 + xp.get(0)*xp.get(0);
+            } else if (left.isInside(xp) == true) {
+                l_p.add();
+                l_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  1 + 2*xp.get(1)*xp.get(1);
+            } else if (right.isInside(xp) == true) {
+                r_p.add();
+                r_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  2 + 2*xp.get(1)*xp.get(1);
+            } else {
+                bulk.add();
+                bulk.last().get<0>() = p.getKey();
+            }
+            ++it2;
+        }
+        auto eq1 = Lap(v);
+        Solver.impose(eq1, bulk, 6);
+        Solver.impose(v, up_p, RHS);
+        Solver.impose(v, dw_p, RHS);
+        Solver.impose(v, l_p, prop_id<1>());
+        Solver.impose(v, r_p, prop_id<1>());
+        Solver.solve(v);
+        anasol=Lap(v);
+        double worst1 = 0.0;
+        it2 = domain.getDomainIterator();
+        while (it2.isNext()) {
+            auto p = it2.get();
+            if (fabs(domain.getProp<0>(p) - domain.getProp<2>(p)) >= worst1) {
+                worst1 = fabs(domain.getProp<0>(p) - domain.getProp<2>(p));
+            }
+            domain.getProp<1>(p) = fabs(domain.getProp<0>(p) - domain.getProp<2>(p));
+            ++it2;
+        }
+        domain.write("particles");
+        BOOST_REQUIRE(worst1 < 0.03);
+    }
+    BOOST_AUTO_TEST_CASE(dcpse_poisson_Robin_anal) {
+//  int rank;
+//  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+        const size_t sz[2] = {81,81};
+        Box<2, double> box({0, 0}, {0.5, 0.5});
+        size_t bc[2] = {NON_PERIODIC, NON_PERIODIC};
+        double spacing = box.getHigh(0) / (sz[0] - 1);
+        Ghost<2, double> ghost(spacing * 3.1);
+        double rCut = 3.1 * spacing;
+        BOOST_TEST_MESSAGE("Init vector_dist...");
+        vector_dist_gpu<2, double, aggregate<double,double,double,double,double,double>> domain(0, box, bc, ghost);
+        //Init_DCPSE(domain)
+        BOOST_TEST_MESSAGE("Init domain...");
+        auto it = domain.getGridIterator(sz);
+        while (it.isNext()) {
+            domain.add();
+            auto key = it.get();
+            double x = key.get(0) * it.getSpacing(0);
+            domain.getLastPos()[0] = x;
+            double y = key.get(1) * it.getSpacing(1);
+            domain.getLastPos()[1] = y;
+            ++it;
+        }
+        // Add multi res patch 1
+        {
+        const size_t sz2[2] = {40,40};
+        Box<2,double> bx({0.25 + it.getSpacing(0)/4.0,0.25 + it.getSpacing(0)/4.0},{sz2[0]*it.getSpacing(0)/2.0 + 0.25 + it.getSpacing(0)/4.0, sz2[1]*it.getSpacing(0)/2.0 + 0.25 + it.getSpacing(0)/4.0});
+        openfpm::vector<size_t> rem;
+        auto it = domain.getDomainIterator();
+        while (it.isNext())
+        {
+            auto k = it.get();
+            Point<2,double> xp = domain.getPos(k);
+            if (bx.isInside(xp) == true)
+            {
+                rem.add(k.getKey());
+            }
+            ++it;
+        }
+        domain.remove(rem);
+        auto it2 = domain.getGridIterator(sz2);
+        while (it2.isNext()) {
+            domain.add();
+            auto key = it2.get();
+            double x = key.get(0) * spacing/2.0 + 0.25 + spacing/4.0;
+            domain.getLastPos()[0] = x;
+            double y = key.get(1) * spacing/2.0 + 0.25 + spacing/4.0;
+            domain.getLastPos()[1] = y;
+            ++it2;
+        }
+        }
+        // Add multi res patch 2
+        {
+        const size_t sz2[2] = {40,40};
+        Box<2,double> bx({0.25 + 21.0*spacing/8.0,0.25 + 21.0*spacing/8.0},{sz2[0]*spacing/4.0 + 0.25 + 21.0*spacing/8.0, sz2[1]*spacing/4.0 + 0.25 + 21*spacing/8.0});
+        openfpm::vector<size_t> rem;
+        auto it = domain.getDomainIterator();
+        while (it.isNext())
+        {
+            auto k = it.get();
+            Point<2,double> xp = domain.getPos(k);
+            if (bx.isInside(xp) == true)
+            {
+                rem.add(k.getKey());
+            }
+            ++it;
+        }
+        domain.remove(rem);
+        auto it2 = domain.getGridIterator(sz2);
+        while (it2.isNext()) {
+            domain.add();
+            auto key = it2.get();
+            double x = key.get(0) * spacing/4.0 + 0.25 + 21*spacing/8.0;
+            domain.getLastPos()[0] = x;
+            double y = key.get(1) * spacing/4.0 + 0.25 + 21*spacing/8.0;
+            domain.getLastPos()[1] = y;
+            ++it2;
+        }
+        }
+        ///////////////////////
+        BOOST_TEST_MESSAGE("Sync domain across processors...");
+        domain.ghost_get<0>();
+        Derivative_x_gpu Dx(domain, 2, rCut / 3.0 ,1.9/*,support_options::RADIUS*/);
+        Derivative_y_gpu Dy(domain, 2, rCut / 3.0 ,1.9/*,support_options::RADIUS*/);
+        Laplacian_gpu Lap(domain, 2, rCut / 3.0 ,1.9/*,support_options::RADIUS*/);
+        openfpm::vector<aggregate<int>> bulk;
+        openfpm::vector<aggregate<int>> up_p;
+        openfpm::vector<aggregate<int>> dw_p;
+        openfpm::vector<aggregate<int>> l_p;
+        openfpm::vector<aggregate<int>> r_p;
+        openfpm::vector<aggregate<int>> ref_p;
+        auto v = getV<0>(domain);
+        auto RHS=getV<1>(domain);
+        auto sol = getV<2>(domain);
+        auto anasol = getV<3>(domain);
+        auto err = getV<4>(domain);
+        auto DCPSE_sol=getV<5>(domain);
+        // Here fill me
+        Box<2, double> up({box.getLow(0) - spacing / 2.0, box.getHigh(1) - spacing / 2.0},
+                          {box.getHigh(0) + spacing / 2.0, box.getHigh(1) + spacing / 2.0});
+        Box<2, double> down({box.getLow(0) - spacing / 2.0, box.getLow(1) - spacing / 2.0},
+                            {box.getHigh(0) + spacing / 2.0, box.getLow(1) + spacing / 2.0});
+        Box<2, double> left({box.getLow(0) - spacing / 2.0, box.getLow(1) + spacing / 2.0},
+                            {box.getLow(0) + spacing / 2.0, box.getHigh(1) - spacing / 2.0});
+        Box<2, double> right({box.getHigh(0) - spacing / 2.0, box.getLow(1) + spacing / 2.0},
+                             {box.getHigh(0) + spacing / 2.0, box.getHigh(1) - spacing / 2.0});
+        openfpm::vector<Box<2, double>> boxes;
+        boxes.add(up);
+        boxes.add(down);
+        boxes.add(left);
+        boxes.add(right);
+        // Create a writer and write
+        VTKWriter<openfpm::vector<Box<2, double>>, VECTOR_BOX> vtk_box;
+        vtk_box.add(boxes);
+        //vtk_box.write("vtk_box.vtk");
+        auto it2 = domain.getDomainIterator();
+        while (it2.isNext()) {
+            auto p = it2.get();
+            Point<2, double> xp = domain.getPos(p);
+            if (up.isInside(xp) == true) {
+                up_p.add();
+                up_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) = -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p) = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+            } else if (down.isInside(xp) == true) {
+                dw_p.add();
+                dw_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p) = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+            } else if (left.isInside(xp) == true) {
+                l_p.add();
+                l_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p) = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+            } else if (right.isInside(xp) == true) {
+                r_p.add();
+                r_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p) = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+            } else {
+                bulk.add();
+                bulk.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p) = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+            }
+            ++it2;
+        }
+        domain.ghost_get<1,3>();
+        DCPSE_scheme_gpu<equations2d1_gpu,decltype(domain)> Solver( domain);
+        auto Poisson = Lap(v);
+        auto D_x = Dx(v);
+        auto D_y = Dy(v);
+        Solver.impose(Poisson, bulk, prop_id<1>());
+        Solver.impose(D_y, up_p, 0);
+        Solver.impose(D_x, r_p, 0);
+        Solver.impose(v, dw_p, 0);
+        Solver.impose(v, l_p, 0);
+        petsc_solver<double> solver;
+        solver.setPreconditioner(PCBJACOBI);
+        solver.setRestart(500);
+        Solver.solve_with_solver(solver,sol);
+        //solver.print_preconditioner();
+        domain.ghost_get<2>();
+        DCPSE_sol=Lap(sol);
+        domain.ghost_get<5>();
+        double worst1 = 0.0;
+        v=abs(DCPSE_sol-RHS);
+        for(int j=0;j<bulk.size();j++)
+        {   auto p=bulk.get<0>(j);
+            if (fabs(domain.getProp<3>(p) - domain.getProp<2>(p)) >= worst1) {
+                worst1 = fabs(domain.getProp<3>(p) - domain.getProp<2>(p));
+            }
+            domain.getProp<4>(p) = fabs(domain.getProp<3>(p) - domain.getProp<2>(p));
+        }
+        //std::cout << "Maximum Analytic Error: " << worst1 << std::endl;
+        //domain.ghost_get<4>();
+        //domain.write("Robin_anasol");
+        BOOST_REQUIRE(worst1 < 0.03);
+    }
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //81 0.00131586
+    //161 0.000328664
+    //320 8.30297e-05
+    //520 3.12398e-05
+    //1024 8.08087e-06
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    BOOST_AUTO_TEST_CASE(dcpse_poisson_Dirichlet_anal) {
+//  int rank;
+//  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+        const size_t sz[2] = {200,200};
+        Box<2, double> box({0, 0}, {1, 1});
+        size_t bc[2] = {NON_PERIODIC, NON_PERIODIC};
+        double spacing = box.getHigh(0) / (sz[0] - 1);
+        Ghost<2, double> ghost(spacing * 3.1);
+        double rCut = 3.1 * spacing;
+        BOOST_TEST_MESSAGE("Init vector_dist...");
+        vector_dist_gpu<2, double, aggregate<double,double,double,double,double,double>> domain(0, box, bc, ghost);
+        //Init_DCPSE(domain)
+        BOOST_TEST_MESSAGE("Init domain...");
+        auto it = domain.getGridIterator(sz);
+        while (it.isNext()) {
+            domain.add();
+            auto key = it.get();
+            double x = key.get(0) * it.getSpacing(0);
+            domain.getLastPos()[0] = x;
+            double y = key.get(1) * it.getSpacing(1);
+            domain.getLastPos()[1] = y;
+            ++it;
+        }
+        BOOST_TEST_MESSAGE("Sync domain across processors...");
+        domain.ghost_get<0>();
+        Derivative_x_gpu Dx(domain, 2, rCut,1.9,support_options::RADIUS);
+        Derivative_y_gpu Dy(domain, 2, rCut,1.9,support_options::RADIUS);
+        Laplacian_gpu Lap(domain, 2, rCut, 1.9,support_options::RADIUS);
+        openfpm::vector<aggregate<int>> bulk;
+        openfpm::vector<aggregate<int>> bulkF;
+        openfpm::vector<aggregate<int>> up_p;
+        openfpm::vector<aggregate<int>> dw_p;
+        openfpm::vector<aggregate<int>> l_p;
+        openfpm::vector<aggregate<int>> r_p;
+        openfpm::vector<aggregate<int>> ref_p;
+        auto v = getV<0>(domain);
+        auto RHS=getV<1>(domain);
+        auto sol = getV<2>(domain);
+        auto anasol = getV<3>(domain);
+        auto err = getV<4>(domain);
+        auto DCPSE_sol=getV<5>(domain);
+        // Here fill me
+        Box<2, double> up({box.getLow(0) - spacing / 2.0, box.getHigh(1) - spacing / 2.0},
+                          {box.getHigh(0) + spacing / 2.0, box.getHigh(1) + spacing / 2.0});
+        Box<2, double> down({box.getLow(0) - spacing / 2.0, box.getLow(1) - spacing / 2.0},
+                            {box.getHigh(0) + spacing / 2.0, box.getLow(1) + spacing / 2.0});
+        Box<2, double> left({box.getLow(0) - spacing / 2.0, box.getLow(1) + spacing / 2.0},
+                            {box.getLow(0) + spacing / 2.0, box.getHigh(1) - spacing / 2.0});
+        Box<2, double> right({box.getHigh(0) - spacing / 2.0, box.getLow(1) + spacing / 2.0},
+                             {box.getHigh(0) + spacing / 2.0, box.getHigh(1) - spacing / 2.0});
+        openfpm::vector<Box<2, double>> boxes;
+        boxes.add(up);
+        boxes.add(down);
+        boxes.add(left);
+        boxes.add(right);
+        // Create a writer and write
+        VTKWriter<openfpm::vector<Box<2, double>>, VECTOR_BOX> vtk_box;
+        vtk_box.add(boxes);
+        //vtk_box.write("vtk_box.vtk");
+        auto it2 = domain.getDomainIterator();
+        while (it2.isNext()) {
+            auto p = it2.get();
+            Point<2, double> xp = domain.getPos(p);
+            if (up.isInside(xp) == true) {
+                up_p.add();
+                up_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) = -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p) = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                bulkF.add();
+                bulkF.last().get<0>() = p.getKey();
+            } else if (down.isInside(xp) == true) {
+                dw_p.add();
+                dw_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p) = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                bulkF.add();
+                bulkF.last().get<0>() = p.getKey();
+            } else if (left.isInside(xp) == true) {
+                l_p.add();
+                l_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p) = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                bulkF.add();
+                bulkF.last().get<0>() = p.getKey();
+            } else if (right.isInside(xp) == true) {
+                r_p.add();
+                r_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p) = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                bulkF.add();
+                bulkF.last().get<0>() = p.getKey();
+            } else {
+                bulk.add();
+                bulk.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p) = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                bulkF.add();
+                bulkF.last().get<0>() = p.getKey();
+            }
+            ++it2;
+        }
+        DCPSE_scheme_gpu<equations2d1_gpu,decltype(domain)> Solver( domain);
+        auto Poisson = Lap(v);
+        auto D_x = Dx(v);
+        auto D_y = Dy(v);
+        Solver.impose(Poisson, bulk, prop_id<1>());
+        Solver.impose(v, up_p, prop_id<1>());
+        Solver.impose(v, r_p, prop_id<1>());
+        Solver.impose(v, dw_p, prop_id<1>());
+        Solver.impose(v, l_p, prop_id<1>());
+        Solver.solve(sol);
+        DCPSE_sol=Lap(sol);
+        for (int j = 0; j < up_p.size(); j++) {
+            auto p = up_p.get<0>(j);
+            domain.getProp<5>(p) = 0;
+        }
+        for (int j = 0; j < dw_p.size(); j++) {
+            auto p = dw_p.get<0>(j);
+            domain.getProp<5>(p) = 0;
+        }
+        for (int j = 0; j < l_p.size(); j++) {
+            auto p = l_p.get<0>(j);
+            domain.getProp<5>(p) = 0;
+        }
+        for (int j = 0; j < r_p.size(); j++) {
+            auto p = r_p.get<0>(j);
+            domain.getProp<5>(p) = 0;
+        }
+        double worst1 = 0.0;
+        v=abs(DCPSE_sol-RHS);
+        for(int j=0;j<bulkF.size();j++)
+        {   auto p=bulkF.get<0>(j);
+            if (fabs(domain.getProp<3>(p) - domain.getProp<2>(p)) >= worst1) {
+                worst1 = fabs(domain.getProp<3>(p) - domain.getProp<2>(p));
+            }
+            domain.getProp<4>(p) = fabs(domain.getProp<3>(p) - domain.getProp<2>(p));
+        }
+       // std::cout << "Maximum Analytic Error: " << worst1 << std::endl;
+        BOOST_REQUIRE(worst1 < 0.03);
+       // domain.write("Dirichlet_anasol");
+    }
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    BOOST_AUTO_TEST_CASE(dcpse_poisson_Periodic) {
+        //
+        //  int rank;
+        //  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+        const size_t sz[2] = {31,31};
+        Box<2, double> box({0, 0}, {1.0, 1.0});
+        size_t bc[2] = {PERIODIC, NON_PERIODIC};
+        double spacing = box.getHigh(0) / (sz[0] - 1);
+        Ghost<2, double> ghost(spacing * 3.1);
+        double rCut = 3.1 * spacing;
+        BOOST_TEST_MESSAGE("Init vector_dist...");
+        vector_dist_gpu<2, double, aggregate<double,double,double,double,double,VectorS<2, double>>> domain(0, box, bc, ghost);
+        //Init_DCPSE(domain)
+        BOOST_TEST_MESSAGE("Init domain...");
+        auto it = domain.getGridIterator(sz);
+        while (it.isNext()) {
+            domain.add();
+            auto key = it.get();
+            double x = key.get(0) * it.getSpacing(0);
+            domain.getLastPos()[0] = x;
+            double y = key.get(1) * it.getSpacing(1)*0.99999;
+            domain.getLastPos()[1] = y;
+            ++it;
+        }
+        BOOST_TEST_MESSAGE("Sync domain across processors...");
+        domain.ghost_get<0>();
+        Laplacian_gpu Lap(domain, 2, rCut, 1.9, support_options::RADIUS);
+        DCPSE_scheme_gpu<equations2d1p_gpu,decltype(domain)> Solver( domain);
+        openfpm::vector<aggregate<int>> bulk;
+        openfpm::vector<aggregate<int>> up_p;
+        openfpm::vector<aggregate<int>> dw_p;
+        openfpm::vector<aggregate<int>> l_p;
+        openfpm::vector<aggregate<int>> r_p;
+        auto v = getV<0>(domain);
+        auto RHS=getV<1>(domain);
+        auto sol = getV<2>(domain);
+        auto anasol = getV<3>(domain);
+        auto err = getV<4>(domain);
+        auto u = getV<5>(domain);
+        // Here fill me
+        Box<2, double> up({box.getLow(0) - spacing / 2.0, box.getHigh(1) - spacing / 2.0},
+                          {box.getHigh(0) + spacing / 2.0, box.getHigh(1) + spacing / 2.0});
+        Box<2, double> down({box.getLow(0) - spacing / 2.0, box.getLow(1) - spacing / 2.0},
+                            {box.getHigh(0) + spacing / 2.0, box.getLow(1) + spacing / 2.0});
+        Box<2, double> left({box.getLow(0) - spacing / 2.0, box.getLow(1) + spacing / 2.0},
+                            {box.getLow(0) + spacing / 2.0, box.getHigh(1) - spacing / 2.0});
+        Box<2, double> right({box.getHigh(0) - spacing / 2.0, box.getLow(1) + spacing / 2.0},
+                             {box.getHigh(0) + spacing / 2.0, box.getHigh(1) - spacing / 2.0});
+        openfpm::vector<Box<2, double>> boxes;
+        boxes.add(up);
+        boxes.add(down);
+        boxes.add(left);
+        boxes.add(right);
+        // Create a writer and write
+        VTKWriter<openfpm::vector<Box<2, double>>, VECTOR_BOX> vtk_box;
+        vtk_box.add(boxes);
+        //vtk_box.write("vtk_box.vtk");
+        auto it2 = domain.getDomainIterator();
+        while (it2.isNext()) {
+            auto p = it2.get();
+            Point<2, double> xp = domain.getPos(p);
+            //domain.getProp<3>(p)=1+xp[0]*xp[0]+2*xp[1]*xp[1];
+            if (up.isInside(xp) == true) {
+                up_p.add();
+                up_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  0;
+            } else if (down.isInside(xp) == true) {
+                dw_p.add();
+                dw_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  0;
+            } else {
+                bulk.add();
+                bulk.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) = xp.get(0)*sin(5*M_PI*xp.get(1))+exp(-((xp.get(0)-0.5)*(xp.get(0)-0.5)+(xp.get(1)-0.5)*(xp.get(1)-0.5))/0.02);
+            }
+            ++it2;
+        }
+        domain.ghost_get<1>();
+        auto Poisson = -Lap(v);
+        Solver.impose(Poisson, bulk, prop_id<1>());
+        Solver.impose(v, up_p, 0);
+        Solver.impose(v, dw_p, 0);
+        Solver.solve(v);
+        domain.ghost_get<0>();
+        anasol=-Lap(v);
+        double worst1 = 0.0;
+        for(int j=0;j<bulk.size();j++)
+        {   auto p=bulk.get<0>(j);
+            if (fabs(domain.getProp<3>(p) - domain.getProp<1>(p)) >= worst1) {
+                worst1 = fabs(domain.getProp<3>(p) - domain.getProp<1>(p));
+            }
+            domain.getProp<4>(p) = fabs(domain.getProp<1>(p) - domain.getProp<3>(p));
+        }
+        //Auto Error
+        BOOST_REQUIRE(worst1 < 1.0);
+        //domain.write("Poisson_Periodic");
+    }
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    BOOST_AUTO_TEST_CASE(dcpse_poisson_Robin) {
+        //
+//  int rank;
+//  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+        const size_t sz[2] = {31,31};
+        Box<2, double> box({0, 0}, {1.0, 1.0});
+        size_t bc[2] = {NON_PERIODIC, NON_PERIODIC};
+        double spacing = box.getHigh(0) / (sz[0] - 1);
+        Ghost<2, double> ghost(spacing * 3);
+        double rCut = 3.1 * spacing;
+        BOOST_TEST_MESSAGE("Init vector_dist...");
+        vector_dist_gpu<2, double, aggregate<double,double,double,double,double,VectorS<2, double>>> domain(0, box, bc, ghost);
+        //Init_DCPSE(domain)
+        BOOST_TEST_MESSAGE("Init domain...");
+        auto it = domain.getGridIterator(sz);
+        while (it.isNext()) {
+            domain.add();
+            auto key = it.get();
+            double x = key.get(0) * it.getSpacing(0);
+            domain.getLastPos()[0] = x;
+            double y = key.get(1) * it.getSpacing(1);
+            domain.getLastPos()[1] = y;
+            ++it;
+        }
+        BOOST_TEST_MESSAGE("Sync domain across processors...");
+        domain.ghost_get<0>();
+        Derivative_y_gpu Dy(domain, 2, rCut);
+        Laplacian_gpu Lap(domain, 2, rCut);
+        DCPSE_scheme_gpu<equations2d1_gpu,decltype(domain)> Solver(domain);
+        openfpm::vector<aggregate<int>> bulk;
+        openfpm::vector<aggregate<int>> up_p;
+        openfpm::vector<aggregate<int>> dw_p;
+        openfpm::vector<aggregate<int>> l_p;
+        openfpm::vector<aggregate<int>> r_p;
+        auto v = getV<0>(domain);
+        auto sol = getV<2>(domain);
+        auto anasol = getV<3>(domain);
+        auto err = getV<4>(domain);
+        auto u = getV<5>(domain);
+        // Here fill me
+        Box<2, double> up({box.getLow(0) - spacing / 2.0, box.getHigh(1) - spacing / 2.0},
+                          {box.getHigh(0) + spacing / 2.0, box.getHigh(1) + spacing / 2.0});
+        Box<2, double> down({box.getLow(0) - spacing / 2.0, box.getLow(1) - spacing / 2.0},
+                            {box.getHigh(0) + spacing / 2.0, box.getLow(1) + spacing / 2.0});
+        Box<2, double> left({box.getLow(0) - spacing / 2.0, box.getLow(1) + spacing / 2.0},
+                            {box.getLow(0) + spacing / 2.0, box.getHigh(1) - spacing / 2.0});
+        Box<2, double> right({box.getHigh(0) - spacing / 2.0, box.getLow(1) + spacing / 2.0},
+                             {box.getHigh(0) + spacing / 2.0, box.getHigh(1) - spacing / 2.0});
+        openfpm::vector<Box<2, double>> boxes;
+        boxes.add(up);
+        boxes.add(down);
+        boxes.add(left);
+        boxes.add(right);
+        // Create a writer and write
+        VTKWriter<openfpm::vector<Box<2, double>>, VECTOR_BOX> vtk_box;
+        vtk_box.add(boxes);
+        //vtk_box.write("vtk_box.vtk");
+        auto it2 = domain.getDomainIterator();
+        while (it2.isNext()) {
+            auto p = it2.get();
+            Point<2, double> xp = domain.getPos(p);
+            //domain.getProp<3>(p)=1+xp[0]*xp[0]+2*xp[1]*xp[1];
+            if (up.isInside(xp) == true) {
+                up_p.add();
+                up_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  sin(5.0*xp.get(0));
+            } else if (down.isInside(xp) == true) {
+                dw_p.add();
+                dw_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  sin(5.0*xp.get(0));
+            } else if (left.isInside(xp) == true) {
+                l_p.add();
+                l_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  sin(5.0*xp.get(0));
+            } else if (right.isInside(xp) == true) {
+                r_p.add();
+                r_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  sin(5.0*xp.get(0));
+            } else {
+                bulk.add();
+                bulk.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  -10.0*exp(-((xp.get(0)-0.5)*(xp.get(0)-0.5)+(xp.get(1)-0.5)*(xp.get(1)-0.5))/0.02);
+            }
+            ++it2;
+        }
+        petsc_solver<double> pet_sol;
+        pet_sol.setPreconditioner(PCNONE);
+        auto Poisson = Lap(v);
+        auto D_y = Dy(v);
+        Solver.impose(Poisson, bulk, prop_id<1>());
+        Solver.impose(D_y, up_p, prop_id<1>());
+        Solver.impose(-D_y, dw_p, prop_id<1>());
+        Solver.impose(v, l_p, 0);
+        Solver.impose(v, r_p, 0);
+        Solver.solve_with_solver(pet_sol,sol);
+        domain.ghost_get<2>();
+        anasol=Lap(sol);
+        double worst1 = 0.0;
+        for(int j=0;j<bulk.size();j++)
+        {   auto p=bulk.get<0>(j);
+            if (fabs(domain.getProp<3>(p) - domain.getProp<1>(p)) >= worst1) {
+                worst1 = fabs(domain.getProp<3>(p) - domain.getProp<1>(p));
+            }
+            domain.getProp<4>(p) = fabs(domain.getProp<1>(p) - domain.getProp<3>(p));
+        }
+        //Auto Error
+        BOOST_REQUIRE(worst1 < 1.0);
+        //std::cout << "WORST: " << worst1 << std::endl;
+        //domain.write("Mixed");
+    }
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    BOOST_AUTO_TEST_CASE(dcpse_poisson_Neumann) {
+    //
+//  int rank;
+//  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+        const size_t sz[2] = {31,31};
+        Box<2, double> box({0, 0}, {1.0, 1.0});
+        size_t bc[2] = {NON_PERIODIC, NON_PERIODIC};
+        double spacing = box.getHigh(0) / (sz[0] - 1);
+        double rCut = 3.1 * spacing;
+        Ghost<2, double> ghost(spacing * 3.1);
+        BOOST_TEST_MESSAGE("Init vector_dist...");
+        vector_dist_gpu<2, double, aggregate<double,double,double,double,double>> domain(0, box, bc, ghost);
+        //Init_DCPSE(domain)
+        BOOST_TEST_MESSAGE("Init domain...");
+        auto it = domain.getGridIterator(sz);
+        while (it.isNext()) {
+            domain.add();
+            auto key = it.get();
+            double x = key.get(0) * it.getSpacing(0);
+            domain.getLastPos()[0] = x;
+            double y = key.get(1) * it.getSpacing(1);
+            domain.getLastPos()[1] = y;
+            ++it;
+        }
+        BOOST_TEST_MESSAGE("Sync domain across processors...");
+        domain.ghost_get<0>();
+        Derivative_x_gpu Dx(domain, 2, rCut);
+        Derivative_y_gpu Dy(domain, 2, rCut);
+        Laplacian_gpu Lap(domain, 2, rCut);
+        petsc_solver<double> solver;
+        solver.setRestart(500);
+        solver.setSolver(KSPGMRES);
+        solver.setPreconditioner(PCSVD);
+        openfpm::vector<aggregate<int>> bulk;
+        openfpm::vector<aggregate<int>> up_p;
+        openfpm::vector<aggregate<int>> dw_p;
+        openfpm::vector<aggregate<int>> l_p;
+        openfpm::vector<aggregate<int>> r_p;
+        auto v = getV<0>(domain);
+        //auto RHS=getV<1>(domain);
+        auto sol = getV<2>(domain);
+        auto anasol = getV<3>(domain);
+        auto err = getV<4>(domain);
+        // Here fill me
+        Box<2, double> up({box.getLow(0) - spacing / 2.0, box.getHigh(1) - spacing / 2.0},
+                          {box.getHigh(0) + spacing / 2.0, box.getHigh(1) + spacing / 2.0});
+        Box<2, double> down({box.getLow(0) - spacing / 2.0, box.getLow(1) - spacing / 2.0},
+                            {box.getHigh(0) + spacing / 2.0, box.getLow(1) + spacing / 2.0});
+        Box<2, double> left({box.getLow(0) - spacing / 2.0, box.getLow(1) + spacing / 2.0},
+                            {box.getLow(0) + spacing / 2.0, box.getHigh(1) - spacing / 2.0});
+        Box<2, double> right({box.getHigh(0) - spacing / 2.0, box.getLow(1) + spacing / 2.0},
+                             {box.getHigh(0) + spacing / 2.0, box.getHigh(1) - spacing / 2.0});
+        openfpm::vector<Box<2, double>> boxes;
+        boxes.add(up);
+        boxes.add(down);
+        boxes.add(left);
+        boxes.add(right);
+        // Create a writer and write
+        VTKWriter<openfpm::vector<Box<2, double>>, VECTOR_BOX> vtk_box;
+        vtk_box.add(boxes);
+        //vtk_box.write("vtk_box.vtk");
+        auto it2 = domain.getDomainIterator();
+        while (it2.isNext()) {
+            auto p = it2.get();
+            Point<2, double> xp = domain.getPos(p);
+            //domain.getProp<3>(p)=1+xp[0]*xp[0]+2*xp[1]*xp[1];
+            if (up.isInside(xp) == true) {
+                up_p.add();
+                up_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  sin(5*xp.get(0));
+            } else if (down.isInside(xp) == true) {
+                dw_p.add();
+                dw_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  sin(5*xp.get(0));
+            } else if (left.isInside(xp) == true) {
+                l_p.add();
+                l_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  sin(5*xp.get(0));
+            } else if (right.isInside(xp) == true) {
+                r_p.add();
+                r_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  sin(5*xp.get(0));
+            } else {
+                bulk.add();
+                bulk.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  -10*exp(-((xp.get(0)-0.5)*(xp.get(0)-0.5)+(xp.get(1)-0.5)*(xp.get(1)-0.5))/0.02);
+            }
+            ++it2;
+        }
+        DCPSE_scheme_gpu<equations2d1_gpu,decltype(domain)> Solver(domain,options_solver::LAGRANGE_MULTIPLIER);
+        auto Poisson = -Lap(v);
+        auto D_x = Dx(v);
+        auto D_y = Dy(v);
+        Solver.impose(Poisson, bulk, prop_id<1>());
+        Solver.impose(D_y, up_p, prop_id<1>());
+        Solver.impose(-D_y, dw_p, prop_id<1>());
+        Solver.impose(-D_x, l_p, prop_id<1>());
+        Solver.impose(D_x, r_p, prop_id<1>());
+        Solver.solve_with_solver(solver,sol);
+//       Solver.solve(sol);
+        domain.ghost_get<2>();
+        anasol=-Lap(sol);
+        double worst1 = 0.0;
+        for(int j=0;j<bulk.size();j++)
+        {   auto p=bulk.get<0>(j);
+            if (fabs(domain.getProp<3>(p) - domain.getProp<1>(p)) >= worst1) {
+                worst1 = fabs(domain.getProp<3>(p) - domain.getProp<1>(p));
+            }
+            domain.getProp<4>(p) = fabs(domain.getProp<1>(p) - domain.getProp<3>(p));
+        }
+        //Auto Error
+        BOOST_REQUIRE(worst1 < 1.0);
+        //domain.write("Neumann");
+    }
+    BOOST_AUTO_TEST_CASE(dcpse_slice_solver) {
+//  int rank;
+//  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+        const size_t sz[2] = {31,31};
+        Box<2, double> box({0, 0}, {1, 1});
+        size_t bc[2] = {NON_PERIODIC, NON_PERIODIC};
+        double spacing = box.getHigh(0) / (sz[0] - 1);
+        double rCut = 3.1 * spacing;
+        Ghost<2, double> ghost(rCut);
+        BOOST_TEST_MESSAGE("Init vector_dist...");
+        vector_dist_gpu<2, double, aggregate<VectorS<2, double>,VectorS<2, double>,VectorS<2, double>,VectorS<2, double>,VectorS<2, double>,VectorS<2, double>>> domain(0, box, bc, ghost);
+        //Init_DCPSE(domain)
+        BOOST_TEST_MESSAGE("Init domain...");
+        auto it = domain.getGridIterator(sz);
+        while (it.isNext()) {
+            domain.add();
+            auto key = it.get();
+            double x = key.get(0) * it.getSpacing(0);
+            domain.getLastPos()[0] = x;
+            double y = key.get(1) * it.getSpacing(1);
+            domain.getLastPos()[1] = y;
+            ++it;
+        }
+        BOOST_TEST_MESSAGE("Sync domain across processors...");
+        domain.ghost_get<0>();
+        Derivative_x_gpu Dx(domain, 2, rCut,1.9,support_options::RADIUS);
+        Derivative_y_gpu Dy(domain, 2, rCut,1.9,support_options::RADIUS);
+        Laplacian_gpu Lap(domain, 2, rCut,1.9,support_options::RADIUS);
+        openfpm::vector<aggregate<int>> bulk;
+        openfpm::vector<aggregate<int>> up_p;
+        openfpm::vector<aggregate<int>> dw_p;
+        openfpm::vector<aggregate<int>> l_p;
+        openfpm::vector<aggregate<int>> r_p;
+        openfpm::vector<aggregate<int>> ref_p;
+        auto v = getV<0>(domain);
+        auto RHS=getV<1>(domain);
+        auto sol = getV<2>(domain);
+        auto anasol = getV<3>(domain);
+        auto err = getV<4>(domain);
+        auto DCPSE_sol=getV<5>(domain);
+        // Here fill me
+        Box<2, double> up({box.getLow(0) - spacing / 2.0, box.getHigh(1) - spacing / 2.0},
+                          {box.getHigh(0) + spacing / 2.0, box.getHigh(1) + spacing / 2.0});
+        Box<2, double> down({box.getLow(0) - spacing / 2.0, box.getLow(1) - spacing / 2.0},
+                            {box.getHigh(0) + spacing / 2.0, box.getLow(1) + spacing / 2.0});
+        Box<2, double> left({box.getLow(0) - spacing / 2.0, box.getLow(1) + spacing / 2.0},
+                            {box.getLow(0) + spacing / 2.0, box.getHigh(1) - spacing / 2.0});
+        Box<2, double> right({box.getHigh(0) - spacing / 2.0, box.getLow(1) + spacing / 2.0},
+                             {box.getHigh(0) + spacing / 2.0, box.getHigh(1) - spacing / 2.0});
+        openfpm::vector<Box<2, double>> boxes;
+        boxes.add(up);
+        boxes.add(down);
+        boxes.add(left);
+        boxes.add(right);
+        // Create a writer and write
+        VTKWriter<openfpm::vector<Box<2, double>>, VECTOR_BOX> vtk_box;
+        vtk_box.add(boxes);
+        //vtk_box.write("vtk_box.vtk");
+       // domain.write("Slice_anasol");
+        auto it2 = domain.getDomainIterator();
+        while (it2.isNext()) {
+            auto p = it2.get();
+            Point<2, double> xp = domain.getPos(p);
+            if (up.isInside(xp) == true) {
+                up_p.add();
+                up_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p)[0] = -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p)[0] = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<1>(p)[1] = -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p)[1] = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+            } else if (down.isInside(xp) == true) {
+                dw_p.add();
+                dw_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p)[0] = -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p)[0] = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<1>(p)[1] = -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p)[1] = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+            } else if (left.isInside(xp) == true) {
+                l_p.add();
+                l_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p)[0] = -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p)[0] = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<1>(p)[1] = -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p)[1] = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+            } else if (right.isInside(xp) == true) {
+                r_p.add();
+                r_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p)[0] = -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p)[0] = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<1>(p)[1] = -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p)[1] = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+            } else {
+                bulk.add();
+                bulk.last().get<0>() = p.getKey();
+                domain.getProp<1>(p)[0] = -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p)[0] = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<1>(p)[1] = -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p)[1] = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+            }
+            ++it2;
+        }
+        eq_id vx,vy;
+        vx.setId(0);
+        vy.setId(1);
+        DCPSE_scheme_gpu<equations2d2_gpu,decltype(domain)> Solver( domain);
+        auto Poisson0 = Lap(v[0]);
+        auto Poisson1 = Lap(v[1]);
+        //auto D_x = Dx(v[1]);
+        //auto D_y = Dy(v[1]);
+        Solver.impose(Poisson0, bulk, RHS[0],vx);
+        Solver.impose(Poisson1, bulk, RHS[1],vy);
+        Solver.impose(v[0], up_p, RHS[0],vx);
+        Solver.impose(v[1], up_p, RHS[1],vy);
+        Solver.impose(v[0], r_p,  RHS[0],vx);
+        Solver.impose(v[1], r_p,  RHS[1],vy);
+        Solver.impose(v[0], dw_p, RHS[0],vx);
+        Solver.impose(v[1], dw_p, RHS[1],vy);
+        Solver.impose(v[0], l_p,  RHS[0],vx);
+        Solver.impose(v[1], l_p,  RHS[1],vy);
+        Solver.solve(sol[0],sol[1]);
+        DCPSE_sol=Lap(sol);
+        double worst1 = 0.0;
+        double worst2 = 0.0;
+        v=sol-RHS;
+        for(int j=0;j<bulk.size();j++)
+        {   auto p=bulk.get<0>(j);
+            if (fabs(domain.getProp<3>(p)[0] - domain.getProp<2>(p)[0]) >= worst1) {
+                worst1 = fabs(domain.getProp<3>(p)[0] - domain.getProp<2>(p)[0]);
+            }
+            domain.getProp<4>(p)[0] = fabs(domain.getProp<3>(p)[0] - domain.getProp<2>(p)[0]);
+        }
+        for(int j=0;j<bulk.size();j++)
+        {   auto p=bulk.get<0>(j);
+            if (fabs(domain.getProp<3>(p)[1] - domain.getProp<2>(p)[1]) >= worst2) {
+                worst2 = fabs(domain.getProp<3>(p)[1] - domain.getProp<2>(p)[1]);
+            }
+            domain.getProp<4>(p)[1] = fabs(domain.getProp<3>(p)[1] - domain.getProp<2>(p)[1]);
+        }
+        //std::cout << "Maximum Analytic Error in slice x: " << worst1 << std::endl;
+        //std::cout << "Maximum Analytic Error in slice y: " << worst2 << std::endl;
+        //domain.write("Slice_anasol");
+        BOOST_REQUIRE(worst1 < 0.03);
+        BOOST_REQUIRE(worst2 < 0.03);
+    }
diff --git a/src/DCPSE/DCPSE_op/tests/DCPSE_op_Surface_tests.cpp b/src/DCPSE/DCPSE_op/tests/DCPSE_op_Surface_tests.cpp
new file mode 100644
index 00000000..1cbe5ee1
--- /dev/null
+++ b/src/DCPSE/DCPSE_op/tests/DCPSE_op_Surface_tests.cpp
@@ -0,0 +1,1112 @@
+// Created by Abhinav Singh on 15.11.21.
+#include "config.h"
+#ifdef HAVE_EIGEN
+#ifdef HAVE_PETSC
+#include "util/util_debug.hpp"
+#include <boost/test/unit_test.hpp>
+#include <iostream>
+#include "../DCPSE_surface_op.hpp"
+#include "../DCPSE_Solver.hpp"
+#include "Operators/Vector/vector_dist_operators.hpp"
+#include "Vector/vector_dist_subset.hpp"
+#include <iostream>
+#include "util/SphericalHarmonics.hpp"
+    BOOST_AUTO_TEST_CASE(dcpse_surface_simple) {
+        double boxP1{-1.5}, boxP2{1.5};
+        double boxSize{boxP2 - boxP1};
+        size_t n=256;
+        size_t sz[2] = {n,n};
+        double grid_spacing{boxSize/(sz[0]-1)};
+        double rCut{3.9 * grid_spacing};
+        Box<2,double> domain{{boxP1,boxP1},{boxP2,boxP2}};
+        size_t bc[2] = {NON_PERIODIC,NON_PERIODIC};
+        Ghost<2,double> ghost{rCut + grid_spacing/8.0};
+        auto &v_cl=create_vcluster();
+        vector_dist_ws<2, double, aggregate<double,double,double[2],double,double[2]>> Sparticles(0, domain,bc,ghost);
+        //Init_DCPSE(domain)
+        BOOST_TEST_MESSAGE("Init domain...");
+        // 1. Particles on a line
+        if (v_cl.rank() == 0) {
+            for (int i = 0; i < n; ++i) {
+                double xp = -1.5+i*grid_spacing;
+                Sparticles.add();
+                Sparticles.getLastPos()[0] = xp;
+                Sparticles.getLastPos()[1] = 0;
+                Sparticles.getLastProp<3>() = std::sin(xp);
+                Sparticles.getLastProp<2>()[0] = 0;
+                Sparticles.getLastProp<2>()[1] = 1.0;
+                Sparticles.getLastProp<1>() = -std::sin(xp);//sin(theta)*exp(-finalT/(radius*radius));
+                Sparticles.getLastSubset(0);
+            }
+        }
+        BOOST_TEST_MESSAGE("Sync domain across processors...");
+        Sparticles.ghost_get<0,3>();
+        //Sparticles.write("Sparticles");
+        //Here template parameters are Normal property no.
+        SurfaceDerivative_xx<2> SDxx(Sparticles, 2, rCut,grid_spacing);
+        SurfaceDerivative_yy<2> SDyy(Sparticles, 2, rCut,grid_spacing);
+        //SurfaceDerivative_x<2> SDx(Sparticles, 4, rCut,grid_spacing);
+        //SurfaceDerivative_y<2> SDy(Sparticles, 4, rCut,grid_spacing);
+        auto INICONC = getV<3>(Sparticles);
+        auto CONC = getV<0>(Sparticles);
+        auto TEMP = getV<4>(Sparticles);
+        auto normal = getV<2>(Sparticles);
+        //auto ANASOL = getV<1>(domain);
+        //TEMP[0]=(-normal[0]*normal[0]+1.0) * SDx(INICONC) - normal[0]*normal[1] * SDy(INICONC);
+        //TEMP[1]=(-normal[1]*normal[1]+1.0) * SDy(INICONC) - normal[0]*normal[1] * SDx(INICONC);
+        //Sparticles.ghost_get<4>();
+        //CONC=SDxx(TEMP[0]) + SDyy(TEMP[1]);
+        auto it2 = Sparticles.getDomainIterator();
+        double worst = 0.0;
+        while (it2.isNext()) {
+            auto p = it2.get();
+            if (fabs(Sparticles.getProp<1>(p) - Sparticles.getProp<0>(p)) > worst) {
+                worst = fabs(Sparticles.getProp<1>(p) - Sparticles.getProp<0>(p));
+            }
+            ++it2;
+        }
+        Sparticles.deleteGhost();
+        //std::cout<<v_cl.rank()<<":WORST:"<<worst<<std::endl;
+        //Sparticles.write("Sparticles");
+        BOOST_REQUIRE(worst < 0.03);
+    BOOST_AUTO_TEST_CASE(dcpse_surface_circle) {
+        double boxP1{-1.5}, boxP2{1.5};
+        double boxSize{boxP2 - boxP1};
+        size_t n=512;
+        auto &v_cl=create_vcluster();
+        //std::cout<<v_cl.rank()<<":Enter res: "<<std::endl;
+        //std::cin>>n;
+        size_t sz[2] = {n,n};
+        double grid_spacing{boxSize/(sz[0]-1)};
+        double rCut{5.1 * grid_spacing};
+        Box<2,double> domain{{boxP1,boxP1},{boxP2,boxP2}};
+        size_t bc[2] = {NON_PERIODIC,NON_PERIODIC};
+        Ghost<2,double> ghost{rCut + grid_spacing/8.0};
+        vector_dist_ws<2, double, aggregate<double,double,double[2],double,double[2],double>> Sparticles(0, domain,bc,ghost);
+        //Init_DCPSE(domain)
+        BOOST_TEST_MESSAGE("Init domain...");
+        // Surface prameters
+        const double radius{1.0};
+        std::array<double,2> center{0.0,0.0};
+        Point<2,double> coord;
+        const double pi{3.14159265358979323846};
+        // 1. Particles on surface
+        double theta{0.0};
+        double dtheta{2*pi/double(n)};
+        if (v_cl.rank() == 0) {
+            for (int i = 0; i < n; ++i) {
+                coord[0] = center[0] + radius * std::cos(theta);
+                coord[1] = center[1] + radius * std::sin(theta);
+                Sparticles.add();
+                Sparticles.getLastPos()[0] = coord[0];
+                Sparticles.getLastPos()[1] = coord[1];
+                Sparticles.getLastProp<3>() = std::sin(theta);
+                Sparticles.getLastProp<2>()[0] = std::cos(theta);
+                Sparticles.getLastProp<2>()[1] = std::sin(theta);
+                Sparticles.getLastProp<1>() = -std::sin(theta);;//sin(theta)*exp(-finalT/(radius*radius));
+                Sparticles.getLastSubset(0);
+                theta += dtheta;
+            }
+        }
+        BOOST_TEST_MESSAGE("Sync domain across processors...");
+        Sparticles.ghost_get<0,3>();
+        //Sparticles.write("Sparticles");
+        //Here template parameters are Normal property no.
+        SurfaceDerivative_xx<2> SDxx(Sparticles, 2, rCut,grid_spacing);
+        SurfaceDerivative_yy<2> SDyy(Sparticles, 2, rCut,grid_spacing);
+        //SurfaceDerivative_xy<2> SDxy(Sparticles, 3, rCut,grid_spacing);
+        //SurfaceDerivative_x<2> SDx(Sparticles, 3, rCut,grid_spacing);
+        //SurfaceDerivative_y<2> SDy(Sparticles, 3, rCut,grid_spacing);
+        auto INICONC = getV<3>(Sparticles);
+        auto CONC = getV<0>(Sparticles);
+        auto TEMP = getV<4>(Sparticles);
+        auto normal = getV<2>(Sparticles);
+        //TEMP[0]=(-normal[0]*normal[0]+1.0) * SDx(INICONC) - normal[0]*normal[1] * SDy(INICONC);
+        //TEMP[1]=(-normal[1]*normal[1]+1.0) * SDy(INICONC) - normal[0]*normal[1] * SDx(INICONC);
+        //TEMP[0]=(-normal[0]*normal[0]+1.0);
+        //TEMP[1]=normal[0]*normal[1];
+        //Sparticles.ghost_get<2,4>();
+        //CONC=SDx(TEMP[0]) + SDy(TEMP[1]);
+        //        (SDy((-normal[1]*normal[1]+1.0))*SDy(INICONC)+(-normal[1]*normal[1]+1.0)*SDyy(INICONC)-(SDy(TEMP[1])*SDx(INICONC)+TEMP[1]*SDxy(INICONC)));
+        auto it2 = Sparticles.getDomainIterator();
+        double worst = 0.0;
+        while (it2.isNext()) {
+            auto p = it2.get();
+            Sparticles.getProp<5>(p) = fabs(Sparticles.getProp<1>(p) - Sparticles.getProp<0>(p));
+            if (fabs(Sparticles.getProp<1>(p) - Sparticles.getProp<0>(p)) > worst) {
+                worst = fabs(Sparticles.getProp<1>(p) - Sparticles.getProp<0>(p));
+            }
+            ++it2;
+        }
+        Sparticles.deleteGhost();
+        //Sparticles.write("Sparticles");
+        //std::cout<<worst;
+        BOOST_REQUIRE(worst < 0.03);
+    BOOST_AUTO_TEST_CASE(dcpse_surface_solver_circle) {
+        double boxP1{-1.5}, boxP2{1.5};
+        double boxSize{boxP2 - boxP1};
+        size_t n=512,k=2;
+        auto &v_cl=create_vcluster();
+        /*if(v_cl.rank()==0)
+        std::cout<<v_cl.rank()<<":Enter res: "<<std::endl;
+        std::cin>>n;
+        if(v_cl.rank()==0)
+        std::cout<<v_cl.rank()<<":Enter Freq: "<<std::endl;
+        std::cin>>k;*/
+        size_t sz[2] = {n,n};
+        double grid_spacing{boxSize/(sz[0]-1)};
+        double rCut{3.9 * grid_spacing};
+        Box<2,double> domain{{boxP1,boxP1},{boxP2,boxP2}};
+        size_t bc[2] = {NON_PERIODIC,NON_PERIODIC};
+        Ghost<2,double> ghost{rCut + grid_spacing/8.0};
+        vector_dist_ws<2, double, aggregate<double,double,double[2],double,double[2],double>> Sparticles(0, domain,bc,ghost);
+        //Init_DCPSE(domain)
+        BOOST_TEST_MESSAGE("Init domain...");
+        // Surface prameters
+        const double radius{1.0};
+        std::array<double,2> center{0.0,0.0};
+        Point<2,double> coord;
+        const double pi{3.14159265358979323846};
+        // 1. Particles on surface
+        double theta{0.0};
+        double dtheta{2*pi/double(n)};
+        if (v_cl.rank() == 0) {
+            for (int i = 0; i < n; ++i) {
+                coord[0] = center[0] + radius * std::cos(theta);
+                coord[1] = center[1] + radius * std::sin(theta);
+                Sparticles.add();
+                Sparticles.getLastPos()[0] = coord[0];
+                Sparticles.getLastPos()[1] = coord[1];
+                Sparticles.getLastProp<3>() = -openfpm::math::intpowlog(k,2)*std::sin(k*theta);
+                Sparticles.getLastProp<2>()[0] = std::cos(theta);
+                Sparticles.getLastProp<2>()[1] = std::sin(theta);
+                Sparticles.getLastProp<1>() = std::sin(k*theta);;//sin(theta)*exp(-finalT/(radius*radius));
+                Sparticles.getLastSubset(0);
+                if(coord[0]==1. && coord[1]==0.)
+                {Sparticles.getLastSubset(1);}
+                theta += dtheta;
+            }
+        }
+        BOOST_TEST_MESSAGE("Sync domain across processors...");
+        Sparticles.ghost_get<0>();
+        //Sparticles.write("Sparticles");
+        vector_dist_subset<2, double, aggregate<double,double,double[2],double,double[2],double>> Sparticles_bulk(Sparticles,0);
+        vector_dist_subset<2, double, aggregate<double,double,double[2],double,double[2],double>> Sparticles_boundary(Sparticles,1);
+        auto & bulk=Sparticles_bulk.getIds();
+        auto & boundary=Sparticles_boundary.getIds();
+        //Here template parameters are Normal property no.
+        SurfaceDerivative_xx<2> SDxx(Sparticles, 2, rCut,grid_spacing);
+        SurfaceDerivative_yy<2> SDyy(Sparticles, 2, rCut,grid_spacing);
+        auto INICONC = getV<3>(Sparticles);
+        auto CONC = getV<0>(Sparticles);
+        auto TEMP = getV<4>(Sparticles);
+        auto normal = getV<2>(Sparticles);
+        auto ANASOL = getV<1>(Sparticles);
+        DCPSE_scheme<equations2d1,decltype(Sparticles)> Solver(Sparticles);
+        Solver.impose(SDxx(CONC)+SDyy(CONC), bulk, INICONC);
+        Solver.impose(CONC, boundary, ANASOL);
+        Solver.solve(CONC);
+        auto it2 = Sparticles.getDomainIterator();
+        double worst = 0.0;
+        while (it2.isNext()) {
+            auto p = it2.get();
+            Sparticles.getProp<5>(p) = fabs(Sparticles.getProp<1>(p) - Sparticles.getProp<0>(p));
+            if (fabs(Sparticles.getProp<1>(p) - Sparticles.getProp<0>(p)) > worst) {
+                worst = fabs(Sparticles.getProp<1>(p) - Sparticles.getProp<0>(p));
+            }
+            ++it2;
+        }
+        Sparticles.deleteGhost();
+        //Sparticles.write("Sparticles");
+        //std::cout<<worst;
+        BOOST_REQUIRE(worst < 0.03);
+BOOST_AUTO_TEST_CASE(dcpse_surface_sphere) {
+  auto & v_cl = create_vcluster();
+  timer tt;
+  tt.start();
+  size_t n=512;
+  size_t n_sp=n;
+  // Domain
+  double boxP1{-1.5}, boxP2{1.5};
+  double boxSize{boxP2 - boxP1};
+  size_t sz[3] = {n,n,n};
+  double grid_spacing{boxSize/(sz[0]-1)};
+  double grid_spacing_surf=grid_spacing*30;
+  double rCut{2.5 * grid_spacing_surf};
+  Box<3,double> domain{{boxP1,boxP1,boxP1},{boxP2,boxP2,boxP2}};
+  Ghost<3,double> ghost{rCut + grid_spacing/8.0};
+  constexpr int K = 1;
+  // particles
+  vector_dist_ws<3, double, aggregate<double,double,double[3],double,double[3],double>> Sparticles(0, domain,bc,ghost);
+  // 1. particles on the Spherical surface
+  double Golden_angle=M_PI * (3.0 - sqrt(5.0));
+  if (v_cl.rank() == 0) {
+    //std::vector<Vector3f> data;
+    //GenerateSphere(1,data);
+    for(int i=1;i<n_sp;i++)
+        {
+            double y = 1.0 - (i /double(n_sp - 1.0)) * 2.0;
+            double radius = sqrt(1 - y * y);
+            double Golden_theta = Golden_angle * i;
+            double x = cos(Golden_theta) * radius;
+            double z = sin(Golden_theta) * radius;
+            Sparticles.add();
+            Sparticles.getLastPos()[0] = x;
+            Sparticles.getLastPos()[1] = y;
+            Sparticles.getLastPos()[2] = z;
+            double rm=sqrt(x*x+y*y+z*z);
+            Sparticles.getLastProp<2>()[0] = x/rm;
+            Sparticles.getLastProp<2>()[1] = y/rm;
+            Sparticles.getLastProp<2>()[2] = z/rm;
+            Sparticles.getLastProp<4>()[0] = 1.0 ;
+            Sparticles.getLastProp<4>()[1] = std::atan2(sqrt(x*x+y*y),z);
+            Sparticles.getLastProp<4>()[2] = std::atan2(y,x);
+            if(i<=2*(K)+1)
+            {Sparticles.getLastSubset(1);}
+            else
+            {Sparticles.getLastSubset(0);}
+        }
+    //std::cout << "n: " << n << " - grid spacing: " << grid_spacing << " - rCut: " << rCut << "Surf Normal spacing" << grid_spacing<<std::endl;
+  }
+  Sparticles.ghost_get<3>();
+  vector_dist_subset<3,double,aggregate<double,double,double[3],double,double[3],double>> Sparticles_bulk(Sparticles,0);
+  vector_dist_subset<3,double,aggregate<double,double,double[3],double,double[3],double>> Sparticles_boundary(Sparticles,1);
+  auto &bulkIds=Sparticles_bulk.getIds();
+  auto &bdrIds=Sparticles_boundary.getIds();
+  std::unordered_map<const lm,double,key_hash,key_equal> Alm;
+  //Setting max mode l_max
+  //Setting amplitudes to 1
+  for(int l=0;l<=K;l++){
+      for(int m=-l;m<=l;m++){
+          Alm[std::make_tuple(l,m)]=0;
+      }
+  }
+  Alm[std::make_tuple(1,0)]=1;
+  auto it2 = Sparticles.getDomainIterator();
+  while (it2.isNext()) {
+      auto p = it2.get();
+      Point<3, double> xP = Sparticles.getProp<4>(p);
+      /*double Sum=0;
+      for(int m=-spL;m<=spL;++m)
+      {
+        Sum+=openfpm::math::Y(spL,m,xP[1],xP[2]);
+      }*/
+      //Sparticles.getProp<ANADF>(p) = Sum;//openfpm::math::Y(K,K,xP[1],xP[2]);openfpm::math::sumY_Scalar<K>(xP[0],xP[1],xP[2],Alm);;
+      Sparticles.getProp<3>(p)=openfpm::math::sumY_Scalar<K>(xP[0],xP[1],xP[2],Alm);
+      Sparticles.getProp<1>(p)=-(K)*(K+1)*openfpm::math::sumY_Scalar<K>(xP[0],xP[1],xP[2],Alm);
+      ++it2;
+  }
+  auto f=getV<3>(Sparticles);
+  auto Df=getV<0>(Sparticles);
+  SurfaceDerivative_xx<2> Sdxx{Sparticles,2,rCut,grid_spacing_surf};
+  SurfaceDerivative_yy<2> Sdyy{Sparticles,2,rCut,grid_spacing_surf};
+  SurfaceDerivative_zz<2> Sdzz{Sparticles,2,rCut,grid_spacing_surf};
+  //Laplace_Beltrami<2> SLap{Sparticles,2,rCut,grid_spacing_surf};
+  //Sdyy.DrawKernel<5>(Sparticles,0);
+  //Sdzz.DrawKernel<5>(Sparticles,0);
+/*  std::cout<<"SDXX:"<<std::endl;
+  Sdxx.checkMomenta(Sparticles);
+  std::cout<<"SDYY:"<<std::endl;
+  Sdyy.checkMomenta(Sparticles);
+  std::cout<<"SDZZ:"<<std::endl;
+  Sdzz.checkMomenta(Sparticles);*/
+  Sparticles.ghost_get<3>();
+  Df=(Sdxx(f)+Sdyy(f)+Sdzz(f));
+  //Df=SLap(f);
+  auto it3 = Sparticles.getDomainIterator();
+  double worst = 0.0;
+  while (it3.isNext()) {
+      auto p = it3.get();
+      //Sparticles.getProp<5>(p) = fabs(Sparticles.getProp<1>(p) - Sparticles.getProp<0>(p));
+      if (fabs(Sparticles.getProp<1>(p) - Sparticles.getProp<0>(p)) > worst) {
+          worst = fabs(Sparticles.getProp<1>(p) - Sparticles.getProp<0>(p));
+      }
+      ++it3;
+  }
+        Sparticles.deleteGhost();
+        //Sparticles.write("Sparticles");
+        //std::cout<<worst;
+        BOOST_REQUIRE(worst < 0.03);
+BOOST_AUTO_TEST_CASE(dcpse_surface_sphere_old) {
+  auto & v_cl = create_vcluster();
+  timer tt;
+  tt.start();
+  size_t n=512;
+  size_t n_sp=n;
+  // Domain
+  double boxP1{-1.5}, boxP2{1.5};
+  double boxSize{boxP2 - boxP1};
+  size_t sz[3] = {n,n,n};
+  double grid_spacing{boxSize/(sz[0]-1)};
+  double grid_spacing_surf=grid_spacing*30;
+  double rCut{2.5 * grid_spacing_surf};
+  Box<3,double> domain{{boxP1,boxP1,boxP1},{boxP2,boxP2,boxP2}};
+  Ghost<3,double> ghost{rCut + grid_spacing/8.0};
+  constexpr int K = 1;
+  // particles
+  vector_dist_ws<3, double, aggregate<double,double,double[3],double,double[3],double>> Sparticles(0, domain,bc,ghost);
+  // 1. particles on the Spherical surface
+  double Golden_angle=M_PI * (3.0 - sqrt(5.0));
+  if (v_cl.rank() == 0) {
+    //std::vector<Vector3f> data;
+    //GenerateSphere(1,data);
+    std::unordered_map<const lm,double,key_hash,key_equal> Alm;
+          //Setting max mode l_max
+          //Setting amplitudes to 1
+          for(int l=0;l<=K;l++){
+              for(int m=-l;m<=l;m++){
+                  Alm[std::make_tuple(l,m)]=0;
+              }
+          }
+    Alm[std::make_tuple(1,0)]=1;
+    for(int i=1;i<n_sp;i++)
+        {
+            double y = 1.0 - (i /double(n_sp - 1.0)) * 2.0;
+            double radius = sqrt(1 - y * y);
+            double Golden_theta = Golden_angle * i;
+            double x = cos(Golden_theta) * radius;
+            double z = sin(Golden_theta) * radius;
+            Sparticles.add();
+            Sparticles.getLastPos()[0] = x;
+            Sparticles.getLastPos()[1] = y;
+            Sparticles.getLastPos()[2] = z;
+            double rm=sqrt(x*x+y*y+z*z);
+            Sparticles.getLastProp<2>()[0] = x/rm;
+            Sparticles.getLastProp<2>()[1] = y/rm;
+            Sparticles.getLastProp<2>()[2] = z/rm;
+            Sparticles.getLastProp<4>()[0] = 1.0 ;
+            Sparticles.getLastProp<4>()[1] = std::atan2(sqrt(x*x+y*y),z);
+            Sparticles.getLastProp<4>()[2] = std::atan2(y,x);
+            double m1=openfpm::math::sumY_Scalar<K>(1.0,std::atan2(sqrt(x*x+y*y),z),std::atan2(y,x),Alm);
+            double m2=-(K)*(K+1)*openfpm::math::sumY_Scalar<K>(1.0,std::atan2(sqrt(x*x+y*y),z),std::atan2(y,x),Alm);
+            Sparticles.getLastProp<3>()=m1;
+            Sparticles.getLastProp<1>()=m2;
+            Sparticles.getLastSubset(0);
+            for(int j=1;j<=2;++j){
+                Sparticles.add();
+            Sparticles.getLastPos()[0] = x+j*grid_spacing_surf*x/rm;
+            Sparticles.getLastPos()[1] = y+j*grid_spacing_surf*y/rm;
+            Sparticles.getLastPos()[2] = z+j*grid_spacing_surf*z/rm;
+            Sparticles.getLastProp<3>()=m1;
+            Sparticles.getLastSubset(1);
+            //Sparticles.getLastProp<1>(p)=m2;
+            Sparticles.add();
+            Sparticles.getLastPos()[0] = x-j*grid_spacing_surf*x/rm;
+            Sparticles.getLastPos()[1] = y-j*grid_spacing_surf*y/rm;
+            Sparticles.getLastPos()[2] = z-j*grid_spacing_surf*z/rm;
+            Sparticles.getLastProp<3>()=m1;
+            Sparticles.getLastSubset(1);
+            //Sparticles.getLastProp<1>(p)=m2;
+            }
+        }
+    //std::cout << "n: " << n << " - grid spacing: " << grid_spacing << " - rCut: " << rCut << "Surf Normal spacing" << grid_spacing<<std::endl;
+  }
+  Sparticles.ghost_get<3>();
+  //Sparticles.write("SparticlesInit");
+  vector_dist_subset<3,double,aggregate<double,double,double[3],double,double[3],double>> Sparticles_bulk(Sparticles,0);
+  vector_dist_subset<3,double,aggregate<double,double,double[3],double,double[3],double>> Sparticles_boundary(Sparticles,1);
+  auto &bulkIds=Sparticles_bulk.getIds();
+  auto &bdrIds=Sparticles_boundary.getIds();
+  /*auto it2 = Sparticles.getDomainIterator();
+  while (it2.isNext()) {
+      auto p = it2.get();
+      Point<3, double> xP = Sparticles.getProp<4>(p);
+      *//*double Sum=0;
+      for(int m=-spL;m<=spL;++m)
+      {
+        Sum+=openfpm::math::Y(spL,m,xP[1],xP[2]);
+      }*//*
+      //Sparticles.getProp<ANADF>(p) = Sum;//openfpm::math::Y(K,K,xP[1],xP[2]);openfpm::math::sumY_Scalar<K>(xP[0],xP[1],xP[2],Alm);;
+      Sparticles.getProp<3>(p)=openfpm::math::sumY_Scalar<K>(xP[0],xP[1],xP[2],Alm);
+      Sparticles.getProp<1>(p)=-(K)*(K+1)*openfpm::math::sumY_Scalar<K>(xP[0],xP[1],xP[2],Alm);
+      ++it2;
+  }*/
+  auto f=getV<3>(Sparticles);
+  auto Df=getV<0>(Sparticles);
+  //SurfaceDerivative_xx<2> Sdxx{Sparticles,2,rCut,grid_spacing_surf};
+  //SurfaceDerivative_yy<2> Sdyy{Sparticles,2,rCut,grid_spacing_surf};
+  //SurfaceDerivative_zz<2> Sdzz{Sparticles,2,rCut,grid_spacing_surf};
+  Derivative_xx Sdxx{Sparticles,2,rCut};
+  //std::cout<<"Dxx Done"<<std::endl;
+  Derivative_yy Sdyy{Sparticles,2,rCut};
+  //std::cout<<"Dyy Done"<<std::endl;
+  Derivative_zz Sdzz{Sparticles,2,rCut};
+  //std::cout<<"Dzz Done"<<std::endl;
+  //Laplace_Beltrami<2> SLap{Sparticles,2,rCut,grid_spacing_surf};
+  //SLap.DrawKernel<5>(Sparticles,73);
+  //Sdxx.DrawKernel<5>(Sparticles,0);
+  //Sdyy.DrawKernel<5>(Sparticles,0);
+  //Sdzz.DrawKernel<5>(Sparticles,0);
+/*  std::cout<<"SDXX:"<<std::endl;
+  Sdxx.checkMomenta(Sparticles);
+  std::cout<<"SDYY:"<<std::endl;
+  Sdyy.checkMomenta(Sparticles);
+  std::cout<<"SDZZ:"<<std::endl;
+  Sdzz.checkMomenta(Sparticles);*/
+  Sparticles.ghost_get<3>();
+  Df=(Sdxx(f)+Sdyy(f)+Sdzz(f));
+  //Df=SLap(f);
+  //auto it3 = Sparticles_bulk.getDomainIterator();
+  double worst = 0.0;
+  for (int j = 0; j < bulkIds.size(); j++) {
+      auto p = bulkIds.get<0>(j);
+      //Sparticles.getProp<5>(p) = fabs(Sparticles.getProp<1>(p) - Sparticles.getProp<0>(p));
+        if (fabs(Sparticles.getProp<1>(p) - Sparticles.getProp<0>(p)) > worst) {
+                  worst = fabs(Sparticles.getProp<1>(p) - Sparticles.getProp<0>(p));
+              }
+  }
+        Sparticles.deleteGhost();
+        //Sparticles.write("SparticlesNoo");
+        //std::cout<<"Worst: "<<worst<<std::endl;
+        BOOST_REQUIRE(worst < 0.03);
+/*BOOST_AUTO_TEST_CASE(dcpse_surface_sphere_proj) {
+  auto & v_cl = create_vcluster();
+  timer tt;
+  tt.start();
+  size_t n=512;
+  size_t n_sp=n;
+  // Domain
+  double boxP1{-1.5}, boxP2{1.5};
+  double boxSize{boxP2 - boxP1};
+  size_t sz[3] = {n,n,n};
+  double grid_spacing{boxSize/(sz[0]-1)};
+  double grid_spacing_surf=grid_spacing*30;
+  double rCut{2.5 * grid_spacing_surf};
+  Box<3,double> domain{{boxP1,boxP1,boxP1},{boxP2,boxP2,boxP2}};
+  Ghost<3,double> ghost{rCut + grid_spacing/8.0};
+  constexpr int K = 1;
+  // particles
+  vector_dist_ws<3, double, aggregate<VectorS<3,double>,VectorS<3,double>,VectorS<3,double>,VectorS<3,double>,VectorS<3,double>,double>> Sparticles(0, domain,bc,ghost);
+  // 1. particles on the Spherical surface
+  double Golden_angle=M_PI * (3.0 - sqrt(5.0));
+  if (v_cl.rank() == 0) {
+    //std::vector<Vector3f> data;
+    //GenerateSphere(1,data);
+    for(int i=1;i<n_sp;i++)
+        {
+            double y = 1.0 - (i /double(n_sp - 1.0)) * 2.0;
+            double radius = sqrt(1 - y * y);
+            double Golden_theta = Golden_angle * i;
+            double x = cos(Golden_theta) * radius;
+            double z = sin(Golden_theta) * radius;
+            Sparticles.add();
+            Sparticles.getLastPos()[0] = x;
+            Sparticles.getLastPos()[1] = y;
+            Sparticles.getLastPos()[2] = z;
+            double rm=sqrt(x*x+y*y+z*z);
+            Sparticles.getLastProp<2>()[0] = x/rm;
+            Sparticles.getLastProp<2>()[1] = y/rm;
+            Sparticles.getLastProp<2>()[2] = z/rm;
+            Sparticles.getLastProp<4>()[0] = 1.0 ;
+            Sparticles.getLastProp<4>()[1] = std::atan2(sqrt(x*x+y*y),z);
+            Sparticles.getLastProp<4>()[2] = std::atan2(y,x);
+            if(i<=2*(K)+1)
+            {Sparticles.getLastSubset(1);}
+            else
+            {Sparticles.getLastSubset(0);}
+        }
+    //std::cout << "n: " << n << " - grid spacing: " << grid_spacing << " - rCut: " << rCut << "Surf Normal spacing" << grid_spacing<<std::endl;
+  }
+  Sparticles.ghost_get<3>();
+  vector_dist_subset<3,double,aggregate<VectorS<3,double>,VectorS<3,double>,VectorS<3,double>,VectorS<3,double>,VectorS<3,double>,double>> Sparticles_bulk(Sparticles,0);
+  vector_dist_subset<3,double,aggregate<VectorS<3,double>,VectorS<3,double>,VectorS<3,double>,VectorS<3,double>,VectorS<3,double>,double>> Sparticles_boundary(Sparticles,1);
+  auto &bulkIds=Sparticles_bulk.getIds();
+  auto &bdrIds=Sparticles_boundary.getIds();
+  std::unordered_map<const lm,double,key_hash,key_equal> Alm;
+  //Setting max mode l_max
+  //Setting amplitudes to 1
+  for(int l=0;l<=K;l++){
+      for(int m=-l;m<=l;m++){
+          Alm[std::make_tuple(l,m)]=0;
+      }
+  }
+  Alm[std::make_tuple(1,0)]=1;
+  auto it2 = Sparticles.getDomainIterator();
+  while (it2.isNext()) {
+      auto p = it2.get();
+      Point<3, double> xP = Sparticles.getProp<4>(p);
+      *//*double Sum=0;
+      for(int m=-spL;m<=spL;++m)
+      {
+        Sum+=openfpm::math::Y(spL,m,xP[1],xP[2]);
+      }*//*
+      //Sparticles.getProp<ANADF>(p) = Sum;//openfpm::math::Y(K,K,xP[1],xP[2]);openfpm::math::sumY_Scalar<K>(xP[0],xP[1],xP[2],Alm);;
+      Sparticles.getProp<3>(p)[0]=openfpm::math::sumY_Scalar<K>(xP[0],xP[1],xP[2],Alm);
+      Sparticles.getProp<3>(p)[1]=openfpm::math::sumY_Scalar<K>(xP[0],xP[1],xP[2],Alm);
+      Sparticles.getProp<3>(p)[2]=openfpm::math::sumY_Scalar<K>(xP[0],xP[1],xP[2],Alm);
+      Sparticles.getProp<1>(p)[0]=-(K)*(K+1)*openfpm::math::sumY_Scalar<K>(xP[0],xP[1],xP[2],Alm);
+      Sparticles.getProp<1>(p)[1]=-(K)*(K+1)*openfpm::math::sumY_Scalar<K>(xP[0],xP[1],xP[2],Alm);
+      Sparticles.getProp<1>(p)[2]=-(K)*(K+1)*openfpm::math::sumY_Scalar<K>(xP[0],xP[1],xP[2],Alm);
+      ++it2;
+  }
+  auto f=getV<3>(Sparticles);
+  auto Df=getV<0>(Sparticles);
+  SurfaceProjectedGradient<2> SGP{Sparticles,2,rCut,grid_spacing_surf};
+  Sparticles.ghost_get<3>();
+  Df=SGP(f);
+  //Df=SLap(f);
+  auto it3 = Sparticles.getDomainIterator();
+  double worst = 0.0;
+  while (it3.isNext()) {
+      auto p = it3.get();
+      //Sparticles.getProp<5>(p) = fabs(Sparticles.getProp<1>(p) - Sparticles.getProp<0>(p));
+      if (fabs(Sparticles.getProp<1>(p)[0] - Sparticles.getProp<0>(p)[0]) > worst) {
+          worst = fabs(Sparticles.getProp<1>(p)[0] - Sparticles.getProp<0>(p)[0]);
+      }
+      ++it3;
+  }
+        Sparticles.deleteGhost();
+        //Sparticles.write("Sparticles");
+        //std::cout<<worst;
+        BOOST_REQUIRE(worst < 0.03);
+ BOOST_AUTO_TEST_CASE(dcpse_surface_adaptive) {
+//  int rank;
+//  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+        const size_t sz[3] = {81,81,1};
+        Box<3, double> box({0, 0,-5}, {0.5, 0.5,5});
+        size_t bc[3] = {NON_PERIODIC, NON_PERIODIC,NON_PERIODIC};
+        double spacing = box.getHigh(0) / (sz[0] - 1);
+        Ghost<3, double> ghost(spacing * 3.1);
+        double rCut = 3.1  * spacing;
+        BOOST_TEST_MESSAGE("Init vector_dist...");
+        vector_dist<3, double, aggregate<double,double,double,double,double,double,double[3]>> domain(0, box, bc, ghost);
+        //Init_DCPSE(domain)
+        BOOST_TEST_MESSAGE("Init domain...");
+        auto it = domain.getGridIterator(sz);
+        while (it.isNext()) {
+            domain.add();
+            auto key = it.get();
+            double x = key.get(0) * it.getSpacing(0);
+            domain.getLastPos()[0] = x;
+            double y = key.get(1) * it.getSpacing(1);
+            domain.getLastPos()[1] = y;
+            domain.getLastPos()[2] = 0;
+            ++it;
+        }
+        // Add multi res patch 1
+        {
+        const size_t sz2[3] = {40,40,1};
+        Box<3,double> bx({0.25 + it.getSpacing(0)/4.0,0.25 + it.getSpacing(0)/4.0,-0.5},{sz2[0]*it.getSpacing(0)/2.0 + 0.25 + it.getSpacing(0)/4.0, sz2[1]*it.getSpacing(0)/2.0 + 0.25 + it.getSpacing(0)/4.0,0.5});
+        openfpm::vector<size_t> rem;
+        auto it = domain.getDomainIterator();
+        while (it.isNext())
+        {
+        	auto k = it.get();
+        	Point<3,double> xp = domain.getPos(k);
+        	if (bx.isInside(xp) == true)
+        	{
+        		rem.add(k.getKey());
+        	}
+        	++it;
+        }
+        domain.remove(rem);
+        auto it2 = domain.getGridIterator(sz2);
+        while (it2.isNext()) {
+            domain.add();
+            auto key = it2.get();
+            double x = key.get(0) * spacing/2.0 + 0.25 + spacing/4.0;
+            domain.getLastPos()[0] = x;
+            double y = key.get(1) * spacing/2.0 + 0.25 + spacing/4.0;
+            domain.getLastPos()[1] = y;
+            domain.getLastPos()[2] = 0;
+            ++it2;
+        }
+        }
+        // Add multi res patch 2
+        {
+        const size_t sz2[3] = {40,40,1};
+        Box<3,double> bx({0.25 + 21.0*spacing/8.0,0.25 + 21.0*spacing/8.0,-5},{sz2[0]*spacing/4.0 + 0.25 + 21.0*spacing/8.0, sz2[1]*spacing/4.0 + 0.25 + 21*spacing/8.0,5});
+        openfpm::vector<size_t> rem;
+        auto it = domain.getDomainIterator();
+        while (it.isNext())
+        {
+        	auto k = it.get();
+        	Point<3,double> xp = domain.getPos(k);
+        	if (bx.isInside(xp) == true)
+        	{
+        		rem.add(k.getKey());
+        	}
+        	++it;
+        }
+        domain.remove(rem);
+        auto it2 = domain.getGridIterator(sz2);
+        while (it2.isNext()) {
+            domain.add();
+            auto key = it2.get();
+            double x = key.get(0) * spacing/4.0 + 0.25 + 21*spacing/8.0;
+            domain.getLastPos()[0] = x;
+            double y = key.get(1) * spacing/4.0 + 0.25 + 21*spacing/8.0;
+            domain.getLastPos()[1] = y;
+            domain.getLastPos()[2] = 0;
+            ++it2;
+        }
+        }
+        ///////////////////////
+        BOOST_TEST_MESSAGE("Sync domain across processors...");
+        domain.ghost_get<0>();
+        openfpm::vector<aggregate<int>> bulk;
+        openfpm::vector<aggregate<int>> up_p;
+        openfpm::vector<aggregate<int>> dw_p;
+        openfpm::vector<aggregate<int>> l_p;
+        openfpm::vector<aggregate<int>> r_p;
+        openfpm::vector<aggregate<int>> ref_p;
+        auto v = getV<0>(domain);
+        auto RHS=getV<1>(domain);
+        auto sol = getV<2>(domain);
+        auto anasol = getV<3>(domain);
+        auto err = getV<4>(domain);
+        auto DCPSE_sol=getV<5>(domain);
+        // Here fill me
+        Box<3, double> up({box.getLow(0) - spacing / 2.0, box.getHigh(1) - spacing / 2.0,-5},
+                          {box.getHigh(0) + spacing / 2.0, box.getHigh(1) + spacing / 2.0,5});
+        Box<3, double> down({box.getLow(0) - spacing / 2.0, box.getLow(1) - spacing / 2.0,-5},
+                            {box.getHigh(0) + spacing / 2.0, box.getLow(1) + spacing / 2.0,5});
+        Box<3, double> left({box.getLow(0) - spacing / 2.0, box.getLow(1) + spacing / 2.0,-5},
+                            {box.getLow(0) + spacing / 2.0, box.getHigh(1) - spacing / 2.0,5});
+        Box<3, double> right({box.getHigh(0) - spacing / 2.0, box.getLow(1) + spacing / 2.0,-5},
+                             {box.getHigh(0) + spacing / 2.0, box.getHigh(1) - spacing / 2.0,5});
+        openfpm::vector<Box<3, double>> boxes;
+        boxes.add(up);
+        boxes.add(down);
+        boxes.add(left);
+        boxes.add(right);
+        // Create a writer and write
+        VTKWriter<openfpm::vector<Box<3, double>>, VECTOR_BOX> vtk_box;
+        vtk_box.add(boxes);
+        //vtk_box.write("vtk_box.vtk");
+        auto it2 = domain.getDomainIterator();
+        while (it2.isNext()) {
+            auto p = it2.get();
+            Point<3, double> xp = domain.getPos(p);
+            if (up.isInside(xp) == true) {
+                up_p.add();
+                up_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) = -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p) = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+            } else if (down.isInside(xp) == true) {
+                dw_p.add();
+                dw_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p) = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+            } else if (left.isInside(xp) == true) {
+                l_p.add();
+                l_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p) = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+            } else if (right.isInside(xp) == true) {
+                r_p.add();
+                r_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p) = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+            } else {
+                bulk.add();
+                bulk.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p) = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+            }
+            domain.getProp<6>(p)[0] = 0;
+            domain.getProp<6>(p)[1] = 0;
+            domain.getProp<6>(p)[2] = 1;
+            ++it2;
+        }
+        domain.ghost_get<1,2,3>();
+        SurfaceDerivative_xx<6> Dxx(domain, 2, rCut,3.9,support_options::ADAPTIVE);
+/*        v=0;
+        auto itNNN=domain.getDomainIterator();
+        while(itNNN.isNext()){
+            auto p=itNNN.get().getKey();
+            Dxx.DrawKernel<0,decltype(domain)>(domain,p);
+            domain.write_frame("Kernel",p);
+            v=0;
+            ++itNNN;
+        }
+        //Dxx.DrawKernel<5,decltype(domain)>(domain,6161);
+        //domain.write_frame("Kernel",6161);
+        SurfaceDerivative_yy<6> Dyy(domain, 2, rCut,3.9,support_options::ADAPTIVE);
+        SurfaceDerivative_zz<6> Dzz(domain, 2, rCut,3.9,support_options::ADAPTIVE);
+        domain.ghost_get<2>();
+        sol=Dxx(anasol)+Dyy(anasol)+Dzz(anasol);
+        domain.ghost_get<5>();
+        double worst1 = 0.0;
+        for(int j=0;j<bulk.size();j++)
+        {   auto p=bulk.get<0>(j);
+            if (fabs(domain.getProp<1>(p) - domain.getProp<2>(p)) >= worst1) {
+                worst1 = fabs(domain.getProp<1>(p) - domain.getProp<2>(p));
+            }
+            domain.getProp<4>(p) = fabs(domain.getProp<1>(p) - domain.getProp<2>(p));
+        }
+        //std::cout << "Maximum Analytic Error: " << worst1 << std::endl;
+        //domain.ghost_get<4>();
+        //domain.write("Robin_anasol");
+        BOOST_REQUIRE(worst1 < 0.03);
+    }
+    BOOST_AUTO_TEST_CASE(dcpse_surface_adaptive_load) {
+//  int rank;
+//  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+        const size_t sz[3] = {81,81,1};
+        Box<3, double> box({0, 0,-5}, {0.5, 0.5,5});
+        size_t bc[3] = {NON_PERIODIC, NON_PERIODIC,NON_PERIODIC};
+        double spacing = box.getHigh(0) / (sz[0] - 1);
+        Ghost<3, double> ghost(spacing * 3.1);
+        double rCut = 3.1  * spacing;
+        BOOST_TEST_MESSAGE("Init vector_dist...");
+        vector_dist<3, double, aggregate<double,double,double,double,double,double,double[3]>> domain(0, box, bc, ghost);
+        //Init_DCPSE(domain)
+        BOOST_TEST_MESSAGE("Init domain...");
+        auto it = domain.getGridIterator(sz);
+        while (it.isNext()) {
+            domain.add();
+            auto key = it.get();
+            double x = key.get(0) * it.getSpacing(0);
+            domain.getLastPos()[0] = x;
+            double y = key.get(1) * it.getSpacing(1);
+            domain.getLastPos()[1] = y;
+            domain.getLastPos()[2] = 0;
+            ++it;
+        }
+        // Add multi res patch 1
+        {
+        const size_t sz2[3] = {40,40,1};
+        Box<3,double> bx({0.25 + it.getSpacing(0)/4.0,0.25 + it.getSpacing(0)/4.0,-0.5},{sz2[0]*it.getSpacing(0)/2.0 + 0.25 + it.getSpacing(0)/4.0, sz2[1]*it.getSpacing(0)/2.0 + 0.25 + it.getSpacing(0)/4.0,0.5});
+        openfpm::vector<size_t> rem;
+        auto it = domain.getDomainIterator();
+        while (it.isNext())
+        {
+        	auto k = it.get();
+        	Point<3,double> xp = domain.getPos(k);
+        	if (bx.isInside(xp) == true)
+        	{
+        		rem.add(k.getKey());
+        	}
+        	++it;
+        }
+        domain.remove(rem);
+        auto it2 = domain.getGridIterator(sz2);
+        while (it2.isNext()) {
+            domain.add();
+            auto key = it2.get();
+            double x = key.get(0) * spacing/2.0 + 0.25 + spacing/4.0;
+            domain.getLastPos()[0] = x;
+            double y = key.get(1) * spacing/2.0 + 0.25 + spacing/4.0;
+            domain.getLastPos()[1] = y;
+            domain.getLastPos()[2] = 0;
+            ++it2;
+        }
+        }
+        // Add multi res patch 2
+        {
+        const size_t sz2[3] = {40,40,1};
+        Box<3,double> bx({0.25 + 21.0*spacing/8.0,0.25 + 21.0*spacing/8.0,-5},{sz2[0]*spacing/4.0 + 0.25 + 21.0*spacing/8.0, sz2[1]*spacing/4.0 + 0.25 + 21*spacing/8.0,5});
+        openfpm::vector<size_t> rem;
+        auto it = domain.getDomainIterator();
+        while (it.isNext())
+        {
+        	auto k = it.get();
+        	Point<3,double> xp = domain.getPos(k);
+        	if (bx.isInside(xp) == true)
+        	{
+        		rem.add(k.getKey());
+        	}
+        	++it;
+        }
+        domain.remove(rem);
+        auto it2 = domain.getGridIterator(sz2);
+        while (it2.isNext()) {
+            domain.add();
+            auto key = it2.get();
+            double x = key.get(0) * spacing/4.0 + 0.25 + 21*spacing/8.0;
+            domain.getLastPos()[0] = x;
+            double y = key.get(1) * spacing/4.0 + 0.25 + 21*spacing/8.0;
+            domain.getLastPos()[1] = y;
+            domain.getLastPos()[2] = 0;
+            ++it2;
+        }
+        }
+        ///////////////////////
+        BOOST_TEST_MESSAGE("Sync domain across processors...");
+        domain.ghost_get<0>();
+        openfpm::vector<aggregate<int>> bulk;
+        openfpm::vector<aggregate<int>> up_p;
+        openfpm::vector<aggregate<int>> dw_p;
+        openfpm::vector<aggregate<int>> l_p;
+        openfpm::vector<aggregate<int>> r_p;
+        openfpm::vector<aggregate<int>> ref_p;
+        auto v = getV<0>(domain);
+        auto RHS=getV<1>(domain);
+        auto sol = getV<2>(domain);
+        auto anasol = getV<3>(domain);
+        auto err = getV<4>(domain);
+        auto DCPSE_sol=getV<5>(domain);
+        // Here fill me
+        Box<3, double> up({box.getLow(0) - spacing / 2.0, box.getHigh(1) - spacing / 2.0,-5},
+                          {box.getHigh(0) + spacing / 2.0, box.getHigh(1) + spacing / 2.0,5});
+        Box<3, double> down({box.getLow(0) - spacing / 2.0, box.getLow(1) - spacing / 2.0,-5},
+                            {box.getHigh(0) + spacing / 2.0, box.getLow(1) + spacing / 2.0,5});
+        Box<3, double> left({box.getLow(0) - spacing / 2.0, box.getLow(1) + spacing / 2.0,-5},
+                            {box.getLow(0) + spacing / 2.0, box.getHigh(1) - spacing / 2.0,5});
+        Box<3, double> right({box.getHigh(0) - spacing / 2.0, box.getLow(1) + spacing / 2.0,-5},
+                             {box.getHigh(0) + spacing / 2.0, box.getHigh(1) - spacing / 2.0,5});
+        openfpm::vector<Box<3, double>> boxes;
+        boxes.add(up);
+        boxes.add(down);
+        boxes.add(left);
+        boxes.add(right);
+        // Create a writer and write
+        VTKWriter<openfpm::vector<Box<3, double>>, VECTOR_BOX> vtk_box;
+        vtk_box.add(boxes);
+        //vtk_box.write("vtk_box.vtk");
+        auto it2 = domain.getDomainIterator();
+        while (it2.isNext()) {
+            auto p = it2.get();
+            Point<3, double> xp = domain.getPos(p);
+            if (up.isInside(xp) == true) {
+                up_p.add();
+                up_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) = -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p) = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+            } else if (down.isInside(xp) == true) {
+                dw_p.add();
+                dw_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p) = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+            } else if (left.isInside(xp) == true) {
+                l_p.add();
+                l_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p) = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+            } else if (right.isInside(xp) == true) {
+                r_p.add();
+                r_p.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p) = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+            } else {
+                bulk.add();
+                bulk.last().get<0>() = p.getKey();
+                domain.getProp<1>(p) =  -2*M_PI*M_PI*sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+                domain.getProp<3>(p) = sin(M_PI*xp.get(0))*sin(M_PI*xp.get(1));
+            }
+            domain.getProp<6>(p)[0] = 0;
+            domain.getProp<6>(p)[1] = 0;
+            domain.getProp<6>(p)[2] = 1;
+            ++it2;
+        }
+        domain.ghost_get<1,2,3>();
+        SurfaceDerivative_xx<6> Dxx(domain, 2, rCut,3.9,support_options::LOAD);
+        SurfaceDerivative_yy<6> Dyy(domain, 2, rCut,3.9,support_options::LOAD);
+        SurfaceDerivative_zz<6> Dzz(domain, 2, rCut,3.9,support_options::LOAD);
+        Dxx.load(domain,"Sdxx_test");
+        Dyy.load(domain,"Sdyy_test");
+        Dzz.load(domain,"Sdzz_test");
+        domain.ghost_get<2>();
+        sol=Dxx(anasol)+Dyy(anasol)+Dzz(anasol);
+        domain.ghost_get<5>();
+        double worst1 = 0.0;
+        for(int j=0;j<bulk.size();j++)
+        {   auto p=bulk.get<0>(j);
+            if (fabs(domain.getProp<1>(p) - domain.getProp<2>(p)) >= worst1) {
+                worst1 = fabs(domain.getProp<1>(p) - domain.getProp<2>(p));
+            }
+            domain.getProp<4>(p) = fabs(domain.getProp<1>(p) - domain.getProp<2>(p));
+        }
+        //std::cout << "Maximum Analytic Error: " << worst1 << std::endl;
+        //domain.ghost_get<4>();
+        //domain.write("Robin_anasol");
+        BOOST_REQUIRE(worst1 < 0.03);
+    }
\ No newline at end of file
diff --git a/src/DCPSE/DCPSE_op/tests/ b/src/DCPSE/DCPSE_op/tests/
new file mode 100644
index 00000000..3018f5c2
--- /dev/null
+++ b/src/DCPSE/DCPSE_op/tests/
@@ -0,0 +1,616 @@
+ * DCPSE_op_test.cpp
+ *
+ *  Created on: May 15, 2020
+ *      Author: Abhinav Singh
+ *
+ */
+#include "config.h"
+#ifdef HAVE_EIGEN
+#ifdef HAVE_PETSC
+#include "util/util_debug.hpp"
+#include <boost/test/unit_test.hpp>
+#include <iostream>
+#include "../DCPSE_op.hpp"
+#include "../DCPSE_Solver.hpp"
+#include "../DCPSE_Solver.cuh"
+#include "Operators/Vector/vector_dist_operators.hpp"
+#include "Vector/vector_dist_subset.hpp"
+#include "../EqnsStruct.hpp"
+//template<typename T>
+//struct Debug;
+#if 0
+    BOOST_AUTO_TEST_CASE(dcpse_op_subset_tests) {
+        size_t edgeSemiSize = 40;
+        const size_t sz[2] = {2 * edgeSemiSize, 2 * edgeSemiSize};
+        Box<2, double> box({0, 0}, {1.0,1.0});
+        size_t bc[2] = {NON_PERIODIC, NON_PERIODIC};
+        double spacing[2];
+        spacing[0] = 1.0 / (sz[0] - 1);
+        spacing[1] = 1.0 / (sz[1] - 1);
+        double rCut = 3.9 * spacing[0];
+        int ord = 2;
+        double sampling_factor = 4.0;
+        Ghost<2, double> ghost(rCut);
+        BOOST_TEST_MESSAGE("Init vector_dist...");
+        double sigma2 = spacing[0] * spacing[1] / (2 * 4);
+        vector_dist_ws_gpu<2, double, aggregate<double, double, double, VectorS<2, double>, VectorS<2, double>,VectorS<2, double>,double>> Particles(0, box,
+                                                                                                                 bc,
+                                                                                                                 ghost);
+        //Init_DCPSE(Particles)
+        BOOST_TEST_MESSAGE("Init Particles...");
+        std::mt19937 rng{6666666};
+        std::normal_distribution<> gaussian{0, sigma2};
+//        openfpm::vector<aggregate<int>> bulk;
+//        openfpm::vector<aggregate<int>> boundary;
+        auto it = Particles.getGridIterator(sz);
+        size_t pointId = 0;
+        size_t counter = 0;
+        double minNormOne = 999;
+        while (it.isNext())
+        {
+            Particles.add();
+            auto key = it.get();
+            mem_id k0 = key.get(0);
+            double x = k0 * spacing[0];
+            Particles.getLastPos()[0] = x;//+ gaussian(rng);
+            mem_id k1 = key.get(1);
+            double y = k1 * spacing[1];
+            Particles.getLastPos()[1] = y;//+gaussian(rng);
+            // Here fill the function value
+            Particles.template getLastProp<0>() = sin(Particles.getLastPos()[0]) + sin(Particles.getLastPos()[1]);
+            if (k0 != 0 && k1 != 0 && k0 != sz[0] -1 && k1 != sz[1] - 1)
+            {
+//              bulk.add();
+//              bulk.template get<0>(bulk.size()-1) = Particles.size_local() - 1;
+                Particles.getLastSubset(0);
+            }
+            else
+            {
+//                boundary.add();
+//                boundary.template get<0>(boundary.size()-1) = Particles.size_local() - 1;
+                Particles.getLastSubset(1);
+            }
+            ++counter;
+            ++it;
+        }
+        BOOST_TEST_MESSAGE("Sync Particles across processors...");
+        Particles.ghost_get<0>();
+        auto git = Particles.getGhostIterator();
+        while (git.isNext())
+        {
+            auto p = git.get();
+            Particles.template getProp<0>(p) = std::numeric_limits<double>::quiet_NaN();
+            ++git;
+        }
+        vector_dist_subset_gpu<2, double, aggregate<double, double, double, VectorS<2, double>, VectorS<2, double>,VectorS<2, double>,double>> Particles_bulk(Particles,0);
+        vector_dist_subset_gpu<2, double, aggregate<double, double, double, VectorS<2, double>, VectorS<2, double>,VectorS<2, double>,double>> Particles_boundary(Particles,1);
+        auto & boundary = Particles_boundary.getIds();
+        // move particles
+        auto P = getV<0>(Particles);
+        auto Out = getV<1>(Particles);
+        auto Pb = getV<2>(Particles);
+        auto Out_V = getV<3>(Particles);
+        auto P_bulk = getV<2>(Particles_bulk);
+        auto Out_bulk = getV<1>(Particles_bulk);
+        auto Out_V_bulk = getV<3>(Particles_bulk);
+        Out=10;
+        P_bulk = 5;
+        P_bulk=Pb+Out;
+//        Particles.write("Test_output_subset");
+        // Create the subset
+       /* Derivative_x Dx(Particles, 2, rCut);
+        Derivative_y Dy(Particles, 2, rCut);
+        Derivative_x Dx_bulk(Particles_bulk, 2, rCut);
+        Derivative_x_gpu Dx_bulk(Particles_bulk, 2, rCut,sampling_factor, support_options::RADIUS);
+        Derivative_y_gpu Dy_bulk(Particles_bulk, 2, rCut,sampling_factor, support_options::RADIUS);
+        Out_bulk = Dx_bulk(P);
+        Out_V_bulk[0] = P + Dx_bulk(P);
+        Out_V_bulk[1] = Out_V[0] +Dy_bulk(P);
+    // Check
+        bool is_nan = false;
+        auto & v_cl = create_vcluster();
+        if (v_cl.size() > 1)
+        {
+            auto it2 = Particles_bulk.getDomainIterator();
+            while (it2.isNext())
+            {
+            auto p = it2.get();
+    /*      BOOST_REQUIRE_EQUAL(Particles_bulk.getProp<2>(p),15.0);
+            BOOST_REQUIRE(fabs(Particles_bulk.getProp<1>(p) - cos(Particles_bulk.getPos(p)[0])) < 0.005 );
+            BOOST_REQUIRE(fabs(Particles_bulk.getProp<3>(p)[0] - Particles_bulk.getProp<0>(p) - cos(Particles_bulk.getPos(p)[0])) < 0.001 );
+            BOOST_REQUIRE(fabs(Particles_bulk.getProp<3>(p)[1] - Particles_bulk.getProp<3>(p)[0] - cos(Particles_bulk.getPos(p)[1])) < 0.001 );*/
+                is_nan |= std::isnan(Particles_bulk.template getProp<1>(p));
+    //        Particles_bulk.template getProp<0>(p) = fabs(Particles_bulk.getProp<1>(p) - cos(Particles_bulk.getPos(p)[0]));
+                ++it2;
+            }
+            BOOST_REQUIRE_EQUAL(is_nan,true);
+        }
+//        P_bulk = Dx_bulk(P_bulk);  <------------ Incorrect produce error message
+//        P = Dx_bulk(P);   <------- Incorrect produce overflow
+        Particles.ghost_get<0>();
+        for (int i = 0 ; i < boundary.size() ; i++)
+        {
+            Particles.template getProp<0>(boundary.template get<0>(i)) = std::numeric_limits<double>::quiet_NaN();
+        }
+        Particles.ghost_get<0>();
+        Out_bulk = Dx_bulk(P);
+        Out_V_bulk[0] = P + Dx_bulk(P);
+        Out_V_bulk[1] = Out_V[0] +Dy_bulk(P);
+        auto it2 = Particles_bulk.getDomainIterator();
+        while (it2.isNext())
+        {
+            auto p = it2.get();
+            BOOST_REQUIRE_EQUAL(Particles_bulk.getProp<2>(p),15.0);
+            BOOST_REQUIRE(fabs(Particles_bulk.getProp<1>(p) - cos(Particles_bulk.getPos(p)[0])) < 0.005 );
+            BOOST_REQUIRE(fabs(Particles_bulk.getProp<3>(p)[0] - Particles_bulk.getProp<0>(p) - cos(Particles_bulk.getPos(p)[0])) < 0.001 );
+            BOOST_REQUIRE(fabs(Particles_bulk.getProp<3>(p)[1] - Particles_bulk.getProp<3>(p)[0] - cos(Particles_bulk.getPos(p)[1])) < 0.001 );
+            ++it2;
+        }
+    }
+    BOOST_AUTO_TEST_CASE(dcpse_op_subset_PC_lid) {
+//  int rank;
+//  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+        constexpr int x = 0;
+        constexpr int y = 1;
+        size_t edgeSemiSize = 20;
+        const size_t sz[2] = {2 * edgeSemiSize+1, 2 * edgeSemiSize+1};
+        Box<2, double> box({0, 0}, {1.0,1.0});
+        size_t bc[2] = {NON_PERIODIC, NON_PERIODIC};
+        double spacing[2];
+        spacing[0] = 1.0 / (sz[0] - 1);
+        spacing[1] = 1.0 / (sz[1] - 1);
+        double rCut = 3.9 * spacing[0];
+        int ord = 2;
+        double sampling_factor = 4.0;
+        Ghost<2, double> ghost(rCut);
+        BOOST_TEST_MESSAGE("Init vector_dist...");
+        double sigma2 = spacing[0] * spacing[1] / (2 * 4);
+        auto &v_cl = create_vcluster();
+        typedef  aggregate<double, VectorS<2, double>, VectorS<2, double>,VectorS<2, double>,double,VectorS<2, double>,VectorS<2, double>,double> particle_type;
+        vector_dist_ws_gpu<2, double, particle_type> Particles(0, box,bc,ghost);
+        //Init_DCPSE(Particles)
+        BOOST_TEST_MESSAGE("Init Particles...");
+//        openfpm::vector<aggregate<int>> bulk;
+//        openfpm::vector<aggregate<int>> boundary;
+        auto it = Particles.getGridIterator(sz);
+        while (it.isNext())
+        {
+            Particles.add();
+            auto key = it.get();
+            mem_id k0 = key.get(0);
+            double xp0 = k0 * spacing[0];
+            Particles.getLastPos()[0] = xp0;
+            mem_id k1 = key.get(1);
+            double yp0 = k1 * spacing[1];
+            Particles.getLastPos()[1] = yp0;
+            ++it;
+        }
+        BOOST_TEST_MESSAGE("Sync Particles across processors...");
+        Particles.ghost_get<0>();
+        auto it2 = Particles.getDomainIterator();
+        while (it2.isNext()) {
+            auto p = it2.get();
+            Point<2, double> xp = Particles.getPos(p);
+            if (xp[0] != 0 && xp[1] != 0 && xp[0] != 1.0 && xp[1] != 1.0) {
+//                bulk.add();
+//                bulk.last().get<0>() = p.getKey();
+                Particles.setSubset(p,0);
+                Particles.getProp<3>(p)[x] = 3.0;
+                Particles.getProp<3>(p)[y] = 3.0;
+            } else {
+//                boundary.add();
+//                boundary.last().get<0>() = p.getKey();
+                Particles.setSubset(p,1);
+                Particles.getProp<3>(p)[x] = xp[0]*xp[0]+xp[1]*xp[1];
+                Particles.getProp<3>(p)[y] = xp[0]*xp[0]-2*xp[0]*xp[1];
+            }
+            Particles.getProp<6>(p)[x] = xp[0]*xp[0]+xp[1]*xp[1];
+            Particles.getProp<6>(p)[y] = xp[0]*xp[0]-2*xp[0]*xp[1];
+            Particles.getProp<7>(p) = xp[0]+xp[1]-1.0;
+            ++it2;
+        }
+        vector_dist_subset_gpu<2, double, particle_type> Particles_bulk(Particles,0);
+        vector_dist_subset_gpu<2, double, particle_type> Particles_boundary(Particles,1);
+        auto & bulk = Particles_bulk.getIds();
+        auto & boundary = Particles_boundary.getIds();
+        auto P = getV<0>(Particles);
+        auto V = getV<1>(Particles);
+        auto RHS = getV<2>(Particles);
+        auto dV = getV<3>(Particles);
+        auto div = getV<4>(Particles);
+        auto V_star = getV<5>(Particles);
+        auto P_bulk = getV<0>(Particles_bulk);
+        auto RHS_bulk =getV<2>(Particles_bulk);
+        P_bulk = 0;
+        Derivative_x_gpu Dx(Particles, 2, rCut,sampling_factor, support_options::RADIUS);
+        Derivative_xx_gpu Dxx(Particles, 2, rCut,sampling_factor, support_options::RADIUS);
+        Derivative_yy_gpu Dyy(Particles, 2, rCut,sampling_factor, support_options::RADIUS);
+        Derivative_y_gpu Dy(Particles, 2, rCut,sampling_factor, support_options::RADIUS);
+        Derivative_x_gpu Bulk_Dx(Particles_bulk, 2, rCut,sampling_factor, support_options::RADIUS);
+        Derivative_y_gpu Bulk_Dy(Particles_bulk, 2, rCut,sampling_factor, support_options::RADIUS);
+        int n = 0, nmax = 5, ctr = 0, errctr=1, Vreset = 0;
+        double V_err=1;
+        if (Vreset == 1) {
+            P_bulk = 0;
+            P = 0;
+            Vreset = 0;
+        }
+        P=0;
+        eq_id vx,vy;
+        vx.setId(0);
+        vy.setId(1);
+        double sum, sum1, sum_k,V_err_eps=1e-3,V_err_old;
+        auto Stokes1=Dxx(V[x])+Dyy(V[x]);
+        auto Stokes2=Dxx(V[y])+Dyy(V[y]);
+        petsc_solver<double> solverPetsc;
+        //solverPetsc.setSolver(KSPGMRES);
+        //solverPetsc.setRestart(250);
+        //solverPetsc.setPreconditioner(PCJACOBI);
+        V_star=0;
+        RHS[x] = dV[x];
+        RHS[y] = dV[y];
+        while (V_err >= V_err_eps && n <= nmax) {
+            Particles.ghost_get<0>(SKIP_LABELLING);
+            RHS_bulk[x] = dV[x] + Bulk_Dx(P);
+            RHS_bulk[y] = dV[y] + Bulk_Dy(P);
+            DCPSE_scheme_gpu<equations2d2_gpu, decltype(Particles)> Solver(Particles);
+            Solver.impose(Stokes1, bulk, RHS[0], vx);
+            Solver.impose(Stokes2, bulk, RHS[1], vy);
+            Solver.impose(V[x], boundary, RHS[0], vx);
+            Solver.impose(V[y], boundary, RHS[1], vy);
+            /*auto A=Solver.getA(options_solver::STANDARD);
+            //A.getMatrixTriplets().save("Tripletes");
+            A.write("Mat_lid");*/
+            Solver.solve_with_solver(solverPetsc, V[x], V[y]);
+            Particles.ghost_get<1>(SKIP_LABELLING);
+            div = -(Dx(V[x]) + Dy(V[y]));
+            P_bulk = P + div;
+            sum = 0;
+            sum1 = 0;
+            for (int j = 0; j < bulk.size(); j++) {
+                auto p = bulk.get<0>(j);
+                sum += (Particles.getProp<5>(p)[0] - Particles.getProp<1>(p)[0]) *
+                       (Particles.getProp<5>(p)[0] - Particles.getProp<1>(p)[0]) +
+                       (Particles.getProp<5>(p)[1] - Particles.getProp<1>(p)[1]) *
+                       (Particles.getProp<5>(p)[1] - Particles.getProp<1>(p)[1]);
+                sum1 += Particles.getProp<1>(p)[0] * Particles.getProp<1>(p)[0] +
+                        Particles.getProp<1>(p)[1] * Particles.getProp<1>(p)[1];
+            }
+            sum = sqrt(sum);
+            sum1 = sqrt(sum1);
+            V_star = V;
+            v_cl.sum(sum);
+            v_cl.sum(sum1);
+            v_cl.execute();
+            V_err_old = V_err;
+            V_err = sum / sum1;
+            if (V_err > V_err_old || abs(V_err_old - V_err) < 1e-8) {
+                errctr++;
+                //alpha_P -= 0.1;
+            } else {
+                errctr = 0;
+            }
+            if (n > 3) {
+                if (errctr > 3) {
+                    std::cout << "CONVERGENCE LOOP BROKEN DUE TO INCREASE/VERY SLOW DECREASE IN ERROR" << std::endl;
+                    Vreset = 1;
+                    break;
+                } else {
+                    Vreset = 0;
+                }
+            }
+            n++;
+            if (v_cl.rank() == 0) {
+                std::cout << "Rel l2 cgs err in V = " << V_err << " at " << n << std::endl;
+            }
+        }
+        double worst1 = 0.0;
+        double worst2 = 0.0;
+        for(int j=0;j<bulk.size();j++)
+        {   auto p=bulk.get<0>(j);
+            if (fabs(Particles.getProp<6>(p)[0] - Particles.getProp<1>(p)[0]) >= worst1) {
+                worst1 = fabs(Particles.getProp<6>(p)[0] - Particles.getProp<1>(p)[0]);
+            }
+        }
+        for(int j=0;j<bulk.size();j++)
+        {   auto p=bulk.get<0>(j);
+            if (fabs(Particles.getProp<6>(p)[1] - Particles.getProp<1>(p)[1]) >= worst2) {
+                worst2 = fabs(Particles.getProp<6>(p)[1] - Particles.getProp<1>(p)[1]);
+            }
+        }
+        //Particles.deleteGhost();
+        //Particles.write("PC_subset_lid");
+        std::cout << "Maximum Analytic Error in Vx: " << worst1 << std::endl;
+        std::cout << "Maximum Analytic Error in Vy: " << worst2 << std::endl;
+        BOOST_REQUIRE(worst1 < 0.03);
+        BOOST_REQUIRE(worst2 < 0.03);
+    }
+    BOOST_AUTO_TEST_CASE(dcpse_op_subset_PC_lid2) {
+//  int rank;
+//  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+        constexpr int x = 0;
+        constexpr int y = 1;
+        size_t edgeSemiSize = 20;
+        const size_t sz[2] = {2 * edgeSemiSize+1, 2 * edgeSemiSize+1};
+        Box<2, double> box({0, 0}, {1.0,1.0});
+        size_t bc[2] = {NON_PERIODIC, NON_PERIODIC};
+        double spacing[2];
+        spacing[0] = 1.0 / (sz[0] - 1);
+        spacing[1] = 1.0 / (sz[1] - 1);
+        double rCut = 3.9 * spacing[0];
+        int ord = 2;
+        double sampling_factor = 4.0;
+        Ghost<2, double> ghost(rCut);
+        BOOST_TEST_MESSAGE("Init vector_dist...");
+        auto &v_cl = create_vcluster();
+        vector_dist<2, double, aggregate<double, VectorS<2, double>, VectorS<2, double>,VectorS<2, double>,double,VectorS<2, double>,VectorS<2, double>,double>> Particles(0, box,
+                                                                                                                                                 bc,
+                                                                                                                                                 ghost);
+        vector_dist<2, double, aggregate<double, VectorS<2, double>, VectorS<2, double>,VectorS<2, double>,double,VectorS<2, double>,VectorS<2, double>,double>> Particles_subset(Particles.getDecomposition(), 0);
+        //Init_DCPSE(Particles)
+        BOOST_TEST_MESSAGE("Init Particles...");
+        openfpm::vector<aggregate<int>> bulk;
+        openfpm::vector<aggregate<int>> boundary;
+        auto it = Particles.getGridIterator(sz);
+        size_t pointId = 0;
+        double minNormOne = 999;
+        while (it.isNext())
+        {
+            Particles.add();
+            auto key = it.get();
+            mem_id k0 = key.get(0);
+            double xp0 = k0 * spacing[0];
+            Particles.getLastPos()[0] = xp0;
+            mem_id k1 = key.get(1);
+            double yp0 = k1 * spacing[1];
+            Particles.getLastPos()[1] = yp0;
+            ++it;
+        }
+        BOOST_TEST_MESSAGE("Sync Particles across processors...");
+        Particles.ghost_get<0>();
+        auto it2 = Particles.getDomainIterator();
+        while (it2.isNext()) {
+            auto p = it2.get();
+            Point<2, double> xp = Particles.getPos(p);
+            if (xp[0] != 0 && xp[1] != 0 && xp[0] != 1.0 && xp[1] != 1.0) {
+                bulk.add();
+                bulk.last().get<0>() = p.getKey();
+                Particles.getProp<3>(p)[x] = 3.0;
+                Particles.getProp<3>(p)[y] = 3.0;
+            } else {
+                boundary.add();
+                boundary.last().get<0>() = p.getKey();
+                Particles.getProp<3>(p)[x] = xp[0]*xp[0]+xp[1]*xp[1];
+                Particles.getProp<3>(p)[y] = xp[0]*xp[0]-2*xp[0]*xp[1];
+            }
+            Particles.getProp<6>(p)[x] = xp[0]*xp[0]+xp[1]*xp[1];
+            Particles.getProp<6>(p)[y] = xp[0]*xp[0]-2*xp[0]*xp[1];
+            Particles.getProp<7>(p) = xp[0]+xp[1]-1.0;
+            ++it2;
+        }
+        for (int i = 0; i < bulk.size(); i++) {
+            Particles_subset.add();
+            Particles_subset.getLastPos()[0] = Particles.getPos(bulk.template get<0>(i))[0];
+            Particles_subset.getLastPos()[1] = Particles.getPos(bulk.template get<0>(i))[1];
+        }
+        Particles_subset.ghost_get<0>();
+        auto P = getV<0>(Particles);
+        auto V = getV<1>(Particles);
+        auto RHS = getV<2>(Particles);
+        auto dV = getV<3>(Particles);
+        auto div = getV<4>(Particles);
+        auto V_star = getV<5>(Particles);
+        auto P_bulk = getV<0>(Particles_subset);
+        auto Grad_bulk= getV<2>(Particles_subset);
+        P_bulk = 0;
+        Derivative_x Dx(Particles, 2, rCut,sampling_factor, support_options::RADIUS);
+        Derivative_x Bulk_Dx(Particles_subset, 2, rCut,sampling_factor, support_options::RADIUS);
+        Derivative_xx Dxx(Particles, 2, rCut,sampling_factor, support_options::RADIUS);
+        Derivative_yy Dyy(Particles, 2, rCut,sampling_factor, support_options::RADIUS);
+        Derivative_y Dy(Particles, 2, rCut,sampling_factor, support_options::RADIUS),Bulk_Dy(Particles_subset, 2, rCut,sampling_factor, support_options::RADIUS);;
+        int n = 0, nmax = 5, ctr = 0, errctr=0, Vreset = 0;
+        double V_err=1;
+        if (Vreset == 1) {
+            P_bulk = 0;
+            P = 0;
+            Vreset = 0;
+        }
+        P=0;
+        eq_id vx,vy;
+        vx.setId(0);
+        vy.setId(1);
+        double sum, sum1, sum_k,V_err_eps=1e-3,V_err_old;
+        auto Stokes1=Dxx(V[x])+Dyy(V[x]);
+        auto Stokes2=Dxx(V[y])+Dyy(V[y]);
+        petsc_solver<double> solverPetsc;
+        //solverPetsc.setSolver(KSPGMRES);
+        //solverPetsc.setRestart(250);
+        //solverPetsc.setPreconditioner(PCJACOBI);
+        V_star=0;
+        while (V_err >= V_err_eps && n <= nmax) {
+            RHS[x] = dV[x];
+            RHS[y] = dV[y];
+            Particles_subset.ghost_get<0>(SKIP_LABELLING);
+            Grad_bulk[x] = Bulk_Dx(P_bulk);
+            Grad_bulk[y] = Bulk_Dy(P_bulk);
+            for (int i = 0; i < bulk.size(); i++) {
+                Particles.template getProp<2>(bulk.template get<0>(i))[x] += Particles_subset.getProp<2>(i)[x];
+                Particles.template getProp<2>(bulk.template get<0>(i))[y] += Particles_subset.getProp<2>(i)[y];
+            }
+            DCPSE_scheme<equations2d2_gpu, decltype(Particles)> Solver(Particles);
+            Solver.impose(Stokes1, bulk, RHS[0], vx);
+            Solver.impose(Stokes2, bulk, RHS[1], vy);
+            Solver.impose(V[x], boundary, RHS[0], vx);
+            Solver.impose(V[y], boundary, RHS[1], vy);
+            Solver.solve_with_solver(solverPetsc, V[x], V[y]);
+            Particles.ghost_get<1>(SKIP_LABELLING);
+            div = -(Dx(V[x]) + Dy(V[y]));
+            P = P + div;
+            for (int i = 0; i < bulk.size(); i++) {
+                Particles_subset.getProp<0>(i) = Particles.template getProp<0>(bulk.template get<0>(i));
+            }
+            sum = 0;
+            sum1 = 0;
+            for (int j = 0; j < bulk.size(); j++) {
+                auto p = bulk.get<0>(j);
+                sum += (Particles.getProp<5>(p)[0] - Particles.getProp<1>(p)[0]) *
+                       (Particles.getProp<5>(p)[0] - Particles.getProp<1>(p)[0]) +
+                       (Particles.getProp<5>(p)[1] - Particles.getProp<1>(p)[1]) *
+                       (Particles.getProp<5>(p)[1] - Particles.getProp<1>(p)[1]);
+                sum1 += Particles.getProp<1>(p)[0] * Particles.getProp<1>(p)[0] +
+                        Particles.getProp<1>(p)[1] * Particles.getProp<1>(p)[1];
+            }
+            sum = sqrt(sum);
+            sum1 = sqrt(sum1);
+            V_star=V;
+            v_cl.sum(sum);
+            v_cl.sum(sum1);
+            v_cl.execute();
+            V_err_old = V_err;
+            V_err = sum / sum1;
+            if (V_err > V_err_old || abs(V_err_old - V_err) < 1e-8) {
+                errctr++;
+                //alpha_P -= 0.1;
+            } else {
+                errctr = 0;
+            }
+            if (n > 3) {
+                if (errctr > 3) {
+                    std::cout << "CONVERGENCE LOOP BROKEN DUE TO INCREASE/VERY SLOW DECREASE IN ERROR" << std::endl;
+                    Vreset = 1;
+                    break;
+                } else {
+                    Vreset = 0;
+                }
+            }
+            n++;
+            if (v_cl.rank() == 0) {
+            std::cout << "Rel l2 cgs err in V = " << V_err << " at " << n << std::endl;
+            }
+        }
+        double worst1 = 0.0;
+        double worst2 = 0.0;
+        for(int j=0;j<bulk.size();j++)
+        {   auto p=bulk.get<0>(j);
+            if (fabs(Particles.getProp<6>(p)[0] - Particles.getProp<1>(p)[0]) >= worst1) {
+                worst1 = fabs(Particles.getProp<6>(p)[0] - Particles.getProp<1>(p)[0]);
+            }
+        }
+        for(int j=0;j<bulk.size();j++)
+        {   auto p=bulk.get<0>(j);
+            if (fabs(Particles.getProp<6>(p)[1] - Particles.getProp<1>(p)[1]) >= worst2) {
+                worst2 = fabs(Particles.getProp<6>(p)[1] - Particles.getProp<1>(p)[1]);
+            }
+        }
+        std::cout << "Maximum Analytic Error in slice x: " << worst1 << std::endl;
+        std::cout << "Maximum Analytic Error in slice y: " << worst2 << std::endl;
+        BOOST_REQUIRE(worst1 < 0.03);
+        BOOST_REQUIRE(worst2 < 0.03);
+        //Particles.write("PC_subset_lid2");
+    }
+#endif //if 0
\ No newline at end of file
diff --git a/src/DCPSE/DCPSE_op/tests/DCPSE_op_test3d.cpp b/src/DCPSE/DCPSE_op/tests/DCPSE_op_test3d.cpp
index 29967137..0a28bc08 100644
--- a/src/DCPSE/DCPSE_op/tests/DCPSE_op_test3d.cpp
+++ b/src/DCPSE/DCPSE_op/tests/DCPSE_op_test3d.cpp
@@ -307,13 +307,13 @@ BOOST_AUTO_TEST_SUITE(dcpse_op_suite_tests3)
 //Is failing on Ubuntu CI with 5 cores. Needs investigation.
-/*    BOOST_AUTO_TEST_CASE(Sph_harm) {
+    BOOST_AUTO_TEST_CASE(Sph_harm) {
         //These would be a requirement once Boost releases their fix
         double nu=1.0;
-        size_t grd_sz=20;
+        size_t grd_sz=13;
         const size_t sz[3] = {grd_sz,grd_sz,grd_sz};
         Box<3, double> box({-1.0, -1.0,-1.0}, {1.0,1.0,1.0});
         size_t bc[3] = {NON_PERIODIC, NON_PERIODIC, NON_PERIODIC};
@@ -523,7 +523,7 @@ BOOST_AUTO_TEST_SUITE(dcpse_op_suite_tests3)
         int ctr = 0, errctr, Vreset = 0;
         V_err = 1;
         n = 0;
-        tt.start();
+        double solvetime=0;
         while (V_err >= V_err_eps && n <= nmax) {
@@ -540,7 +540,10 @@ BOOST_AUTO_TEST_SUITE(dcpse_op_suite_tests3)
             Solver.impose(V[0], Surface, V_B[0], vx);
             Solver.impose(V[1], Surface, V_B[1], vy);
             Solver.impose(V[2], Surface, V_B[2], vz);
+            tt.start();
             Solver.solve_with_solver(solverPetsc, V[0], V[1], V[2]);
+            tt.stop();
+            solvetime+=tt.getwct();
             //std::cout << "Stokes Solved" << std::endl;
@@ -586,7 +589,7 @@ BOOST_AUTO_TEST_SUITE(dcpse_op_suite_tests3)
+        //std::cout << "Total Solver time (wct):"<<solvetime<< std::endl;
         double worst=0;
@@ -613,19 +616,350 @@ BOOST_AUTO_TEST_SUITE(dcpse_op_suite_tests3)
-*//*        if (v_cl.rank() == 0) {
+        if (v_cl.rank() == 0) {
             std::cout<<"Gd,Surf,Bulk Size: "<<grd_sz<<","<<Surface.size()<<","<<bulk.size()<<std::endl;
             std::cout << "L2_Final: " <<sqrt(L2)<<","<<sqrt(L2/(bulk.size()+Surface.size()))
                       << std::endl;
             std::cout << "L_inf_Final: " << worst
                       << std::endl;
-        }*//*
+        }
         std::cout << "L_inf_Final_test: " << worst;
-        //Particles.write("StokesSphere");
+        Particles.write("StokesSphere");*/
-    }*/
+    }
+    BOOST_AUTO_TEST_CASE(Sph_harm_ig) {
+        BOOST_REQUIRE(openfpm::math::Y(2,1,0.5,0)+0.459674<0.00001);
+        //These would be a requirement once Boost releases their fix
+        //
+        //BOOST_REQUIRE(boost::math::legendre_p(0,-1,1)=?);
+        double nu=1.0;
+        size_t grd_sz=13;
+        const size_t sz[3] = {grd_sz,grd_sz,grd_sz};
+        Box<3, double> box({-1.0, -1.0,-1.0}, {1.0,1.0,1.0});
+        size_t bc[3] = {NON_PERIODIC, NON_PERIODIC, NON_PERIODIC};
+        double spacing = 2.0 / (sz[0] - 1);
+        double rCut = 3.9*spacing;
+        double R=1.0;
+        Ghost<3, double> ghost(rCut);
+        //                                  P        V                 v_B           RHS            V_t         P_anal              RHS2            Polar cord
+        vector_dist_ws<3, double, aggregate<double,VectorS<3, double>,VectorS<3, double>,double,VectorS<3, double>,double,double,VectorS<3, double>,VectorS<3, double>,VectorS<3, double>>> Particles(0, box, bc, ghost);
+        auto &v_cl = create_vcluster();
+//        openfpm::vector<aggregate<int>> bulk;
+//        openfpm::vector<aggregate<int>> Surface;
+        auto it = Particles.getGridIterator(sz);
+        while (it.isNext()) {
+            auto key = it.get();
+            double x = -1.0+key.get(0) * it.getSpacing(0);
+            double y = -1.0+key.get(1) * it.getSpacing(1);
+            double z = -1.0+key.get(2) * it.getSpacing(2);
+            double r=sqrt(x*x+y*y+z*z);
+            if (r<R-spacing/2.0) {
+                Particles.add();
+                Particles.getLastPos()[0] = x;
+                Particles.getLastPos()[1] = y;
+                Particles.getLastPos()[2] = z;
+                Particles.getLastProp<8>()[0] = r;
+                if (r==0){
+                    Particles.getLastProp<8>()[1] = 0.0;
+                }
+                else{
+                    Particles.getLastProp<8>()[1] = std::atan2(sqrt(x*x+y*y),z);
+                }
+                Particles.getLastProp<8>()[2] = std::atan2(y,x);
+            }
+            ++it;
+        }
+        int n_sp=int(grd_sz)*int(grd_sz)*3;
+        double Golden_angle=M_PI * (3.0 - sqrt(5.0));
+        for(int i=1;i<=n_sp;i++)
+        {
+            double y = 1.0 - (i /double(n_sp - 1.0)) * 2.0;
+            double radius = sqrt(1 - y * y);
+            double Golden_theta = Golden_angle * i;
+            double x = cos(Golden_theta) * radius;
+            double z = sin(Golden_theta) * radius;
+            if (acos(z)==0 || acos(z)==M_PI){
+                std::cout<<"Theta 0/Pi "<<std::endl;
+                continue;
+            }
+            Particles.add();
+            Particles.getLastPos()[0] = x;
+            Particles.getLastPos()[1] = y;
+            Particles.getLastPos()[2] = z;
+            Particles.getLastProp<8>()[0] = 1.0 ;
+            Particles.getLastProp<8>()[1] = std::atan2(sqrt(x*x+y*y),z);
+            Particles.getLastProp<8>()[2] = std::atan2(y,x);
+        }
+        Particles.ghost_get<0>();
+        std::unordered_map<const lm,double,key_hash,key_equal> Vr;
+        std::unordered_map<const lm,double,key_hash,key_equal> V1;
+        std::unordered_map<const lm,double,key_hash,key_equal> V2;
+        //Setting max mode l_max
+        constexpr int K = 2;
+        //Setting amplitudes to 0
+        for(int l=0;l<=K;l++){
+            for(int m=-l;m<=l;m++){
+                Vr[std::make_tuple(l,m)]=0.0;
+                V1[std::make_tuple(l,m)]=0.0;
+                V2[std::make_tuple(l,m)]=0.0;
+            }
+        }
+        //Setting some amplitude for boundary velocity
+        V1[std::make_tuple(1,0)]=1.0;
+        auto it2 = Particles.getDomainIterator();
+        while (it2.isNext()) {
+            auto p = it2.get();
+            Point<3, double> xp = Particles.getPos(p);
+            Point<3, double> xP = Particles.getProp<8>(p);
+            Particles.getProp<0>(p) =0;
+            if (xP[0]==1.0) {
+//                Surface.add();
+//                Surface.last().get<0>() = p.getKey();
+                Particles.getProp<0>(p) =  0;
+                std::vector<double> SVel;
+                SVel=openfpm::math::sumY<K>(xP[0],xP[1],xP[2],Vr,V1,V2);
+                double SP=openfpm::math::sumY_Scalar<K>(xP[0],xP[1],xP[2],Vr);
+                Particles.getProp<2>(p)[0] = SVel[0];
+                Particles.getProp<2>(p)[1] = SVel[1];
+                Particles.getProp<2>(p)[2] = SVel[2];
+                Particles.getProp<9>(p)[0] = SVel[0];
+                Particles.getProp<9>(p)[1] = SVel[1];
+                Particles.getProp<9>(p)[2] = SVel[2];
+                Particles.getProp<5>(p) = SP;
+                Particles.setSubset(p,1);
+            }
+            else {
+//                bulk.add();
+//                bulk.last().get<0>() = p.getKey();
+                Particles.setSubset(p,0);
+                Particles.getProp<0>(p) =  0;
+                Particles.getProp<1>(p)[0] =  0;
+                Particles.getProp<1>(p)[1] =  0;
+                Particles.getProp<1>(p)[2] =  0;
+            }
+            ++it2;
+        }
+        vector_dist_subset<3, double, aggregate<double,VectorS<3, double>,VectorS<3, double>,double,VectorS<3, double>,double,double,VectorS<3, double>,VectorS<3, double>,VectorS<3, double>>> Particles_bulk(Particles,0);
+        vector_dist_subset<3, double, aggregate<double,VectorS<3, double>,VectorS<3, double>,double,VectorS<3, double>,double,double,VectorS<3, double>,VectorS<3, double>,VectorS<3, double>>> Particles_surface(Particles,1);
+        auto & bulk = Particles_bulk.getIds();
+        auto & Surface = Particles_surface.getIds();
+        for (int j = 0; j < bulk.size(); j++) {
+            auto p = bulk.get<0>(j);
+            Point<3, double> xp = Particles.getPos(p);
+            Point<3, double> xP = Particles.getProp<8>(p);
+            std::unordered_map<const lm,double,key_hash,key_equal> Ur;
+            std::unordered_map<const lm,double,key_hash,key_equal> U2;
+            std::unordered_map<const lm,double,key_hash,key_equal> U1;
+            std::unordered_map<const lm,double,key_hash,key_equal> Plm;
+            for (int l = 0; l <= K; l++) {
+                for (int m = -l; m <= l; m++) {
+                    auto Er= Vr.find(std::make_tuple(l,m));
+                    auto E1= V1.find(std::make_tuple(l,m));
+                    auto E2= V2.find(std::make_tuple(l,m));
+                    std::vector<double> Sol=openfpm::math::sph_anasol_u(nu,l,m,Er->second,E1->second,E2->second,xP[0]);
+                    Ur[std::make_tuple(l,m)]=Sol[0];
+                    U1[std::make_tuple(l,m)]=Sol[1];
+                    U2[std::make_tuple(l,m)]=Sol[2];
+                    Plm[std::make_tuple(l,m)]=Sol[3];
+                }
+            }
+            if(fabs(xP[0])>=1e-5 && xP[1]>1e-5 && (M_PI-xP[1])>=1e-5)
+            {
+                std::vector<double> SVel = openfpm::math::sumY<K>(xP[0], xP[1], xP[2], Ur, U1, U2);
+                Particles.getProp<9>(p)[0] = SVel[0];
+                Particles.getProp<9>(p)[1] = SVel[1];
+                Particles.getProp<9>(p)[2] = SVel[2];
+                Particles.getProp<5>(p) = openfpm::math::sumY_Scalar<K>(xP[0], xP[1], xP[2], Plm);
+            }
+        }
+        auto P = getV<0>(Particles);
+        auto V = getV<1>(Particles);
+        auto V_B = getV<2>(Particles);
+        V.setVarId(0);
+        auto DIV = getV<3>(Particles);
+        auto V_t = getV<4>(Particles);
+        auto P_anal = getV<5>(Particles);
+        auto temp=getV<6>(Particles);
+        auto RHS = getV<7>(Particles);
+        auto P_bulk = getV<0>(Particles_bulk);
+        auto RHS_bulk = getV<7>(Particles_bulk);
+        auto V_anal = getV<9>(Particles);
+        V_t=V;
+        P=0;
+        P_bulk=0;
+        eq_id vx,vy,vz;
+        vx.setId(0);
+        vy.setId(1);
+        vz.setId(2);
+        double sampling=3.1;
+        double sampling2=1.9;
+        double rCut2=3.9*spacing;
+        Derivative_x Dx(Particles, 2, rCut,sampling, support_options::RADIUS),B_Dx(Particles_bulk, 2, rCut,sampling, support_options::RADIUS);
+        Derivative_y Dy(Particles, 2, rCut,sampling, support_options::RADIUS),B_Dy(Particles_bulk, 2, rCut,sampling, support_options::RADIUS);
+        Derivative_z Dz(Particles, 2, rCut,sampling, support_options::RADIUS),B_Dz(Particles_bulk, 2, rCut,sampling, support_options::RADIUS);
+        Derivative_xx Dxx(Particles, 2, rCut2,sampling2,support_options::RADIUS);
+        Derivative_yy Dyy(Particles, 2, rCut2,sampling2,support_options::RADIUS);
+        Derivative_zz Dzz(Particles, 2, rCut2,sampling2,support_options::RADIUS);
+        //std::cout << "DCPSE KERNELS DONE" << std::endl;
+        petsc_solver<double> solverPetsc;
+        solverPetsc.setPreconditioner(PCNONE);
+        timer tt;
+        double sum=0,sum1=0;
+        V_t=V;
+        double V_err_eps = 1e-5;
+        double V_err = 1, V_err_old;
+        int n = 0;
+        int nmax = 30;
+        int ctr = 0, errctr, Vreset = 0;
+        V_err = 1;
+        n = 0;
+        double solvetime=0;
+        while (V_err >= V_err_eps && n <= nmax) {
+            //Particles.write_frame("StokesSphere",n);
+            Particles.ghost_get<0>(SKIP_LABELLING);
+            RHS_bulk[0] = B_Dx(P);
+            RHS_bulk[1] = B_Dy(P);
+            RHS_bulk[2] = B_Dz(P);
+            DCPSE_scheme<equations3d3, decltype(Particles)> Solver(Particles);
+            auto Stokes1 = nu * (Dxx(V[0])+Dyy(V[0])+Dzz(V[0]));
+            auto Stokes2 = nu * (Dxx(V[1])+Dyy(V[1])+Dzz(V[1]));
+            auto Stokes3 = nu * (Dxx(V[2])+Dyy(V[2])+Dzz(V[2]));
+            Solver.impose(Stokes1, bulk, RHS[0], vx);
+            Solver.impose(Stokes2, bulk, RHS[1], vy);
+            Solver.impose(Stokes3, bulk, RHS[2], vz);
+            Solver.impose(V[0], Surface, V_B[0], vx);
+            Solver.impose(V[1], Surface, V_B[1], vy);
+            Solver.impose(V[2], Surface, V_B[2], vz);
+            Solver.impose_x_ig(bulk, V[0], vx);
+            Solver.impose_x_ig(bulk, V[1], vy);
+            Solver.impose_x_ig(bulk, V[2], vz);
+            Solver.impose_x_ig(Surface, V[0], vx);
+            Solver.impose_x_ig(Surface, V[1], vy);
+            Solver.impose_x_ig(Surface, V[2], vz);
+            tt.start();
+            Solver.solve_with_solver_ig(solverPetsc, V[0], V[1], V[2]);
+            tt.stop();
+            solvetime+=tt.getwct();
+            //Solver.solve(V[0],V[1],V[2]);
+            //std::cout << "Stokes Solved" << std::endl;
+            Particles.ghost_get<1>();
+            DIV = -(Dx(V[0])+Dy(V[1])+Dz(V[2]));
+            P_bulk = P + DIV;
+            sum = 0;
+            sum1 = 0;
+            for (int j = 0; j < bulk.size(); j++) {
+                auto p = bulk.get<0>(j);
+                sum += (Particles.getProp<4>(p)[0] - Particles.getProp<1>(p)[0]) *
+                       (Particles.getProp<4>(p)[0] - Particles.getProp<1>(p)[0]) +
+                       (Particles.getProp<4>(p)[1] - Particles.getProp<1>(p)[1]) *
+                       (Particles.getProp<4>(p)[1] - Particles.getProp<1>(p)[1]) +
+                       (Particles.getProp<4>(p)[2] - Particles.getProp<1>(p)[2]) *
+                       (Particles.getProp<4>(p)[2] - Particles.getProp<1>(p)[2]);
+                sum1 += Particles.getProp<1>(p)[0] * Particles.getProp<1>(p)[0] +
+                        Particles.getProp<1>(p)[1] * Particles.getProp<1>(p)[1] +
+                        Particles.getProp<1>(p)[2] * Particles.getProp<1>(p)[2];
+            }
+            sum = sqrt(sum);
+            sum1 = sqrt(sum1);
+            v_cl.sum(sum);
+            v_cl.sum(sum1);
+            v_cl.execute();
+            V_t = V;
+            Particles.ghost_get<1>(SKIP_LABELLING);
+            V_err_old = V_err;
+            V_err = sum / sum1;
+            if (V_err > V_err_old || abs(V_err_old - V_err) < 1e-14) {
+                errctr++;
+            } else {
+                errctr = 0;
+            }
+            if (n > 3) {
+                if (errctr > 1) {
+                    std::cout << "CONVERGENCE LOOP BROKEN DUE TO INCREASE/VERY SLOW DECREASE IN ERROR" << std::endl;
+                    Vreset = 1;
+                    break;
+                } else {
+                    Vreset = 0;
+                }
+            }
+            n++;
+        }
+        //std::cout << "Total Solver time (wct):"<<solvetime<< std::endl;
+        V_t=0;
+        double worst=0;
+        double L2=0;
+        for (int j = 0; j < bulk.size(); j++) {
+            auto p = bulk.get<0>(j);
+            Point<3,double> xP=Particles.getProp<8>(p);
+            if(xP[0]>=1e-5 && xP[1]>1e-5 && (M_PI-xP[1])>=1e-5)
+            {
+                double dx=Particles.getProp<9>(p)[0] - Particles.getProp<1>(p)[0];
+                double dy=Particles.getProp<9>(p)[1] - Particles.getProp<1>(p)[1];
+                double dz=Particles.getProp<9>(p)[2] - Particles.getProp<1>(p)[2];
+                Particles.getProp<4>(p)[0]=fabs(dx);
+                Particles.getProp<4>(p)[1]=fabs(dy);
+                Particles.getProp<4>(p)[2]=fabs(dz);
+                L2 += dx*dx+dy*dy+dz*dz;
+                if (std::max({fabs(dx),fabs(dy),fabs(dz)}) > worst) {
+                    worst = std::max({fabs(dx),fabs(dy),fabs(dz)});
+                }
+            }
+        }
+        v_cl.sum(worst);
+        v_cl.sum(L2);
+        v_cl.execute();
+       /* if (v_cl.rank() == 0) {
+            std::cout<<"Gd,Surf,Bulk Size: "<<grd_sz<<","<<Surface.size()<<","<<bulk.size()<<std::endl;
+            std::cout << "L2_Final: " <<sqrt(L2)<<","<<sqrt(L2/(bulk.size()+Surface.size()))
+                      << std::endl;
+            std::cout << "L_inf_Final: " << worst
+                      << std::endl;
+        }
+        std::cout << "L_inf_Final_test: " << worst;*/
+        //Particles.write("StokesSphere");
+        BOOST_REQUIRE(worst<1e-3);
+    }
diff --git a/src/DCPSE/DCPSE_op/tests/DCPSE_op_test_base_tests.cpp b/src/DCPSE/DCPSE_op/tests/DCPSE_op_test_base_tests.cpp
index 49ba1367..47286818 100644
--- a/src/DCPSE/DCPSE_op/tests/DCPSE_op_test_base_tests.cpp
+++ b/src/DCPSE/DCPSE_op/tests/DCPSE_op_test_base_tests.cpp
@@ -23,6 +23,7 @@
 #include "Operators/Vector/vector_dist_operators.hpp"
 #include "Vector/vector_dist_subset.hpp"
 #include "../EqnsStruct.hpp"
+#include "DCPSE/DcpseInterpolation.hpp"
 BOOST_AUTO_TEST_CASE(dcpse_op_tests) {
@@ -60,7 +61,7 @@ BOOST_AUTO_TEST_CASE(dcpse_op_tests) {
             domain.getLastPos()[1] = y;//+gaussian(rng);
             // Here fill the function value
             domain.template getLastProp<0>() = sin(domain.getLastPos()[0]) + sin(domain.getLastPos()[1]);
-            domain.template getLastProp<2>() = cos(domain.getLastPos()[0]) + cos(domain.getLastPos()[1]);
+            domain.template getLastProp<2>() = 2*cos(domain.getLastPos()[0]) + cos(domain.getLastPos()[1]);
@@ -68,7 +69,6 @@ BOOST_AUTO_TEST_CASE(dcpse_op_tests) {
         Derivative_x Dx(domain, 2, rCut);
         Derivative_y Dy(domain, 2, rCut);
         Gradient Grad(domain, 2, rCut);
@@ -76,7 +76,7 @@ BOOST_AUTO_TEST_CASE(dcpse_op_tests) {
         auto v = getV<1>(domain);
         auto P = getV<0>(domain);
-        v = Dx(P) + Dy(P);
+        v = 2*Dx(P) + Dy(P);
         auto it2 = domain.getDomainIterator();
         double worst = 0.0;
@@ -96,6 +96,288 @@ BOOST_AUTO_TEST_CASE(dcpse_op_tests) {
+    BOOST_AUTO_TEST_CASE(dcpse_op_save_load) {
+        size_t edgeSemiSize = 40;
+        const size_t sz[2] = {2 * edgeSemiSize, 2 * edgeSemiSize};
+        Box<2, double> box({0, 0}, {2 * M_PI, 2 * M_PI});
+        size_t bc[2] = {NON_PERIODIC, NON_PERIODIC};
+        double spacing[2];
+        spacing[0] = 2 * M_PI / (sz[0] - 1);
+        spacing[1] = 2 * M_PI / (sz[1] - 1);
+        Ghost<2, double> ghost(spacing[0] * 3.9);
+        double rCut = 3.9 * spacing[0];
+        BOOST_TEST_MESSAGE("Init vector_dist...");
+        double sigma2 = spacing[0] * spacing[1] / (2 * 4);
+        vector_dist<2, double, aggregate<double, double, double, VectorS<2, double>, VectorS<2, double>>> domain(0, box,
+                                                                                                                 bc,
+                                                                                                                 ghost);
+        //Init_DCPSE(domain)
+        BOOST_TEST_MESSAGE("Init domain...");
+        auto it = domain.getGridIterator(sz);
+        size_t pointId = 0;
+        size_t counter = 0;
+        double minNormOne = 999;
+        while (it.isNext()) {
+            domain.add();
+            auto key = it.get();
+            mem_id k0 = key.get(0);
+            double x = k0 * spacing[0];
+            domain.getLastPos()[0] = x;//+ gaussian(rng);
+            mem_id k1 = key.get(1);
+            double y = k1 * spacing[1];
+            domain.getLastPos()[1] = y;//+gaussian(rng);
+            // Here fill the function value
+            domain.template getLastProp<0>() = sin(domain.getLastPos()[0]) + sin(domain.getLastPos()[1]);
+            domain.template getLastProp<2>() = 2*cos(domain.getLastPos()[0]) + cos(domain.getLastPos()[1]);
+            ++counter;
+            ++it;
+        }
+        BOOST_TEST_MESSAGE("Sync domain across processors...");
+        domain.ghost_get<0>();
+        Derivative_x Dx(domain, 2, rCut);
+        Derivative_y Dy(domain, 2, rCut);
+        auto v = getV<1>(domain);
+        auto v2 = getV<3>(domain);
+        auto P = getV<0>(domain);
+        v2 = 2*Dx(P) + Dy(P);
+        Derivative_x DxLoaded(domain, 2, rCut,1,support_options::LOAD);
+        Derivative_y DyLoaded(domain, 2, rCut,1,support_options::LOAD);
+        DxLoaded.load(domain,"DX_test");
+        DyLoaded.load(domain,"DY_test");
+        v= 2*DxLoaded(P)+DyLoaded(P);
+        auto it2 = domain.getDomainIterator();
+        double worst = 0.0;
+        while (it2.isNext()) {
+            auto p = it2.get();
+            if (fabs(domain.getProp<1>(p) - domain.getProp<2>(p)) > worst) {
+                worst = fabs(domain.getProp<1>(p) - domain.getProp<2>(p));
+            }
+            ++it2;
+        }
+        domain.deleteGhost();
+        //std::cout<<worst;
+        BOOST_REQUIRE(worst < 0.03);
+    }
+    BOOST_AUTO_TEST_CASE(dcpse_op_save_load2) {
+        size_t edgeSemiSize = 40;
+        const size_t sz[2] = {2 * edgeSemiSize, 2 * edgeSemiSize};
+        Box<2, double> box({0, 0}, {2 * M_PI, 2 * M_PI});
+        size_t bc[2] = {NON_PERIODIC, NON_PERIODIC};
+        double spacing[2];
+        spacing[0] = 2 * M_PI / (sz[0] - 1);
+        spacing[1] = 2 * M_PI / (sz[1] - 1);
+        Ghost<2, double> ghost(spacing[0] * 3.9);
+        double rCut = 3.9 * spacing[0];
+        BOOST_TEST_MESSAGE("Init vector_dist...");
+        double sigma2 = spacing[0] * spacing[1] / (2 * 4);
+        vector_dist<2, double, aggregate<double, double, double, VectorS<2, double>, VectorS<2, double>>> domain(0, box,
+                                                                                                                 bc,
+                                                                                                                 ghost);
+        //Init_DCPSE(domain)
+        BOOST_TEST_MESSAGE("Init domain...");
+        auto it = domain.getGridIterator(sz);
+        size_t pointId = 0;
+        size_t counter = 0;
+        double minNormOne = 999;
+        while (it.isNext()) {
+            domain.add();
+            auto key = it.get();
+            mem_id k0 = key.get(0);
+            double x = k0 * spacing[0];
+            domain.getLastPos()[0] = x;//+ gaussian(rng);
+            mem_id k1 = key.get(1);
+            double y = k1 * spacing[1];
+            domain.getLastPos()[1] = y;//+gaussian(rng);
+            // Here fill the function value
+            domain.template getLastProp<0>() = sin(domain.getLastPos()[0]) + sin(domain.getLastPos()[1]);
+            domain.template getLastProp<2>() = 2*cos(domain.getLastPos()[0]) + cos(domain.getLastPos()[1]);
+            ++counter;
+            ++it;
+        }
+        BOOST_TEST_MESSAGE("Sync domain across processors...");
+        domain.ghost_get<0>();
+        auto v = getV<1>(domain);
+        auto v2 = getV<3>(domain);
+        auto P = getV<0>(domain);
+        Derivative_x DxLoaded(domain, 2, rCut,1,support_options::LOAD);
+        Derivative_y DyLoaded(domain, 2, rCut,1,support_options::LOAD);
+        DxLoaded.load(domain,"DX_test");
+        DyLoaded.load(domain,"DY_test");
+        v= 2*DxLoaded(P)+DyLoaded(P);
+        auto it2 = domain.getDomainIterator();
+        double worst = 0.0;
+        while (it2.isNext()) {
+            auto p = it2.get();
+            if (fabs(domain.getProp<1>(p) - domain.getProp<2>(p)) > worst) {
+                worst = fabs(domain.getProp<1>(p) - domain.getProp<2>(p));
+            }
+            ++it2;
+        }
+        domain.deleteGhost();
+        //std::cout<<worst;
+        BOOST_REQUIRE(worst < 0.03);
+    }
+    BOOST_AUTO_TEST_CASE(dcpse_op_tests_fa) {
+        size_t edgeSemiSize = 40;
+        const size_t sz[2] = {2 * edgeSemiSize, 2 * edgeSemiSize};
+        Box<2, double> box({0, 0}, {2 * M_PI, 2 * M_PI});
+        size_t bc[2] = {NON_PERIODIC, NON_PERIODIC};
+        double spacing[2];
+        spacing[0] = 2 * M_PI / (sz[0] - 1);
+        spacing[1] = 2 * M_PI / (sz[1] - 1);
+        Ghost<2, double> ghost(spacing[0] * 3.9);
+        double rCut = 3.9 * spacing[0];
+        BOOST_TEST_MESSAGE("Init vector_dist...");
+        double sigma2 = spacing[0] * spacing[1] / (2 * 4);
+        typedef vector_dist<2, double, aggregate<double, double, double, VectorS<2, double>, VectorS<2, double>>> vector_type;
+        vector_type domain(0, box,bc,ghost);
+        //Init_DCPSE(domain)
+        BOOST_TEST_MESSAGE("Init domain...");
+        auto it = domain.getGridIterator(sz);
+        size_t pointId = 0;
+        size_t counter = 0;
+        double minNormOne = 999;
+        while (it.isNext()) {
+            domain.add();
+            auto key = it.get();
+            mem_id k0 = key.get(0);
+            double x = k0 * spacing[0];
+            domain.getLastPos()[0] = x;//+ gaussian(rng);
+            mem_id k1 = key.get(1);
+            double y = k1 * spacing[1];
+            domain.getLastPos()[1] = y;//+gaussian(rng);
+            // Here fill the function value
+            domain.template getLastProp<0>() = sin(domain.getLastPos()[0]) + sin(domain.getLastPos()[1]);
+            domain.template getLastProp<2>() = cos(domain.getLastPos()[0]) + cos(domain.getLastPos()[1]);
+            ++counter;
+            ++it;
+        }
+        BOOST_TEST_MESSAGE("Sync domain across processors...");
+        domain.ghost_get<0>();
+        PPInterpolation<vector_type,vector_type> Fx(domain,domain, 2, rCut);
+        auto v = getV<1>(domain);
+        auto P = getV<0>(domain);
+        Fx.p2p<0,1>();
+        auto it2 = domain.getDomainIterator();
+        double worst = 0.0;
+        while (it2.isNext()) {
+            auto p = it2.get();
+            if (fabs(domain.getProp<1>(p) - domain.getProp<0>(p)) > worst) {
+                worst = fabs(domain.getProp<1>(p) - domain.getProp<0>(p));
+            }
+            ++it2;
+        }
+        //std::cout<<"Worst:"<<worst<<std::endl;
+        domain.deleteGhost();
+        //domain.write_frame("test",0,0.024,BINARY);
+        BOOST_REQUIRE(worst < 0.03);
+    }
+    BOOST_AUTO_TEST_CASE(dcpse_op_tests_mfa) {
+        size_t edgeSemiSize = 40;
+        const size_t sz[2] = {2 * edgeSemiSize, 2 * edgeSemiSize};
+        Box<2, double> box({0, 0}, {2 * M_PI, 2 * M_PI});
+        size_t bc[2] = {NON_PERIODIC, NON_PERIODIC};
+        double spacing[2];
+        spacing[0] = 2 * M_PI / (sz[0] - 1);
+        spacing[1] = 2 * M_PI / (sz[1] - 1);
+        Ghost<2, double> ghost(spacing[0] * 3.9);
+        double rCut = 3.9 * spacing[0];
+        BOOST_TEST_MESSAGE("Init vector_dist...");
+        double sigma2 = spacing[0] * spacing[1] / ( 4);
+        std::normal_distribution<> gaussian{0, sigma2};
+        std::mt19937 rng{6666666};
+        typedef vector_dist<2, double, aggregate<double, double, double, VectorS<2, double>, VectorS<2, double>>> vector_dist;
+        vector_dist domain(0, box,bc,ghost);
+        vector_dist domain2(domain.getDecomposition(),0);
+        //Init_DCPSE(domain)
+        BOOST_TEST_MESSAGE("Init domain...");
+        auto it = domain.getGridIterator(sz);
+        size_t pointId = 0;
+        size_t counter = 0;
+        double minNormOne = 999;
+        while (it.isNext()) {
+            domain.add();
+            domain2.add();
+            auto key = it.get();
+            mem_id k0 = key.get(0);
+            mem_id k1 = key.get(1);
+            double x = k0 * spacing[0];
+            double y = k1 * spacing[1];
+            domain.getLastPos()[0] = x;//+ gaussian(rng);
+            domain.getLastPos()[1] = y;//+gaussian(rng);
+            if(x!=0 && y!=0 && x!=box.getHigh(0) && y!=box.getHigh(1)){
+                domain2.getLastPos()[0] = x+ gaussian(rng);
+                domain2.getLastPos()[1] = y+ gaussian(rng);
+            }
+            else{
+                domain2.getLastPos()[0] = x;
+                domain2.getLastPos()[1] = y;
+            }
+            // Here fill the function value
+            domain.template getLastProp<0>() = sin(domain.getLastPos()[0]) + sin(domain.getLastPos()[1]);
+            domain.template getLastProp<1>() = 0.0;
+            domain2.template getLastProp<0>() = sin(domain2.getLastPos()[0]) + sin(domain2.getLastPos()[1]);
+            ++counter;
+            ++it;
+        }
+        BOOST_TEST_MESSAGE("Sync domain across processors...");
+        domain.ghost_get<0>();
+        domain2.ghost_get<0>();
+        PPInterpolation<vector_dist,vector_dist> Fx(domain2,domain, 2, rCut);
+        //auto v = getV<1>(domain);
+        //auto P = getV<0>(domain);
+        Fx.p2p<0,1>();
+        auto it2 = domain.getDomainIterator();
+        double worst = 0.0;
+        while (it2.isNext()) {
+            auto p = it2.get();
+            //domain.template getProp<2>(p) = domain.getProp<1>(p) - domain.getProp<0>(p);
+            if (fabs(domain.getProp<1>(p) - domain.getProp<0>(p)) > worst) {
+                worst = fabs(domain.getProp<1>(p) - domain.getProp<0>(p));
+            }
+            ++it2;
+        }
+        //std::cout<<"Worst:"<<worst<<std::endl;
+        domain.deleteGhost();
+        //domain.write("test1");
+        //domain2.write("test2");
+        BOOST_REQUIRE(worst < 0.03);
+    }
     BOOST_AUTO_TEST_CASE(dcpse_op_test_lap) {
         size_t edgeSemiSize = 81;
@@ -209,7 +491,7 @@ BOOST_AUTO_TEST_CASE(dcpse_op_tests) {
             domain.getLastPos()[0] = x;//+ gaussian(rng);
             mem_id k1 = key.get(1);
             double y = k1 * spacing[1];
-            domain.getLastPos()[1] = y;//+gaussian(rng);
+                domain.getLastPos()[1] = y;//+gaussian(rng);
             // Here fill the function value
             domain.template getLastProp<1>()[0] = sin(domain.getLastPos()[0]) + sin(domain.getLastPos()[1]);
             domain.template getLastProp<1>()[1] = cos(domain.getLastPos()[0]) + cos(domain.getLastPos()[1]);
diff --git a/src/DCPSE/Dcpse.cuh b/src/DCPSE/Dcpse.cuh
new file mode 100644
index 00000000..637f995b
--- /dev/null
+++ b/src/DCPSE/Dcpse.cuh
@@ -0,0 +1,1031 @@
+// Created by Serhii
+#if defined(__NVCC__) && defined(HAVE_EIGEN)
+#include "Vector/vector_dist.hpp"
+#include "MonomialBasis.hpp"
+#include "DMatrix/EMatrix.hpp"
+#include "SupportBuilder.hpp"
+#include "SupportBuilder.cuh"
+#include "Support.hpp"
+#include "Vandermonde.hpp"
+#include "DcpseDiagonalScalingMatrix.hpp"
+#include "DcpseRhs.hpp"
+#include <chrono>
+// CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cusolverDn.h>
+template<unsigned int dim, typename particles_type, typename T, typename monomialBasis_type, typename supportKey_type, typename localEps_type, typename calcKernels_type>
+__global__ void calcKernels_gpu(particles_type, monomialBasis_type, supportKey_type, supportKey_type, T**, localEps_type, size_t, calcKernels_type);
+template<unsigned int dim, typename T, typename particles_type, typename monomialBasis_type, typename supportKey_type, typename localEps_type, typename matrix_type>
+__global__ void assembleLocalMatrices_gpu( particles_type, Point<dim, unsigned int>, unsigned int, monomialBasis_type, supportKey_type, supportKey_type, supportKey_type,
+    T**, T**, localEps_type, localEps_type, matrix_type, size_t, size_t);
+template<unsigned int dim, typename vector_type, class T = typename vector_type::stype>
+class Dcpse_gpu {
+    static_assert(std::is_floating_point<T>::value, "CUBLAS supports only float or double");
+    typedef typename vector_type::value_type part_type;
+    typedef vector_type vtype;
+    #ifdef SE_CLASS1
+    int update_ctr=0;
+    #endif
+    // This works in this way:
+    // 1) User constructs this by giving a domain of points (where one of the properties is the value of our f),
+    //    the signature of the differential operator and the error order bound.
+    // 2) The machinery for assembling and solving the linear system for coefficients starts...
+    // 3) The user can then call an evaluate(point) method to get the evaluation of the differential operator
+    //    on the given point.
+    const Point<dim, unsigned int> differentialSignature;
+    const unsigned int differentialOrder;
+    MonomialBasis<dim> monomialBasis;
+    // shared local support previosly built by another operator
+    bool isSharedSupport = false;
+    openfpm::vector_custd<size_t> supportRefs; // Each MPI rank has just access to the local ones
+    openfpm::vector_custd<size_t> kerOffsets;
+    openfpm::vector_custd<size_t> supportKeys1D;
+    openfpm::vector_custd<T> localEps; // Each MPI rank has just access to the local ones
+    openfpm::vector_custd<T> localEpsInvPow; // Each MPI rank has just access to the local ones
+    openfpm::vector_custd<T> calcKernels;
+    openfpm::vector<size_t> subsetKeyPid;
+    vector_type & particles;
+    double rCut;
+    unsigned int convergenceOrder;
+    double supportSizeFactor;
+    size_t maxSupportSize;
+    size_t supportKeysTotalN;
+    support_options opt;
+#ifdef SE_CLASS1
+    int getUpdateCtr() const
+    {
+        return update_ctr;
+    }
+    // Here we require the first element of the aggregate to be:
+    // 1) the value of the function f on the point
+    Dcpse_gpu(vector_type &particles,
+          Point<dim, unsigned int> differentialSignature,
+          unsigned int convergenceOrder,
+          T rCut,
+          T supportSizeFactor = 1,
+          support_options opt = support_options::N_PARTICLES)
+        :particles(particles),
+            differentialSignature(differentialSignature),
+            differentialOrder(Monomial<dim>(differentialSignature).order()),
+            monomialBasis(differentialSignature.asArray(), convergenceOrder),
+            maxSupportSize(0),
+            supportKeysTotalN(0),
+            opt(opt)
+    {
+        particles.ghost_get_subset();
+        if (supportSizeFactor < 1) 
+            initializeAdaptive(particles, convergenceOrder, rCut);
+        else 
+            initializeStaticSize(particles, convergenceOrder, rCut, supportSizeFactor);
+    }
+    Dcpse_gpu(vector_type &particles,
+          const Dcpse_gpu<dim, vector_type, T>& other,
+          Point<dim, unsigned int> differentialSignature,
+          unsigned int convergenceOrder,
+          T rCut,
+          T supportSizeFactor = 1,
+          support_options opt = support_options::N_PARTICLES)
+        :particles(particles), opt(opt),
+            differentialSignature(differentialSignature),
+            differentialOrder(Monomial<dim>(differentialSignature).order()),
+            monomialBasis(differentialSignature.asArray(), convergenceOrder),
+            subsetKeyPid(other.subsetKeyPid),
+            supportRefs(other.supportRefs),
+            supportKeys1D(other.supportKeys1D),
+            kerOffsets(other.kerOffsets),
+            maxSupportSize(other.maxSupportSize),
+            supportKeysTotalN(other.supportKeysTotalN),
+            isSharedSupport(true)
+    {
+        particles.ghost_get_subset();
+        if (supportSizeFactor < 1)
+            initializeAdaptive(particles, convergenceOrder, rCut);
+        else
+            initializeStaticSize(particles, convergenceOrder, rCut, supportSizeFactor);
+    }
+    template<unsigned int prp>
+    void DrawKernel(vector_type &particles, int k)
+    {
+        size_t xpK = k;
+        size_t kerOff = kerOffsets.get(k);
+        size_t  supportKeysSize = kerOffsets.get(k+1)-kerOffsets.get(k);
+        size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(k)];
+        for (int i = 0; i < supportKeysSize; i++)
+        {
+            size_t xqK = supportKeys[i];
+            particles.template getProp<prp>(xqK) += calcKernels.get(kerOff+i);
+        }
+    }
+    template<unsigned int prp>
+    void DrawKernelNN(vector_type &particles, int k)
+    {
+        size_t xpK = k;
+        size_t kerOff = kerOffsets.get(k);
+        size_t  supportKeysSize = kerOffsets.get(k+1)-kerOffsets.get(k);
+        size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(k)];
+        for (int i = 0; i < supportKeysSize; i++)
+        {
+            size_t xqK = supportKeys[i];
+            particles.template getProp<prp>(xqK) = 1.0;
+        }
+    }
+    template<unsigned int prp>
+    void DrawKernel(vector_type &particles, int k, int i)
+    {
+        size_t xpK = k;
+        size_t kerOff = kerOffsets.get(k);
+        size_t  supportKeysSize = kerOffsets.get(k+1)-kerOffsets.get(k);
+        size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(k)];
+        for (int i = 0; i < supportKeysSize; i++)
+        {
+            size_t xqK = supportKeys[i];
+            particles.template getProp<prp>(xqK)[i] += calcKernels.get(kerOff+i);
+        }
+    }
+    void checkMomenta(vector_type &particles)
+    {
+        openfpm::vector<aggregate<double,double>> momenta;
+        openfpm::vector<aggregate<double,double>> momenta_accu;
+        momenta.resize(monomialBasis.size());
+        momenta_accu.resize(monomialBasis.size());
+        for (int i = 0; i < momenta.size(); i++)
+        {
+            momenta.template get<0>(i) =  3000000000.0;
+            momenta.template get<1>(i) = -3000000000.0;
+        }
+        size_t N = particles.size_local();
+        for (size_t j = 0; j < N; ++j)
+        {
+            double eps = localEps.get(j);
+            for (int i = 0; i < momenta.size(); i++)
+            {
+                momenta_accu.template get<0>(i) =  0.0;
+            }
+            size_t xpK = supportRefs.get(j);
+            Point<dim, T> xp = particles.getPos(xpK);
+            size_t kerOff = kerOffsets.get(xpK);
+            size_t  supportKeysSize = kerOffsets.get(j+1)-kerOffsets.get(j);
+            size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(j)];
+            for (int i = 0; i < supportKeysSize; i++)
+            {
+                size_t xqK = supportKeys[i];
+                Point<dim, T> xq = particles.getPos(xqK);
+                Point<dim, T> normalizedArg = (xp - xq) / eps;
+                auto ker = calcKernels.get(kerOff+i);
+                int counter = 0;
+                size_t N = monomialBasis.getElements().size();
+                for (size_t i = 0; i < N; ++i)
+                {
+                    const Monomial<dim> &m = monomialBasis.getElement(i);
+                    T mbValue = m.evaluate(normalizedArg);
+                    momenta_accu.template get<0>(counter) += mbValue * ker;
+                    ++counter;
+                }
+            }
+            for (int i = 0; i < momenta.size(); i++)
+            {
+                if (momenta_accu.template get<0>(i) < momenta.template get<0>(i))
+                {
+                    momenta.template get<0>(i) = momenta_accu.template get<0>(i);
+                }
+                if (momenta_accu.template get<1>(i) > momenta.template get<1>(i))
+                {
+                    momenta.template get<1>(i) = momenta_accu.template get<0>(i);
+                }
+            }
+        }
+        for (int i = 0; i < momenta.size(); i++)
+        {
+            std::cout << "MOMENTA: " << monomialBasis.getElements()[i] << "Min: " << momenta.template get<0>(i) << "  " << "Max: " << momenta.template get<1>(i) << std::endl;
+        }
+    }
+    /**
+     * Computes the value of the differential operator on all the particles,
+     * using the f values stored at the fValuePos position in the aggregate
+     * and storing the resulting Df values at the DfValuePos position in the aggregate.
+     * @tparam fValuePos Position in the aggregate of the f values to use.
+     * @tparam DfValuePos Position in the aggregate of the Df values to store.
+     * @param particles The set of particles to iterate over.
+     */
+    template<unsigned int fValuePos, unsigned int DfValuePos>
+    void computeDifferentialOperator(vector_type &particles) {
+        char sign = 1;
+        if (differentialOrder % 2 == 0) {
+            sign = -1;
+        }
+        size_t N = particles.size_local();
+        for (size_t j = 0; j < N; ++j)
+        {
+            double epsInvPow = localEpsInvPow.get(j);
+            T Dfxp = 0;
+            size_t xpK = supportRefs.get(j);
+            Point<dim, typename vector_type::stype> xp = particles.getPos(xpK);
+            T fxp = sign * particles.template getProp<fValuePos>(xpK);
+            size_t kerOff = kerOffsets.get(xpK);
+            size_t  supportKeysSize = kerOffsets.get(j+1)-kerOffsets.get(j);
+            size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(j)];
+            for (int i = 0; i < supportKeysSize; i++)
+            {
+                size_t xqK = supportKeys[i];
+                T fxq = particles.template getProp<fValuePos>(xqK);
+                Dfxp += (fxq + fxp) * calcKernels.get(kerOff+i);
+            }
+            Dfxp *= epsInvPow;
+            particles.template getProp<DfValuePos>(xpK) = Dfxp;
+        }
+    }
+    /*! \brief Get the number of neighbours
+     *
+     * \return the number of neighbours
+     *
+     */
+    inline int getNumNN(const vect_dist_key_dx &key)
+    {
+        return kerOffsets.get(key.getKey()+1)-kerOffsets.get(key.getKey());
+    }
+    /*! \brief Get the coefficent j (Neighbour) of the particle key
+     *
+     * \param key particle
+     * \param j neighbour
+     *
+     * \return the coefficent
+     *
+     */
+    inline T getCoeffNN(const vect_dist_key_dx &key, int j)
+    {
+        size_t base = kerOffsets.get(key.getKey());
+        return calcKernels.get(base + j);
+    }
+    /*! \brief Get the number of neighbours
+     *
+     * \return the number of neighbours
+     *
+     */
+    inline size_t getIndexNN(const vect_dist_key_dx &key, int j)
+    {
+        size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(key.getKey())];
+        return supportKeys[j];
+    }
+    inline T getSign()
+    {
+        T sign = 1.0;
+        if (differentialOrder % 2 == 0) {
+            sign = -1;
+        }
+        return sign;
+    }
+    T getEpsilonInvPrefactor(const vect_dist_key_dx &key)
+    {
+        return localEpsInvPow.get(key.getKey());
+    }
+    /**
+     * Computes the value of the differential operator for one particle for o1 representing a scalar
+     *
+     * \param key particle
+     * \param o1 source property
+     * \return the selected derivative
+     *
+     */
+    template<typename op_type>
+    auto computeDifferentialOperator(const vect_dist_key_dx &key,
+                                     op_type &o1) -> decltype(is_scalar<std::is_fundamental<decltype(o1.value(
+            key))>::value>::analyze(key, o1)) {
+        typedef decltype(is_scalar<std::is_fundamental<decltype(o1.value(key))>::value>::analyze(key, o1)) expr_type;
+        T sign = 1.0;
+        if (differentialOrder % 2 == 0) {
+            sign = -1;
+        }
+        size_t localKey = subsetKeyPid.get(key.getKey());
+        double eps = localEps.get(localKey);
+        double epsInvPow = localEpsInvPow.get(localKey);
+        auto &particles = o1.getVector();
+#ifdef SE_CLASS1
+        if(particles.getMapCtr()!=this->getUpdateCtr())
+        {
+            std::cerr<<__FILE__<<":"<<__LINE__<<" Error: You forgot a DCPSE operator update after map."<<std::endl;
+        }
+        expr_type Dfxp = 0;
+        size_t xpK = supportRefs.get(localKey);
+        Point<dim, T> xp = particles.getPos(xpK);
+        expr_type fxp = sign * o1.value(key);
+        size_t kerOff = kerOffsets.get(xpK);
+        size_t  supportKeysSize = kerOffsets.get(localKey+1)-kerOffsets.get(localKey);
+        size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(localKey)];
+        for (int i = 0; i < supportKeysSize; i++)
+        {
+            size_t xqK = supportKeys[i];
+            expr_type fxq = o1.value(vect_dist_key_dx(xqK));
+            Dfxp = Dfxp + (fxq + fxp) * calcKernels.get(kerOff+i);
+        }
+        Dfxp = Dfxp * epsInvPow;
+        // T trueDfxp = particles.template getProp<2>(xpK);
+        // Store Dfxp in the right position
+        return Dfxp;
+    }
+    /**
+     * Computes the value of the differential operator for one particle for o1 representing a vector
+     *
+     * \param key particle
+     * \param o1 source property
+     * \param i component
+     * \return the selected derivative
+     *
+     */
+    template<typename op_type>
+    auto computeDifferentialOperator(const vect_dist_key_dx &key,
+                                     op_type &o1,
+                                     int i) -> typename decltype(is_scalar<std::is_fundamental<decltype(o1.value(
+            key))>::value>::analyze(key, o1))::coord_type {
+        typedef typename decltype(is_scalar<std::is_fundamental<decltype(o1.value(key))>::value>::analyze(key, o1))::coord_type expr_type;
+        T sign = 1.0;
+        if (differentialOrder % 2 == 0) {
+            sign = -1;
+        }
+        size_t localKey = subsetKeyPid.get(key.getKey());
+        double eps = localEps.get(localKey);
+        double epsInvPow = localEpsInvPow(localKey);
+        auto &particles = o1.getVector();
+#ifdef SE_CLASS1
+        if(particles.getMapCtr()!=this->getUpdateCtr())
+        {
+            std::cerr<<__FILE__<<":"<<__LINE__<<" Error: You forgot a DCPSE operator update after map."<<std::endl;
+        }
+        expr_type Dfxp = 0;
+        size_t xpK = supportRefs.get(localKey);
+        Point<dim, T> xp = particles.getPos(xpK);
+        expr_type fxp = sign * o1.value(key)[i];
+        size_t kerOff = kerOffsets.get(xpK);
+        size_t  supportKeysSize = kerOffsets.get(localKey+1)-kerOffsets.get(localKey);
+        size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(localKey)];
+        for (int j = 0; j < supportKeysSize; j++)
+        {
+            size_t xqK = supportKeys[j];
+            expr_type fxq = o1.value(vect_dist_key_dx(xqK))[i];
+            Dfxp = Dfxp + (fxq + fxp) * calcKernels.get(kerOff+j);
+        }
+        Dfxp = Dfxp * epsInvPow;
+        //
+        //T trueDfxp = particles.template getProp<2>(xpK);
+        // Store Dfxp in the right position
+        return Dfxp;
+    }
+    void initializeUpdate(vector_type &particles)
+    {
+#ifdef SE_CLASS1
+        update_ctr=particles.getMapCtr();
+        kerOffsets.clear();
+        supportKeys1D.clear();
+        supportRefs.clear();
+        localEps.clear();
+        localEpsInvPow.clear();
+        calcKernels.clear();
+        subsetKeyPid.clear();
+        initializeStaticSize(particles, convergenceOrder, rCut, supportSizeFactor);
+    }
+    void initializeAdaptive(vector_type &particles,
+                            unsigned int convergenceOrder,
+                            double rCut) {
+        // Still need to be tested
+#ifdef SE_CLASS1
+        this->update_ctr=particles.getMapCtr();
+        if (!isSharedSupport) {
+            subsetKeyPid.resize(particles.size_local_orig());
+            supportRefs.resize(particles.size_local());
+        }
+        localEps.resize(particles.size_local());
+        localEpsInvPow.resize(particles.size_local());
+        kerOffsets.resize(particles.size_local()+1);
+        const T condVTOL = 1e2;
+        if (!isSharedSupport) {
+            SupportBuilder<vector_type,vector_type> supportBuilder(particles,particles, differentialSignature, rCut,differentialOrder == 0);
+            unsigned int requiredSupportSize = monomialBasis.size();
+            // need to resize supportKeys1D to yet unknown supportKeysTotalN
+            // add() takes too long
+            openfpm::vector<openfpm::vector<size_t>> tempSupportKeys(supportRefs.size());
+            auto it = particles.getDomainIterator();
+            while (it.isNext()) {
+                auto key_o = particles.getOriginKey(it.get());
+                subsetKeyPid.get(key_o.getKey()) = it.get().getKey();
+                Support support = supportBuilder.getSupport(it, requiredSupportSize, opt);
+                supportRefs.get(key_o.getKey()) = key_o.getKey();
+                tempSupportKeys.get(key_o.getKey()) = support.getKeys();
+                kerOffsets.get(key_o.getKey()) = supportKeysTotalN;
+                if (maxSupportSize < support.size())
+                    maxSupportSize = support.size();
+                supportKeysTotalN += support.size();
+                EMatrix<T, Eigen::Dynamic, Eigen::Dynamic> V(support.size(), monomialBasis.size());
+                // Vandermonde matrix computation
+                Vandermonde<dim, T, EMatrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+                        vandermonde(support, monomialBasis, particles,particles);
+                vandermonde.getMatrix(V);
+                T condV = conditionNumber(V, condVTOL);
+                T eps = vandermonde.getEps();
+                if (condV > condVTOL) {
+                    requiredSupportSize *= 2;
+                    std::cout << "INFO: Increasing, requiredSupportSize = " << requiredSupportSize << std::endl; // debug
+                    continue;
+                } else requiredSupportSize = monomialBasis.size();
+                ++it;
+            }
+            kerOffsets.get(supportRefs.size()) = supportKeysTotalN;
+            supportKeys1D.resize(supportKeysTotalN);
+            size_t offset = 0;
+            for (size_t i = 0; i < tempSupportKeys.size(); ++i)
+                for (size_t j = 0; j < tempSupportKeys.get(i).size(); ++j, ++offset)
+                    supportKeys1D.get(offset) = tempSupportKeys.get(i).get(j);
+        }
+        kerOffsets.hostToDevice(); supportKeys1D.hostToDevice();
+        assembleLocalMatrices(cublasDgetrfBatched, cublasDtrsmBatched);
+    }
+    void initializeAdaptive(vector_type &particles,
+                            unsigned int convergenceOrder,
+                            float rCut) {
+        // Still need to be tested
+#ifdef SE_CLASS1
+        this->update_ctr=particles.getMapCtr();
+        if (!isSharedSupport) {
+            subsetKeyPid.resize(particles.size_local_orig());
+            supportRefs.resize(particles.size_local());
+        }
+        localEps.resize(particles.size_local());
+        localEpsInvPow.resize(particles.size_local());
+        kerOffsets.resize(particles.size_local()+1);
+        const T condVTOL = 1e2;
+        if (!isSharedSupport) {
+            SupportBuilder<vector_type,vector_type> supportBuilder(particles, particles, differentialSignature, rCut, differentialOrder == 0);
+            unsigned int requiredSupportSize = monomialBasis.size();
+            // need to resize supportKeys1D to yet unknown supportKeysTotalN
+            // add() takes too long
+            openfpm::vector<openfpm::vector<size_t>> tempSupportKeys(supportRefs.size());
+            auto it = particles.getDomainIterator();
+            while (it.isNext()) {
+                auto key_o = particles.getOriginKey(it.get());
+                subsetKeyPid.get(key_o.getKey()) = it.get().getKey();
+                Support support = supportBuilder.getSupport(it, requiredSupportSize, opt);
+                supportRefs.get(key_o.getKey()) = key_o.getKey();
+                tempSupportKeys.get(key_o.getKey()) = support.getKeys();
+                kerOffsets.get(key_o.getKey()) = supportKeysTotalN;
+                if (maxSupportSize < support.size())
+                    maxSupportSize = support.size();
+                supportKeysTotalN += support.size();
+                EMatrix<T, Eigen::Dynamic, Eigen::Dynamic> V(support.size(), monomialBasis.size());
+                // Vandermonde matrix computation
+                Vandermonde<dim, T, EMatrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+                        vandermonde(support, monomialBasis, particles);
+                vandermonde.getMatrix(V);
+                T condV = conditionNumber(V, condVTOL);
+                T eps = vandermonde.getEps();
+                if (condV > condVTOL) {
+                    requiredSupportSize *= 2;
+                    std::cout << "INFO: Increasing, requiredSupportSize = " << requiredSupportSize << std::endl; // debug
+                    continue;
+                } else requiredSupportSize = monomialBasis.size();
+                ++it;
+            }
+            kerOffsets.get(supportRefs.size()) = supportKeysTotalN;
+            supportKeys1D.resize(supportKeysTotalN);
+            size_t offset = 0;
+            for (size_t i = 0; i < tempSupportKeys.size(); ++i)
+                for (size_t j = 0; j < tempSupportKeys.get(i).size(); ++j, ++offset)
+                    supportKeys1D.get(offset) = tempSupportKeys.get(i).get(j);
+        }
+        kerOffsets.hostToDevice(); supportKeys1D.hostToDevice();
+        assembleLocalMatrices(cublasSgetrfBatched, cublasStrsmBatched);
+    }
+    void initializeStaticSize(vector_type &particles,
+                              unsigned int convergenceOrder,
+                              double rCut,
+                              double supportSizeFactor) {
+#ifdef SE_CLASS1
+        this->update_ctr=particles.getMapCtr();
+        this->rCut=rCut;
+        this->supportSizeFactor=supportSizeFactor;
+        this->convergenceOrder=convergenceOrder;
+        if (!isSharedSupport) {
+            subsetKeyPid.resize(particles.size_local_orig());
+            supportRefs.resize(particles.size_local());
+        }
+        localEps.resize(particles.size_local());
+        localEpsInvPow.resize(particles.size_local());
+std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
+        auto it = particles.getDomainIterator();
+        if (opt==support_options::RADIUS) {
+            if (!isSharedSupport) {
+                while (it.isNext()) {
+                    auto key_o = it.get(); subsetKeyPid.get(particles.getOriginKey(key_o).getKey()) = key_o.getKey();
+                    supportRefs.get(key_o.getKey()) = key_o.getKey();
+                    ++it;
+                }
+                SupportBuilderGPU<vector_type> supportBuilder(particles, rCut);
+                supportBuilder.getSupport(supportRefs.size(), kerOffsets, supportKeys1D, maxSupportSize, supportKeysTotalN);
+            }
+        } else {
+            if (!isSharedSupport){
+                openfpm::vector<openfpm::vector<size_t>> tempSupportKeys(supportRefs.size());
+                size_t requiredSupportSize = monomialBasis.size() * supportSizeFactor;
+                // need to resize supportKeys1D to yet unknown supportKeysTotalN
+                // add() takes too long
+                SupportBuilder<vector_type,vector_type> supportBuilder(particles, particles, differentialSignature, rCut, differentialOrder == 0);
+                kerOffsets.resize(supportRefs.size()+1);
+                while (it.isNext()) {
+                    auto key_o = it.get(); subsetKeyPid.get(particles.getOriginKey(key_o).getKey()) = key_o.getKey();
+                    Support support = supportBuilder.getSupport(it, requiredSupportSize, opt);
+                    supportRefs.get(key_o.getKey()) = key_o.getKey();
+                    tempSupportKeys.get(key_o.getKey()) = support.getKeys();
+                    kerOffsets.get(key_o.getKey()) = supportKeysTotalN;
+                    if (maxSupportSize < support.size()) maxSupportSize = support.size();
+                    supportKeysTotalN += support.size();
+                    ++it;
+                }
+                kerOffsets.get(supportRefs.size()) = supportKeysTotalN;
+                supportKeys1D.resize(supportKeysTotalN);
+                size_t offset = 0;
+                for (size_t i = 0; i < tempSupportKeys.size(); ++i)
+                    for (size_t j = 0; j < tempSupportKeys.get(i).size(); ++j, ++offset)
+                        supportKeys1D.get(offset) = tempSupportKeys.get(i).get(j);
+            }
+            kerOffsets.hostToDevice(); supportKeys1D.hostToDevice();
+        }
+std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now();
+std::chrono::duration<double> time_span2 = std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1);
+std::cout << "Support building took " << time_span2.count() * 1000. << " milliseconds." << std::endl;
+        assembleLocalMatrices(cublasDgetrfBatched, cublasDtrsmBatched);
+    }
+    // ad hoc solution to template specialization for float/double
+    void initializeStaticSize(vector_type &particles,
+                              unsigned int convergenceOrder,
+                              float rCut,
+                              float supportSizeFactor) {
+#ifdef SE_CLASS1
+        this->update_ctr=particles.getMapCtr();
+        this->rCut=rCut;
+        this->supportSizeFactor=supportSizeFactor;
+        this->convergenceOrder=convergenceOrder;
+        if (!isSharedSupport) {
+            subsetKeyPid.resize(particles.size_local_orig());
+            supportRefs.resize(particles.size_local());
+        }
+        localEps.resize(particles.size_local());
+        localEpsInvPow.resize(particles.size_local());
+std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
+        auto it = particles.getDomainIterator();
+        if (opt==support_options::RADIUS) {
+            if (!isSharedSupport) {
+                while (it.isNext()) {
+                    auto key_o = it.get(); subsetKeyPid.get(particles.getOriginKey(key_o).getKey()) = key_o.getKey();
+                    supportRefs.get(key_o.getKey()) = key_o.getKey();
+                    ++it;
+                }
+                SupportBuilderGPU<vector_type> supportBuilder(particles, rCut);
+                supportBuilder.getSupport(supportRefs.size(), kerOffsets, supportKeys1D, maxSupportSize, supportKeysTotalN);
+            }
+        } else {
+            if (!isSharedSupport){
+                openfpm::vector<openfpm::vector<size_t>> tempSupportKeys(supportRefs.size());
+                size_t requiredSupportSize = monomialBasis.size() * supportSizeFactor;
+                // need to resize supportKeys1D to yet unknown supportKeysTotalN
+                // add() takes too long
+                SupportBuilder<vector_type,vector_type> supportBuilder(particles, particles, differentialSignature, rCut, differentialOrder == 0);
+                kerOffsets.resize(supportRefs.size()+1);
+                while (it.isNext()) {
+                    auto key_o = it.get(); subsetKeyPid.get(particles.getOriginKey(key_o).getKey()) = key_o.getKey();
+                    Support support = supportBuilder.getSupport(it, requiredSupportSize, opt);
+                    supportRefs.get(key_o.getKey()) = key_o.getKey();
+                    tempSupportKeys.get(key_o.getKey()) = support.getKeys();
+                    kerOffsets.get(key_o.getKey()) = supportKeysTotalN;
+                    if (maxSupportSize < support.size()) maxSupportSize = support.size();
+                    supportKeysTotalN += support.size();
+                    ++it;
+                }
+                kerOffsets.get(supportRefs.size()) = supportKeysTotalN;
+                supportKeys1D.resize(supportKeysTotalN);
+                size_t offset = 0;
+                for (size_t i = 0; i < tempSupportKeys.size(); ++i)
+                    for (size_t j = 0; j < tempSupportKeys.get(i).size(); ++j, ++offset)
+                        supportKeys1D.get(offset) = tempSupportKeys.get(i).get(j);
+            }
+            kerOffsets.hostToDevice(); supportKeys1D.hostToDevice();
+        }
+std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now();
+std::chrono::duration<double> time_span2 = std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1);
+std::cout << "Support building took " << time_span2.count() * 1000. << " milliseconds." << std::endl;
+        assembleLocalMatrices(cublasSgetrfBatched, cublasStrsmBatched);
+    }
+    template<typename cublasLUDec_type, typename cublasTriangSolve_type>
+    void assembleLocalMatrices(cublasLUDec_type cublasLUDecFunc, cublasTriangSolve_type cublasTriangSolveFunc) {
+        std::chrono::high_resolution_clock::time_point t3 = std::chrono::high_resolution_clock::now();
+        // move monomial basis to kernel
+        auto& basis = monomialBasis.getBasis();
+        openfpm::vector_custd<Monomial_gpu<dim>> basisTemp(basis.begin(), basis.end());
+        basisTemp.template hostToDevice();
+        MonomialBasis<dim, aggregate<Monomial_gpu<dim>>, openfpm::vector_custd_ker, memory_traits_inte> monomialBasisKernel(basisTemp.toKernel());
+        size_t numMatrices = supportRefs.size();
+        size_t monomialBasisSize = monomialBasis.size();
+        int numSMs, numSMsMult = 1;
+        cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, 0);
+        size_t numThreads = numSMs*numSMsMult*256;
+        std::cout << "numThreads " << numThreads << " numMatrices " << numMatrices << std::endl;
+        // B is an intermediate matrix
+        openfpm::vector_custd<T> BMat(numThreads * maxSupportSize * monomialBasisSize);
+        // allocate device space for A, b
+        openfpm::vector_custd<T> AMat(numMatrices*monomialBasisSize*monomialBasisSize);
+        openfpm::vector_custd<T> bVec(numMatrices*monomialBasisSize);
+        // create array of pointers to pass T** pointers to cublas subroutines
+        openfpm::vector_custd<T*> AMatPointers(numMatrices);
+        openfpm::vector_custd<T*> bVecPointers(numMatrices);
+        auto AMatKernel = AMat.toKernel(); T* AMatKernelPointer = (T*) AMatKernel.getPointer();
+        for (size_t i = 0; i < numMatrices; i++) AMatPointers.get(i) = AMatKernelPointer + i*monomialBasisSize*monomialBasisSize;
+        auto bVecKernel = bVec.toKernel(); T* bVecKernelPointer = (T*) bVecKernel.getPointer();
+        for (size_t i = 0; i < numMatrices; i++) bVecPointers.get(i) = bVecKernelPointer + i*monomialBasisSize;
+        std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
+        std::chrono::duration<double> time_span0 = std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t3);
+        std::cout << "Preallocation took " << time_span0.count() * 1000. << " milliseconds." << std::endl;
+        // assemble local matrices on GPU
+        std::chrono::high_resolution_clock::time_point t9 = std::chrono::high_resolution_clock::now();
+        particles.hostToDevicePos();
+        supportRefs.template hostToDevice();
+        AMatPointers.template hostToDevice();
+        bVecPointers.template hostToDevice();
+        auto AMatPointersKernel = AMatPointers.toKernel(); T** AMatPointersKernelPointer = (T**) AMatPointersKernel.getPointer();
+        auto bVecPointersKernel = bVecPointers.toKernel(); T** bVecPointersKernelPointer = (T**) bVecPointersKernel.getPointer();
+        assembleLocalMatrices_gpu<<<numSMsMult*numSMs, 256>>>(particles.toKernel(), differentialSignature, differentialOrder, monomialBasisKernel, supportRefs.toKernel(), kerOffsets.toKernel(), supportKeys1D.toKernel(),
+            AMatPointersKernelPointer, bVecPointersKernelPointer, localEps.toKernel(), localEpsInvPow.toKernel(), BMat.toKernel(), numMatrices, maxSupportSize);
+        localEps.template deviceToHost();
+        localEpsInvPow.template deviceToHost();
+        std::chrono::high_resolution_clock::time_point t10 = std::chrono::high_resolution_clock::now();
+        std::chrono::duration<double> time_span3 = std::chrono::duration_cast<std::chrono::duration<double>>(t10 - t9);
+        std::cout << "assembleLocalMatrices_gpu took " << time_span3.count() * 1000. << " milliseconds." << std::endl;
+        //cublas lu solver
+        std::chrono::high_resolution_clock::time_point t7 = std::chrono::high_resolution_clock::now();
+        cublasHandle_t cublas_handle; cublasCreate_v2(&cublas_handle);
+        openfpm::vector_custd<int> infoArray(numMatrices); auto infoArrayKernel = infoArray.toKernel();
+        cublasLUDecFunc(cublas_handle, monomialBasisSize, AMatPointersKernelPointer, monomialBasisSize, NULL, (int*) infoArrayKernel.getPointer(), numMatrices);
+        cudaDeviceSynchronize();
+        infoArray.template deviceToHost();
+        for (size_t i = 0; i < numMatrices; i++)
+            if (infoArray.get(i) != 0) fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
+        const double alpha = 1.f;
+        cublasTriangSolveFunc(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, monomialBasisSize, 1, &alpha, AMatPointersKernelPointer, monomialBasisSize, bVecPointersKernelPointer, monomialBasisSize, numMatrices);
+        cublasTriangSolveFunc(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, monomialBasisSize, 1, &alpha, AMatPointersKernelPointer, monomialBasisSize, bVecPointersKernelPointer, monomialBasisSize, numMatrices);
+        cudaDeviceSynchronize();
+        std::chrono::high_resolution_clock::time_point t8 = std::chrono::high_resolution_clock::now();
+        std::chrono::duration<double> time_span4 = std::chrono::duration_cast<std::chrono::duration<double>>(t8 - t7);
+        std::cout << "cublas took " << time_span4.count() * 1000. << " milliseconds." << std::endl;
+        std::chrono::high_resolution_clock::time_point t5 = std::chrono::high_resolution_clock::now();
+        // populate the calcKernels on GPU
+        calcKernels.resize(supportKeysTotalN);
+        localEps.template hostToDevice();
+        auto it2 = particles.getDomainIteratorGPU(512);
+        calcKernels_gpu<dim><<<it2.wthr,it2.thr>>>(particles.toKernel(), monomialBasisKernel, kerOffsets.toKernel(), supportKeys1D.toKernel(), bVecPointersKernelPointer, localEps.toKernel(), numMatrices, calcKernels.toKernel());
+        calcKernels.template deviceToHost();
+        std::chrono::high_resolution_clock::time_point t6 = std::chrono::high_resolution_clock::now();
+        std::chrono::duration<double> time_span5 = std::chrono::duration_cast<std::chrono::duration<double>>(t6 - t5);
+        std::cout << "calcKernels_gpu took " << time_span5.count() * 1000. << " milliseconds." << std::endl;
+        // free the resources
+        cublasDestroy_v2(cublas_handle);
+        std::chrono::high_resolution_clock::time_point t4 = std::chrono::high_resolution_clock::now();
+        std::chrono::duration<double> time_span = std::chrono::duration_cast<std::chrono::duration<double>>(t4 - t3);
+        std::cout << "Matrices inverse took " << time_span.count() * 1000. << " milliseconds." << std::endl;
+    }
+    T computeKernel(Point<dim, T> x, EMatrix<T, Eigen::Dynamic, 1> & a) const {
+        unsigned int counter = 0;
+        T res = 0, expFactor = exp(-norm2(x));
+        size_t N = monomialBasis.getElements().size();
+        for (size_t i = 0; i < N; ++i)
+        {
+            const Monomial<dim> &m = monomialBasis.getElement(i);
+            T coeff = a(counter);
+            T mbValue = m.evaluate(x);
+            res += coeff * mbValue * expFactor;
+            ++counter;
+        }
+        return res;
+    }
+    // template <unsigned int a_dim>
+    // T computeKernel(Point<dim, T> x, const T (& a) [a_dim]) const {
+    T computeKernel(Point<dim, T> x, const T* a) const {
+        unsigned int counter = 0;
+        T res = 0, expFactor = exp(-norm2(x));
+        size_t N = monomialBasis.getElements().size();
+        for (size_t i = 0; i < N; ++i)
+        {
+            const Monomial<dim> &m = monomialBasis.getElement(i);
+            T coeff = a[counter];
+            T mbValue = m.evaluate(x);
+            res += coeff * mbValue * expFactor;
+            ++counter;
+        }
+        return res;
+    }
+    T conditionNumber(const EMatrix<T, -1, -1> &V, T condTOL) const {
+        std::cout << "conditionNumber" << std::endl;
+        Eigen::JacobiSVD<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> svd(V);
+        T cond = svd.singularValues()(0)
+                 / svd.singularValues()(svd.singularValues().size() - 1);
+        if (cond > condTOL) {
+            std::cout
+                    << "WARNING: cond(V) = " << cond
+                    << " is greater than TOL = " << condTOL
+                    << ",  numPoints(V) = " << V.rows()
+                    << std::endl; // debug
+        }
+        return cond;
+    }
+template<unsigned int dim, typename T, typename particles_type, typename monomialBasis_type, typename supportKey_type, typename localEps_type, typename matrix_type>
+__global__ void assembleLocalMatrices_gpu(
+        particles_type particles, Point<dim, unsigned int> differentialSignature, unsigned int differentialOrder, monomialBasis_type monomialBasis, 
+        supportKey_type supportRefs, supportKey_type kerOffsets, supportKey_type supportKeys1D, T** h_A, T** h_b, localEps_type localEps, localEps_type localEpsInvPow,
+        matrix_type BMat, size_t numMatrices, size_t maxSupportSize)
+    {
+    auto p_key = GET_PARTICLE(particles);
+    size_t monomialBasisSize = monomialBasis.size();
+    size_t BStartPos = maxSupportSize * monomialBasisSize * p_key; T* B = &((T*)BMat.getPointer())[BStartPos];
+    const auto& basisElements = monomialBasis.getElements();
+    int rhsSign = (Monomial_gpu<dim>(differentialSignature).order() % 2 == 0) ? 1 : -1;
+    for (; 
+        p_key < numMatrices; 
+        p_key += blockDim.x * gridDim.x) 
+    {
+        Point<dim, T> xa = particles.getPos(p_key);
+        size_t  supportKeysSize = kerOffsets.get(p_key+1)-kerOffsets.get(p_key);
+        size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(p_key)];
+        size_t  xpK = supportRefs.get(p_key);
+    assert(supportKeysSize >= monomialBasis.size());
+        T FACTOR = 2, avgNeighbourSpacing = 0;
+        for (int i = 0 ; i < supportKeysSize; i++) {
+            Point<dim,T> off = xa; off -= particles.getPosOrig(supportKeys[i]);
+            for (size_t j = 0; j < dim; ++j)
+                avgNeighbourSpacing += fabs(off.value(j));
+        }
+        avgNeighbourSpacing /= supportKeysSize;
+        T eps = FACTOR * avgNeighbourSpacing;
+    assert(eps != 0);
+        localEps.get(p_key) = eps;
+        localEpsInvPow.get(p_key) = 1.0 / pow(eps,differentialOrder);
+        // EMatrix<T, Eigen::Dynamic, Eigen::Dynamic> B = E * V;
+        for (int i = 0; i < supportKeysSize; ++i)
+            for (int j = 0; j < monomialBasisSize; ++j) {
+                Point<dim,T> off = xa; off -= particles.getPosOrig(supportKeys[i]);
+                const Monomial_gpu<dim>& m = basisElements.get(j);
+                T V_ij = m.evaluate(off) / pow(eps, m.order());
+                T E_ii = exp(- norm2(off) / (2.0 * eps * eps));
+                B[i*monomialBasisSize+j] = E_ii * V_ij;
+            }
+        T sum = 0.0;
+        // EMatrix<T, Eigen::Dynamic, Eigen::Dynamic> A = B.transpose() * B;
+        for (int i = 0; i < monomialBasisSize; ++i)
+            for (int j = 0; j < monomialBasisSize; ++j) {
+                for (int k = 0; k < supportKeysSize; ++k)
+                    sum += B[k*monomialBasisSize+i] * B[k*monomialBasisSize+j];
+                h_A[p_key][i*monomialBasisSize+j] = sum; sum = 0.0;
+            }
+        // Compute RHS vector b
+        for (size_t i = 0; i < monomialBasisSize; ++i) {
+            const Monomial_gpu<dim>& dm = basisElements.get(i).getDerivative(differentialSignature);
+            h_b[p_key][i] = rhsSign * dm.evaluate(Point<dim, T>(0));
+        }
+    }
+template<unsigned int dim, typename particles_type, typename T, typename monomialBasis_type, typename supportKey_type, typename localEps_type, typename calcKernels_type>
+__global__ void calcKernels_gpu(particles_type particles, monomialBasis_type monomialBasis, supportKey_type kerOffsets, supportKey_type supportKeys1D,
+        T** h_b, localEps_type localEps, size_t numMatrices, calcKernels_type calcKernels)
+    {
+    auto p_key = GET_PARTICLE(particles);
+    Point<dim, T> xa = particles.getPos(p_key);
+    size_t  monomialBasisSize = monomialBasis.size();
+    const auto& basisElements = monomialBasis.getElements();
+    size_t  supportKeysSize = kerOffsets.get(p_key+1)-kerOffsets.get(p_key);
+    size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(p_key)];
+    T* calcKernelsLocal = &((T*)calcKernels.getPointer())[kerOffsets.get(p_key)];
+    T eps = localEps.get(p_key);
+    for (size_t j = 0; j < supportKeysSize; ++j)
+    {
+        size_t xqK = supportKeys[j];
+        Point<dim, T> xq = particles.getPosOrig(xqK);
+        Point<dim, T> offNorm = (xa - xq) / eps;
+        T expFactor = exp(-norm2(offNorm));
+        T res = 0;
+        for (size_t i = 0; i < monomialBasisSize; ++i) {
+            const Monomial_gpu<dim> &m = basisElements.get(i);
+            T mbValue = m.evaluate(offNorm);
+            T coeff = h_b[p_key][i];
+            res += coeff * mbValue * expFactor;
+        }
+        calcKernelsLocal[j] = res;
+    }
diff --git a/src/DCPSE/Dcpse.hpp b/src/DCPSE/Dcpse.hpp
index b88d514e..c43987fe 100644
--- a/src/DCPSE/Dcpse.hpp
+++ b/src/DCPSE/Dcpse.hpp
@@ -1,6 +1,7 @@
-// Created by tommaso on 29/03/19.
-// Modified by Abhinav and Pietro
+// DCPSE Created by tommaso on 29/03/19.
+// Modified, Updated and Maintained by Abhinav and Pietro
+//Surface Operators by Abhinav Singh on 07/10/2021
@@ -14,6 +15,9 @@
 #include "Vandermonde.hpp"
 #include "DcpseDiagonalScalingMatrix.hpp"
 #include "DcpseRhs.hpp"
+#include "hash_map/hopscotch_map.h"
+template<unsigned int N> struct value_t {};
 template<bool cond>
 struct is_scalar {
@@ -35,7 +39,7 @@ struct is_scalar<false> {
-template<unsigned int dim, typename vector_type>
+template<unsigned int dim, typename vector_type,typename vector_type2=vector_type>
 class Dcpse {
@@ -53,27 +57,110 @@ public:
     // 2) The machinery for assembling and solving the linear system for coefficients starts...
     // 3) The user can then call an evaluate(point) method to get the evaluation of the differential operator
     //    on the given point.
+    ////c=HOverEpsilon. Note that the Eps value is computed by <h>/c (<h>=local average spacing for each particle and its support). This factor c is used in the Vandermonde.hpp.
+    double HOverEpsilon=0.9;
     const Point<dim, unsigned int> differentialSignature;
     const unsigned int differentialOrder;
     const MonomialBasis<dim> monomialBasis;
-    //std::vector<EMatrix<T, Eigen::Dynamic, 1>> localCoefficients; // Each MPI rank has just access to the local ones
-    std::vector<Support> localSupports; // Each MPI rank has just access to the local ones
-    std::vector<T> localEps; // Each MPI rank has just access to the local ones
-    std::vector<T> localEpsInvPow; // Each MPI rank has just access to the local ones
-    std::vector<T> localSumA;
-    openfpm::vector<size_t> kerOffsets;
+    bool isSharedLocalSupport = false;
+    openfpm::vector<Support> localSupports; // Each MPI rank has just access to the local ones
+    openfpm::vector<T> localEps; // Each MPI rank has just access to the local ones
+    openfpm::vector<T> localEpsInvPow; // Each MPI rank has just access to the local ones
+    openfpm::vector<size_t> kerOffsets,accKerOffsets;
     openfpm::vector<T> calcKernels;
+    openfpm::vector<T> accCalcKernels;
+    openfpm::vector<T> nSpacings;
+    vector_type & particlesFrom;
+    vector_type2 & particlesTo;
+    double rCut,supportSizeFactor=1,nSpacing,AdapFac;
+    unsigned int convergenceOrder,nCount;
-    vector_type & particles;
-    double rCut;
-    unsigned int convergenceOrder;
-    double supportSizeFactor;
+    bool isSurfaceDerivative=false;
+    size_t initialParticleSize;
-    support_options opt;
+    support_options opt;
+    template<unsigned int NORMAL_ID>
+    void createNormalParticles(vector_type &particles)
+    {
+        particles.template ghost_get<NORMAL_ID>(SKIP_LABELLING);
+        initialParticleSize=particles.size_local_with_ghost();
+        auto it = particles.getDomainAndGhostIterator();
+        while(it.isNext()){
+            auto key=it.get();
+            Point<dim,T> xp=particles.getPos(key), Normals=particles.template getProp<NORMAL_ID>(key);
+            if(opt==support_options::ADAPTIVE)
+            {
+                nSpacing=nSpacings.get(key.getKey());
+            }
+            for(int i=1;i<=nCount;i++){
+                particles.addAtEnd();
+                for(size_t j=0;j<dim;j++)
+                {particles.getLastPosEnd()[j]=xp[j]+i*nSpacing*Normals[j];}
+                particles.addAtEnd();
+                for(size_t j=0;j<dim;j++)
+                {particles.getLastPosEnd()[j]=xp[j]-i*nSpacing*Normals[j];}
+            }
+            ++it;
+        }
+    }
+    void accumulateAndDeleteNormalParticles(vector_type &particles)
+    {
+        tsl::hopscotch_map<size_t, size_t> nMap;
+        auto it = particles.getDomainIterator();
+        auto supportsIt = localSupports.begin();
+        openfpm::vector_std<size_t> supportBuffer;
+        accCalcKernels.clear();
+        accKerOffsets.clear();
+        accKerOffsets.resize(initialParticleSize);
+        accKerOffsets.fill(-1);
+        while(it.isNext()){
+            supportBuffer.clear();
+            nMap.clear();
+            auto key=it.get();
+            Support support = *supportsIt;
+            size_t xpK = support.getReferencePointKey();
+            size_t kerOff = kerOffsets.get(xpK);
+            auto &keys = support.getKeys();
+            accKerOffsets.get(xpK)=accCalcKernels.size();
+            for (int i = 0 ; i < keys.size() ; i++)
+            {
+                size_t xqK = keys.get(i);
+                int real_particle=(xqK-initialParticleSize)/(2.*nCount);
+                if(real_particle<0)
+                    {
+                        real_particle=xqK;
+                    }
+                auto found=nMap.find(real_particle);
+                if(found!=nMap.end()){
+                    accCalcKernels.get(found->second)+=calcKernels.get(kerOff+i);
+                }
+                else{
+                    supportBuffer.add();
+                    supportBuffer.get(supportBuffer.size()-1)=real_particle;
+                    accCalcKernels.add();
+                    accCalcKernels.get(accCalcKernels.size()-1)=calcKernels.get(kerOff+i);
+                    nMap[real_particle]=accCalcKernels.size()-1;
+                }
+            }
+            keys.swap(supportBuffer);
+            localSupports.get(xpK) = support;
+            ++supportsIt;
+            ++it;
+        }
+        particles.resizeAtEnd(initialParticleSize);
+        localEps.resize(initialParticleSize);
+        localEpsInvPow.resize(initialParticleSize);
+        localSupports.resize(initialParticleSize);
+        calcKernels.swap(accCalcKernels);
+        kerOffsets.swap(accKerOffsets);
+    }
 #ifdef SE_CLASS1
     int getUpdateCtr() const
@@ -89,36 +176,124 @@ public:
           T rCut,
           T supportSizeFactor = 1,                               //Maybe change this to epsilon/h or h/epsilon = c 0.9. Benchmark
           support_options opt = support_options::RADIUS)
-		:particles(particles),
+		:particlesFrom(particles),
+         particlesTo(particles),
             monomialBasis(differentialSignature.asArray(), convergenceOrder),
-        // This 
-        particles.ghost_get_subset();
+        particles.ghost_get_subset();         // This communicates which ghost particles to be excluded from support
         if (supportSizeFactor < 1) 
-            initializeAdaptive(particles, convergenceOrder, rCut);
+            initializeAdaptive(particles, particles, convergenceOrder, rCut);
-            initializeStaticSize(particles, convergenceOrder, rCut, supportSizeFactor);
+            initializeStaticSize(particles, particles, convergenceOrder, rCut, supportSizeFactor);
+    //Surface DCPSE Constructor
+    template<unsigned int NORMAL_ID>
+    Dcpse(vector_type &particles,
+          Point<dim, unsigned int> differentialSignature,
+          unsigned int convergenceOrder,
+          T rCut,
+          T nSpacing,
+          value_t< NORMAL_ID >,
+          support_options opt = support_options::RADIUS)
+		:particlesFrom(particles),
+         particlesTo(particles),
+            differentialSignature(differentialSignature),
+            differentialOrder(Monomial<dim>(differentialSignature).order()),
+            monomialBasis(differentialSignature.asArray(), convergenceOrder),
+            opt(opt),isSurfaceDerivative(true),nSpacing(nSpacing),nCount(floor(rCut/nSpacing))
+    {
+        particles.ghost_get_subset();         // This communicates which ghost particles to be excluded from support
+         if(opt==support_options::ADAPTIVE) {
+             this->AdapFac=nSpacing;
+             if(dim==2){
+                 nCount=3;
+             }
+             else{
+                 nCount=2;
+             }
+             SupportBuilder<vector_type,vector_type2>
+                supportBuilder(particlesFrom,particlesTo, differentialSignature, rCut, differentialOrder == 0);
+                supportBuilder.setAdapFac(nSpacing);
+                auto it = particlesTo.getDomainAndGhostIterator();
+                while (it.isNext()) {
+                    auto key_o = particlesTo.getOriginKey(it.get());
+                    Support support = supportBuilder.getSupport(it,monomialBasis.size(),opt);
+                    nSpacings.add(supportBuilder.getLastMinspacing());
+                    ++it;
+                  }
+         }
+         if(opt!=support_options::LOAD) {
+             createNormalParticles<NORMAL_ID>(particles);
+#ifdef SE_CLASS1
+             particles.write("WithNormalParticlesQC");
+         }
+        initializeStaticSize(particles, particles, convergenceOrder, rCut, supportSizeFactor);
+         if(opt!=support_options::LOAD) {
+             accumulateAndDeleteNormalParticles(particles);
+         }
+    }
+    Dcpse(vector_type &particles,
+          const Dcpse<dim, vector_type>& other,
+          Point<dim, unsigned int> differentialSignature,
+          unsigned int convergenceOrder,
+          T rCut,
+          T supportSizeFactor = 1,
+          support_options opt = support_options::RADIUS)
+        :particlesFrom(particles), particlesTo(particles), opt(opt),
+            differentialSignature(differentialSignature),
+            differentialOrder(Monomial<dim>(differentialSignature).order()),
+            monomialBasis(differentialSignature.asArray(), convergenceOrder),
+            localSupports(other.localSupports),
+            isSharedLocalSupport(true)
+    {
+        particles.ghost_get_subset();
+        if (supportSizeFactor < 1)
+            initializeAdaptive(particles, particles, convergenceOrder, rCut);
+        else
+            initializeStaticSize(particles, particles, convergenceOrder, rCut, supportSizeFactor);
+    }
+    Dcpse(vector_type &particlesFrom,vector_type2 &particlesTo,
+          Point<dim, unsigned int> differentialSignature,
+          unsigned int convergenceOrder,
+          T rCut,
+          T supportSizeFactor = 1,
+          support_options opt = support_options::RADIUS)
+            :particlesFrom(particlesFrom),particlesTo(particlesTo),
+             differentialSignature(differentialSignature),
+             differentialOrder(Monomial<dim>(differentialSignature).order()),
+             monomialBasis(differentialSignature.asArray(), convergenceOrder),
+             opt(opt)
+    {
+        particlesFrom.ghost_get_subset();
+        if (supportSizeFactor < 1)
+            initializeAdaptive(particlesFrom,particlesTo,convergenceOrder, rCut);
+        else
+            initializeStaticSize(particlesFrom,particlesTo,convergenceOrder, rCut, supportSizeFactor);
+    }
     template<unsigned int prp>
     void DrawKernel(vector_type &particles, int k)
-        Support support = localSupports[k];
+        Support support = localSupports.get(k);
         size_t xpK = k;
         size_t kerOff = kerOffsets.get(k);
         auto & keys = support.getKeys();
         for (int i = 0 ; i < keys.size() ; i++)
-        	size_t xqK = keys[i];
+            size_t xqK = keys.get(i);
             particles.template getProp<prp>(xqK) += calcKernels.get(kerOff+i);
@@ -126,13 +301,13 @@ public:
     template<unsigned int prp>
     void DrawKernelNN(vector_type &particles, int k)
-        Support support = localSupports[k];
+        Support support = localSupports.get(k);
         size_t xpK = k;
         size_t kerOff = kerOffsets.get(k);
         auto & keys = support.getKeys();
         for (int i = 0 ; i < keys.size() ; i++)
-        	size_t xqK = keys[i];
+            size_t xqK = keys.get(i);
             particles.template getProp<prp>(xqK) = 1.0;
@@ -140,16 +315,138 @@ public:
     template<unsigned int prp>
     void DrawKernel(vector_type &particles, int k, int i)
-        Support support = localSupports[k];
+        Support support = localSupports.get(k);
         size_t xpK = k;
         size_t kerOff = kerOffsets.get(k);
         auto & keys = support.getKeys();
         for (int i = 0 ; i < keys.size() ; i++)
-        	size_t xqK = keys[i];
+            size_t xqK = keys.get(i);
             particles.template getProp<prp>(xqK)[i] += calcKernels.get(kerOff+i);
+    /*
+     * breif Particle to Particle Interpolation Evaluation
+     */
+    template<unsigned int prp1,unsigned int prp2>
+    void p2p()
+    {
+        typedef typename std::remove_reference<decltype(particlesTo.template getProp<prp2>(0))>::type T2;
+        auto it = particlesTo.getDomainIterator();
+        auto supportsIt = localSupports.begin();
+        auto epsItInvPow = localEpsInvPow.begin();
+        while (it.isNext()){
+            double epsInvPow = *epsItInvPow;
+            T2 Dfxp = 0;
+            Support support = *supportsIt;
+            size_t xpK = support.getReferencePointKey();
+            //Point<dim, typename vector_type::stype> xp = particlesTo.getPos(xpK);
+            //T fxp = sign * particlesTo.template getProp<fValuePos>(xpK);
+            size_t kerOff = kerOffsets.get(xpK);
+            auto & keys = support.getKeys();
+            for (int i = 0 ; i < keys.size() ; i++)
+            {
+                size_t xqK = keys.get(i);
+                T2 fxq = particlesFrom.template getProp<prp1>(xqK);
+                Dfxp += fxq * calcKernels.get(kerOff+i);
+            }
+            Dfxp = epsInvPow*Dfxp;
+            //
+            //T trueDfxp = particles.template getProp<2>(xpK);
+            // Store Dfxp in the right position
+            particlesTo.template getProp<prp2>(xpK) = Dfxp;
+            //
+            ++it;
+            ++supportsIt;
+            ++epsItInvPow;
+        }
+    }
+    /*! \brief Save the DCPSE computations
+     *
+     */
+    void save(const std::string &file){
+        auto & v_cl=create_vcluster();
+        size_t req = 0;
+		Packer<decltype(localSupports),HeapMemory>::packRequest(localSupports,req);
+        Packer<decltype(localEps),HeapMemory>::packRequest(localEps,req);
+        Packer<decltype(localEpsInvPow),HeapMemory>::packRequest(localEpsInvPow,req);
+        Packer<decltype(calcKernels),HeapMemory>::packRequest(calcKernels,req);
+        Packer<decltype(kerOffsets),HeapMemory>::packRequest(kerOffsets,req);
+		// allocate the memory
+		HeapMemory pmem;
+		//pmem.allocate(req);
+		ExtPreAlloc<HeapMemory> mem(req,pmem);
+		//Packing
+		Pack_stat sts;
+		Packer<decltype(localSupports),HeapMemory>::pack(mem,localSupports,sts);
+        Packer<decltype(localEps),HeapMemory>::pack(mem,localEps,sts);
+        Packer<decltype(localEpsInvPow),HeapMemory>::pack(mem,localEpsInvPow,sts);
+        Packer<decltype(calcKernels),HeapMemory>::pack(mem,calcKernels,sts);
+        Packer<decltype(kerOffsets),HeapMemory>::pack(mem,kerOffsets,sts);
+		// Save into a binary file
+	    std::ofstream dump (file+"_"+std::to_string(v_cl.rank()), std::ios::out | std::ios::binary);
+	    if (dump.is_open() == false)
+        {   std::cerr << __FILE__ << ":" << __LINE__ <<" Unable to write since dump is open at rank "<<v_cl.rank()<<std::endl;
+	    	return;
+            }
+	    dump.write ((const char *)pmem.getPointer(), pmem.size());
+	    return;
+    }
+    /*! \brief Load the DCPSE computations
+     *
+     *
+     */
+    void load(const std::string & file)
+	{
+        auto & v_cl=create_vcluster();
+	    std::ifstream fs (file+"_"+std::to_string(v_cl.rank()), std::ios::in | std::ios::binary | std::ios::ate );
+	    if (fs.is_open() == false)
+	    {
+	    	std::cerr << __FILE__ << ":" << __LINE__ << " error, opening file: " << file << std::endl;
+	    	return;
+	    }
+	    // take the size of the file
+	    size_t sz = fs.tellg();
+	    fs.close();
+	    // reopen the file without ios::ate to read
+	    std::ifstream input (file+"_"+std::to_string(v_cl.rank()), std::ios::in | std::ios::binary );
+	    if (input.is_open() == false)
+        {//some message here maybe
+	    	return;}
+	    // Create the HeapMemory and the ExtPreAlloc memory
+	    size_t req = 0;
+	    req += sz;
+	    HeapMemory pmem;
+		ExtPreAlloc<HeapMemory> mem(req,pmem);
+		mem.allocate(pmem.size());
+		// read
+ *)pmem.getPointer(), sz);
+	    //close the file
+	    input.close();
+		//Unpacking
+		Unpack_stat ps;
+	 	Unpacker<decltype(localSupports),HeapMemory>::unpack(mem,localSupports,ps);
+        Unpacker<decltype(localEps),HeapMemory>::unpack(mem,localEps,ps);
+        Unpacker<decltype(localEpsInvPow),HeapMemory>::unpack(mem,localEpsInvPow,ps);
+        Unpacker<decltype(calcKernels),HeapMemory>::unpack(mem,calcKernels,ps);
+        Unpacker<decltype(kerOffsets),HeapMemory>::unpack(mem,kerOffsets,ps);
+	 	return;
+	}
     void checkMomenta(vector_type &particles)
@@ -184,23 +481,24 @@ public:
             auto & keys = support.getKeys();
             for (int i = 0 ; i < keys.size() ; i++)
-            	size_t xqK = keys[i];
+                size_t xqK = keys.get(i);
                 Point<dim, T> xq = particles.getPosOrig(xqK);
                 Point<dim, T> normalizedArg = (xp - xq) / eps;
                 auto ker = calcKernels.get(kerOff+i);
                 int counter = 0;
-                for (const Monomial<dim> &m : monomialBasis.getElements())
-                {
-                    T mbValue = m.evaluate(normalizedArg);
+                size_t N = monomialBasis.getElements().size();
+                for (size_t i = 0; i < N; ++i)
+                {
+                    const Monomial<dim> &m = monomialBasis.getElement(i);
+                    T mbValue = m.evaluate(normalizedArg);
                     momenta_accu.template get<0>(counter) += mbValue * ker;
             for (int i = 0 ; i < momenta.size() ; i++)
@@ -222,9 +520,9 @@ public:
-        for (int i = 0 ; i < momenta.size() ; i++)
+        for (size_t i = 0 ; i < momenta.size() ; i++)
-            std::cout << "MOMENTA: " << monomialBasis.getElements()[i] << "Min: " << momenta.template get<0>(i) << "  " << "Max: " << momenta.template get<1>(i) << std::endl;
+            std::cout << "MOMENTA: " << monomialBasis.getElement(i) << "Min: " << momenta.template get<0>(i) << "  " << "Max: " << momenta.template get<1>(i) << std::endl;
@@ -252,13 +550,13 @@ public:
             T Dfxp = 0;
             Support support = *supportsIt;
             size_t xpK = support.getReferencePointKey();
-            Point<dim, typename vector_type::stype> xp = particles.getPos(support.getReferencePointKey());
+            //Point<dim, typename vector_type::stype> xp = particles.getPos(support.getReferencePointKey());
             T fxp = sign * particles.template getProp<fValuePos>(xpK);
             size_t kerOff = kerOffsets.get(xpK);
             auto & keys = support.getKeys();
             for (int i = 0 ; i < keys.size() ; i++)
-            	size_t xqK = keys[i];
+                size_t xqK = keys.get(i);
                 T fxq = particles.template getProp<fValuePos>(xqK);
                 Dfxp += (fxq + fxp) * calcKernels.get(kerOff+i);
@@ -283,7 +581,7 @@ public:
     inline int getNumNN(const vect_dist_key_dx &key)
-        return localSupports[key.getKey()].size();
+        return localSupports.get(key.getKey()).size();
     /*! \brief Get the coefficent j (Neighbour) of the particle key
@@ -307,14 +605,14 @@ public:
     inline size_t getIndexNN(const vect_dist_key_dx &key, int j)
-        return localSupports[key.getKey()].getKeys()[j];
+        return localSupports.get(key.getKey()).getKeys().get(j);
     inline T getSign()
         T sign = 1.0;
-        if (differentialOrder % 2 == 0) {
+        if (differentialOrder % 2 == 0 && differentialOrder!=0) {
             sign = -1;
@@ -323,7 +621,7 @@ public:
     T getEpsilonInvPrefactor(const vect_dist_key_dx &key)
-        return localEpsInvPow[key.getKey()];
+        return localEpsInvPow.get(key.getKey());
@@ -346,8 +644,8 @@ public:
             sign = -1;
-        double eps = localEps[key.getKey()];
-        double epsInvPow = localEpsInvPow[key.getKey()];
+        double eps = localEps.get(key.getKey());
+        double epsInvPow = localEpsInvPow.get(key.getKey());
         auto &particles = o1.getVector();
@@ -359,15 +657,15 @@ public:
         expr_type Dfxp = 0;
-        Support support = localSupports[key.getKey()];
+        Support support = localSupports.get(key.getKey());
         size_t xpK = support.getReferencePointKey();
-        Point<dim, T> xp = particles.getPos(xpK);
+        //Point<dim, T> xp = particles.getPos(xpK);
         expr_type fxp = sign * o1.value(key);
         size_t kerOff = kerOffsets.get(xpK);
         auto & keys = support.getKeys();
         for (int i = 0 ; i < keys.size() ; i++)
-        	size_t xqK = keys[i];
+            size_t xqK = keys.get(i);
             expr_type fxq = o1.value(vect_dist_key_dx(xqK));
             Dfxp = Dfxp + (fxq + fxp) * calcKernels.get(kerOff+i);
@@ -402,11 +700,10 @@ public:
             sign = -1;
-        double eps = localEps[key.getKey()];
-        double epsInvPow = localEpsInvPow[key.getKey()];
+        double eps = localEps.get(key.getKey());
+        double epsInvPow = localEpsInvPow.get(key.getKey());
         auto &particles = o1.getVector();
 #ifdef SE_CLASS1
@@ -415,15 +712,15 @@ public:
         expr_type Dfxp = 0;
-        Support support = localSupports[key.getKey()];
+        Support support = localSupports.get(key.getKey());
         size_t xpK = support.getReferencePointKey();
-        Point<dim, T> xp = particles.getPos(xpK);
+        //Point<dim, T> xp = particles.getPos(xpK);
         expr_type fxp = sign * o1.value(key)[i];
         size_t kerOff = kerOffsets.get(xpK);
         auto & keys = support.getKeys();
         for (int j = 0 ; j < keys.size() ; j++)
-        	size_t xqK = keys[j];
+            size_t xqK = keys.get(j);
             expr_type fxq = o1.value(vect_dist_key_dx(xqK))[i];
             Dfxp = Dfxp + (fxq + fxp) * calcKernels.get(kerOff+j);
@@ -434,128 +731,93 @@ public:
         return Dfxp;
-    void initializeUpdate(vector_type &particles)
+    void initializeUpdate(vector_type &particlesFrom,vector_type2 &particlesTo)
 #ifdef SE_CLASS1
-        update_ctr=particles.getMapCtr();
+        update_ctr=particlesFrom.getMapCtr();
-        localSupports.resize(particles.size_local_orig());
-        localEps.resize(particles.size_local_orig());
-        localEpsInvPow.resize(particles.size_local_orig());
-        kerOffsets.resize(particles.size_local_orig());
-        kerOffsets.fill(-1);
-        SupportBuilder<vector_type> supportBuilder(particles, differentialSignature, rCut);
-        unsigned int requiredSupportSize = monomialBasis.size() * supportSizeFactor;
-        auto it = particles.getDomainIterator();
-        while (it.isNext()) {
-            // Get the points in the support of the DCPSE kernel and store the support for reuse
-            //Support<vector_type> support = supportBuilder.getSupport(it, requiredSupportSize,opt);
-            Support support = supportBuilder.getSupport(it, requiredSupportSize,opt);
-            EMatrix<T, Eigen::Dynamic, Eigen::Dynamic> V(support.size(), monomialBasis.size());
-            auto key_o = particles.getOriginKey(it.get());
-            // Vandermonde matrix computation
-            Vandermonde<dim, T, EMatrix<T, Eigen::Dynamic, Eigen::Dynamic>>
-                    vandermonde(support, monomialBasis,particles);
-            vandermonde.getMatrix(V);
-            T eps = vandermonde.getEps();
-            localSupports[key_o.getKey()] = support;
-            localEps[key_o.getKey()] = eps;
-            localEpsInvPow[key_o.getKey()] = 1.0 / openfpm::math::intpowlog(eps,differentialOrder);
-            // Compute the diagonal matrix E
-            DcpseDiagonalScalingMatrix<dim> diagonalScalingMatrix(monomialBasis);
-            EMatrix<T, Eigen::Dynamic, Eigen::Dynamic> E(support.size(), support.size());
-            diagonalScalingMatrix.buildMatrix(E, support, eps, particles);
-            // Compute intermediate matrix B
-            EMatrix<T, Eigen::Dynamic, Eigen::Dynamic> B = E * V;
-            // Compute matrix A
-            EMatrix<T, Eigen::Dynamic, Eigen::Dynamic> A = B.transpose() * B;
-            // Compute RHS vector b
-            DcpseRhs<dim> rhs(monomialBasis, differentialSignature);
-            EMatrix<T, Eigen::Dynamic, 1> b(monomialBasis.size(), 1);
-            rhs.template getVector<T>(b);
-            // Get the vector where to store the coefficients...
-            EMatrix<T, Eigen::Dynamic, 1> a(monomialBasis.size(), 1);
-            // ...solve the linear system...
-            a = A.colPivHouseholderQr().solve(b);
-            // ...and store the solution for later reuse
-            kerOffsets.get(key_o.getKey()) = calcKernels.size();
+        initializeStaticSize(particlesFrom,particlesTo, convergenceOrder, rCut, supportSizeFactor);
+    }
-            Point<dim, T> xp = particles.getPosOrig(key_o);
+    void initializeUpdate(vector_type &particles)
+    {
+#ifdef SE_CLASS1
+        update_ctr=particles.getMapCtr();
-            for (auto &xqK : support.getKeys())
-            {
-                Point<dim, T> xq = particles.getPosOrig(xqK);
-                Point<dim, T> normalizedArg = (xp - xq) / eps;
+        localSupports.clear();
+        localEps.clear();
+        localEpsInvPow.clear();
+        calcKernels.clear();
+        kerOffsets.clear();
-                calcKernels.add(computeKernel(normalizedArg, a));
-            }
-            //
-            ++it;
-        }
+        initializeStaticSize(particles,particles, convergenceOrder, rCut, supportSizeFactor);
-    void initializeAdaptive(vector_type &particles,
+    void initializeAdaptive(vector_type &particlesFrom, 
+                            vector_type2 &particlesTo,
                             unsigned int convergenceOrder,
                             T rCut) {
-        SupportBuilder<vector_type>
-                supportBuilder(particles, differentialSignature, rCut);
+        SupportBuilder<vector_type,vector_type2>
+                supportBuilder(particlesFrom, particlesTo, differentialSignature, rCut, differentialOrder == 0);
         unsigned int requiredSupportSize = monomialBasis.size();
-        localSupports.resize(particles.size_local_orig());
-        localEps.resize(particles.size_local_orig());
-        localEpsInvPow.resize(particles.size_local_orig());
-        kerOffsets.resize(particles.size_local_orig());
+        if (!isSharedLocalSupport)
+            localSupports.resize(particlesTo.size_local_orig());
+        localEps.resize(particlesTo.size_local_orig());
+        localEpsInvPow.resize(particlesTo.size_local_orig());
+        kerOffsets.resize(particlesTo.size_local_orig());
-        auto it = particles.getDomainIterator();
+        auto it = particlesTo.getDomainIterator();
         while (it.isNext()) {
             const T condVTOL = 1e2;
+            auto key_o = particlesTo.getOriginKey(it.get());
+            if (!isSharedLocalSupport)
+                localSupports.get(key_o.getKey()) = supportBuilder.getSupport(it, requiredSupportSize,opt);
+            Support& support = localSupports.get(key_o.getKey());
             // Get the points in the support of the DCPSE kernel and store the support for reuse
-            Support support = supportBuilder.getSupport(it, requiredSupportSize,opt);
             EMatrix<T, Eigen::Dynamic, Eigen::Dynamic> V(support.size(), monomialBasis.size());
             // Vandermonde matrix computation
             Vandermonde<dim, T, EMatrix<T, Eigen::Dynamic, Eigen::Dynamic>>
-                    vandermonde(support, monomialBasis, particles);
+                    vandermonde(support, monomialBasis,particlesFrom, particlesTo,HOverEpsilon);
-            T condV = conditionNumber(V, condVTOL);
             T eps = vandermonde.getEps();
-            if (condV > condVTOL) {
-                requiredSupportSize *= 2;
-                std::cout
-                        << "INFO: Increasing, requiredSupportSize = " << requiredSupportSize
-                        << std::endl; // debug
-                continue;
-            } else {
-                requiredSupportSize = monomialBasis.size();
+            if (!isSharedLocalSupport) {
+                T condV = conditionNumber(V, condVTOL);
+                if (condV > condVTOL) {
+                    requiredSupportSize *= 2;
+                    std::cout
+                            << "INFO: Increasing, requiredSupportSize = " << requiredSupportSize
+                            << std::endl; // debug
+                    continue;
+                } else
+                    requiredSupportSize = monomialBasis.size();
-            auto key_o = particles.getOriginKey(it.get());
-            localSupports[key_o.getKey()] = support;
-            localEps[key_o.getKey()] = eps;
-            localEpsInvPow[key_o.getKey()] = 1.0 / openfpm::math::intpowlog(eps,differentialOrder);
+            localSupports.get(key_o.getKey()) = support;
+            localEps.get(key_o.getKey()) = eps;
+            localEpsInvPow.get(key_o.getKey()) = 1.0 / openfpm::math::intpowlog(eps,differentialOrder);
             // Compute the diagonal matrix E
             DcpseDiagonalScalingMatrix<dim> diagonalScalingMatrix(monomialBasis);
             EMatrix<T, Eigen::Dynamic, Eigen::Dynamic> E(support.size(), support.size());
-            diagonalScalingMatrix.buildMatrix(E, support, eps,particles);
+            diagonalScalingMatrix.buildMatrix(E, support, eps, particlesFrom, particlesTo);
             // Compute intermediate matrix B
             EMatrix<T, Eigen::Dynamic, Eigen::Dynamic> B = E * V;
             // Compute matrix A
@@ -571,11 +833,14 @@ private:
             // ...and store the solution for later reuse
             kerOffsets.get(key_o.getKey()) = calcKernels.size();
-            Point<dim, T> xp = particles.getPosOrig(key_o);
+            Point<dim, T> xp = particlesTo.getPosOrig(key_o);
-            for (auto &xqK : support.getKeys())
+            const auto& support_keys = support.getKeys();
+            size_t N = support_keys.size();
+            for (size_t i = 0; i < N; ++i)
-                Point<dim, T> xq = particles.getPosOrig(xqK);
+                const auto& xqK = support_keys.get(i);
+                Point<dim, T> xq = particlesFrom.getPosOrig(xqK);
                 Point<dim, T> normalizedArg = (xp - xq) / eps;
                 calcKernels.add(computeKernel(normalizedArg, a));
@@ -586,47 +851,71 @@ private:
-    void initializeStaticSize(vector_type &particles,
+    void initializeStaticSize(vector_type &particlesFrom,vector_type2 &particlesTo,
                               unsigned int convergenceOrder,
                               T rCut,
                               T supportSizeFactor) {
 #ifdef SE_CLASS1
-        this->update_ctr=particles.getMapCtr();
+        this->update_ctr=particlesFrom.getMapCtr();
-        SupportBuilder<vector_type>
-                supportBuilder(particles, differentialSignature, rCut);
+        auto & v_cl=create_vcluster();
+        if(this->opt==LOAD){
+            if(v_cl.rank()==0)
+            {std::cout<<"Warning: Creating empty DC-PSE operator! Please use update or load to get kernels."<<std::endl;}
+            return;
+        }
+        SupportBuilder<vector_type,vector_type2>
+                supportBuilder(particlesFrom,particlesTo, differentialSignature, rCut, differentialOrder == 0);
         unsigned int requiredSupportSize = monomialBasis.size() * supportSizeFactor;
+        supportBuilder.setAdapFac(AdapFac);
-        localSupports.resize(particles.size_local_orig());
-        localEps.resize(particles.size_local_orig());
-        localEpsInvPow.resize(particles.size_local_orig());
-        kerOffsets.resize(particles.size_local_orig());
-        auto it = particles.getDomainIterator();
+        if (!isSharedLocalSupport)
+            localSupports.resize(particlesTo.size_local_orig());
+        localEps.resize(particlesTo.size_local_orig());
+        localEpsInvPow.resize(particlesTo.size_local_orig());
+        kerOffsets.resize(particlesTo.size_local_orig());
+        kerOffsets.fill(-1);
+        T avgSpacingGlobal=0,avgSpacingGlobal2=0,maxSpacingGlobal=0,minSpacingGlobal=std::numeric_limits<T>::max();
+        size_t Counter=0;
+        auto it = particlesTo.getDomainIterator();
         while (it.isNext()) {
             // Get the points in the support of the DCPSE kernel and store the support for reuse
-            Support support = supportBuilder.getSupport(it, requiredSupportSize,opt);
-            EMatrix<T, Eigen::Dynamic, Eigen::Dynamic> V(support.size(), monomialBasis.size());
+            auto key_o = particlesTo.getOriginKey(it.get());
+            if (!isSharedLocalSupport)
+                localSupports.get(key_o.getKey()) = supportBuilder.getSupport(it, requiredSupportSize,opt);
-            auto key_o = particles.getOriginKey(it.get());
+            Support& support = localSupports.get(key_o.getKey());
+            EMatrix<T, Eigen::Dynamic, Eigen::Dynamic> V(support.size(), monomialBasis.size());
             // Vandermonde matrix computation
             Vandermonde<dim, T, EMatrix<T, Eigen::Dynamic, Eigen::Dynamic>>
-                    vandermonde(support, monomialBasis,particles);
+                    vandermonde(support, monomialBasis,particlesFrom,particlesTo,HOverEpsilon);
             T eps = vandermonde.getEps();
+            avgSpacingGlobal+=eps;
+            T tSpacing = vandermonde.getMinSpacing();
+            avgSpacingGlobal2+=tSpacing;
+            if(tSpacing>maxSpacingGlobal)
+            {
+                maxSpacingGlobal=tSpacing;
+            }
+            if(tSpacing<minSpacingGlobal)
+            {
+                minSpacingGlobal=tSpacing;
+            }
-            localSupports[key_o.getKey()] = support;
-            localEps[key_o.getKey()] = eps;
-            localEpsInvPow[key_o.getKey()] = 1.0 / openfpm::math::intpowlog(eps,differentialOrder);
+            localEps.get(key_o.getKey()) = eps;
+            localEpsInvPow.get(key_o.getKey()) = 1.0 / openfpm::math::intpowlog(eps,differentialOrder);
             // Compute the diagonal matrix E
             DcpseDiagonalScalingMatrix<dim> diagonalScalingMatrix(monomialBasis);
             EMatrix<T, Eigen::Dynamic, Eigen::Dynamic> E(support.size(), support.size());
-            diagonalScalingMatrix.buildMatrix(E, support, eps, particles);
+            diagonalScalingMatrix.buildMatrix(E, support, eps, particlesFrom, particlesTo);
             // Compute intermediate matrix B
             EMatrix<T, Eigen::Dynamic, Eigen::Dynamic> B = E * V;
             // Compute matrix A
@@ -643,28 +932,42 @@ private:
             // ...and store the solution for later reuse
             kerOffsets.get(key_o.getKey()) = calcKernels.size();
-            Point<dim, T> xp = particles.getPosOrig(key_o);
+            Point<dim, T> xp = particlesTo.getPosOrig(key_o);
-            for (auto &xqK : support.getKeys())
+            const auto& support_keys = support.getKeys();
+            size_t N = support_keys.size();
+            for (size_t i = 0; i < N; ++i)
-                Point<dim, T> xq = particles.getPosOrig(xqK);
+                const auto& xqK = support_keys.get(i);
+                Point<dim, T> xq = particlesFrom.getPosOrig(xqK);
                 Point<dim, T> normalizedArg = (xp - xq) / eps;
                 calcKernels.add(computeKernel(normalizedArg, a));
+            ++Counter;
-    }
+        v_cl.sum(avgSpacingGlobal);
+        v_cl.sum(avgSpacingGlobal2);
+        v_cl.max(maxSpacingGlobal);
+        v_cl.min(minSpacingGlobal);
+        v_cl.sum(Counter);
+        v_cl.execute();
+        if(v_cl.rank()==0)
+        {std::cout<<"DCPSE Operator Construction Complete. The global avg spacing in the support <h> is: "<<HOverEpsilon*avgSpacingGlobal/(T(Counter))<<" (c="<<HOverEpsilon<<"). Avg:"<<avgSpacingGlobal2/(T(Counter))<<" Range:["<<minSpacingGlobal<<","<<maxSpacingGlobal<<"]."<<std::endl;}
+    }
     T computeKernel(Point<dim, T> x, EMatrix<T, Eigen::Dynamic, 1> & a) const {
         T res = 0;
         unsigned int counter = 0;
         T expFactor = exp(-norm2(x));
-        for (const Monomial<dim> &m : monomialBasis.getElements()) {
+        size_t N = monomialBasis.getElements().size();
+        for (size_t i = 0; i < N; ++i)
+        {
+            const Monomial<dim> &m = monomialBasis.getElement(i);
             T coeff = a(counter);
             T mbValue = m.evaluate(x);
             res += coeff * mbValue * expFactor;
@@ -690,7 +993,6 @@ private:
diff --git a/src/DCPSE/DcpseDiagonalScalingMatrix.hpp b/src/DCPSE/DcpseDiagonalScalingMatrix.hpp
index ef1dbd6f..5fa97b08 100644
--- a/src/DCPSE/DcpseDiagonalScalingMatrix.hpp
+++ b/src/DCPSE/DcpseDiagonalScalingMatrix.hpp
@@ -1,5 +1,6 @@
 // Created by tommaso on 29/03/19.
+// Modified by Serhii
@@ -8,39 +9,39 @@
 #include "MonomialBasis.hpp"
 #include "Support.hpp"
-template <unsigned int dim>
+template <unsigned int dim, typename monomialBasis_type = MonomialBasis<dim>>
 class DcpseDiagonalScalingMatrix
-    const MonomialBasis<dim> monomialBasis;
+    const monomialBasis_type& monomialBasis;
+    DcpseDiagonalScalingMatrix(const monomialBasis_type &monomialBasis) : monomialBasis(monomialBasis) {}
-    DcpseDiagonalScalingMatrix(const MonomialBasis<dim> &monomialBasis) : monomialBasis(monomialBasis) {}
-    template <typename T, typename MatrixType, typename vector_type>
-    void buildMatrix(MatrixType &M, Support support, T eps, vector_type & particles)
+    template <typename T, typename MatrixType, typename vector_type, typename vector_type2>
+    void buildMatrix(MatrixType &M, Support support, T eps, vector_type & particlesFrom , vector_type2 & particlesTo)
         // Check that all the dimension constraints are met
         assert(support.size() >= monomialBasis.size());
         assert(M.rows() == support.size());
         assert(M.cols() == support.size());
-        Point<dim,typename vector_type::stype> ref_p = particles.getPosOrig(support.getReferencePointKey());
+        Point<dim,typename vector_type::stype> ref_p = particlesTo.getPosOrig(support.getReferencePointKey());
         // Fill the diagonal matrix
         M.setZero(); // Make sure the rest of the matrix is zero!
-        int i = 0;
-        for (const auto& pt : support.getKeys())
+        const auto& support_keys = support.getKeys();
+        size_t N = support_keys.size();
+        for (size_t i = 0; i < N; ++i)
+            const auto& pt = support_keys.get(i);
         	Point<dim,typename vector_type::stype> p = ref_p;
-        	p -= particles.getPosOrig(pt);
+        	p -= particlesFrom.getPosOrig(pt);
             M(i,i) = exp(- norm2(p) / (2.0 * eps * eps));
-            ++i;
diff --git a/src/DCPSE/DcpseInterpolation.hpp b/src/DCPSE/DcpseInterpolation.hpp
new file mode 100644
index 00000000..50c01ae2
--- /dev/null
+++ b/src/DCPSE/DcpseInterpolation.hpp
@@ -0,0 +1,107 @@
+// Created by Abhinav Singh on 03.11.21.
+#include "DCPSE/Dcpse.hpp"
+/*! \brief Class for Creating the DCPSE Operator For the function approximation objects and computes DCPSE Kernels.
+ *
+ *
+ * \param parts particle set
+ * \param ord order of convergence of the operator
+ * \param rCut Argument for cell list construction
+ * \param oversampling_factor multiplier to the minimum no. of particles required by the operator in support
+ * \param support_options default:N_particles, Radius can be used to select all particles inside rCut. Overrides oversampling.
+ *
+ * \return Operator Dx which is a function on Vector_dist_Expressions
+ *
+ */
+template<typename particlesFrom_type, typename particlesTo_type>
+class PPInterpolation 
+    void *dcpse;
+    particlesFrom_type & particlesFrom;
+    particlesTo_type & particlesTo;
+    /*! \brief Constructor for Creating the DCPSE Operator Dx and objects and computes DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     * \param ord order of convergence of the operator
+     * \param rCut Argument for cell list construction
+     * \param oversampling_factor multiplier to the minimum no. of particles required by the operator in support
+     * \param support_options default:N_particles, Radius can be used to select all particles inside rCut. Overrides oversampling.
+     *
+     * \return Operator F which is a function on Vector_dist_Expressions
+     *
+     */
+    PPInterpolation(particlesFrom_type &particlesFrom,particlesTo_type &particlesTo, unsigned int ord, typename particlesFrom_type::stype rCut,
+                      double oversampling_factor = dcpse_oversampling_factor,
+                      support_options opt = support_options::RADIUS)
+    :particlesFrom(particlesFrom),particlesTo(particlesTo)
+    {
+        Point<particlesFrom_type::dims, unsigned int> p;
+        dcpse = new Dcpse<particlesFrom_type::dims, particlesFrom_type,particlesTo_type>(particlesFrom,particlesTo, p, ord, rCut, oversampling_factor, opt);
+    }
+    void deallocate() {
+        delete (Dcpse<particlesFrom_type::dims, particlesFrom_type, particlesTo_type> *) dcpse;
+    }
+   /* template<typename operand_type>
+    vector_dist_expression_op<operand_type, Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype>, VECT_DCPSE>
+    operator()(operand_type arg) {
+        typedef Dcpse_type<operand_type::vtype::dims, typename operand_type::vtype> dcpse_type;
+        return vector_dist_expression_op<operand_type, dcpse_type, VECT_DCPSE>(arg, *(dcpse_type *) dcpse);
+    }*/
+   template<unsigned int prp1,unsigned int prp2>
+   void p2p() {
+       auto dcpse_temp = (Dcpse<particlesFrom_type::dims, particlesFrom_type, particlesTo_type>*) dcpse;
+       dcpse_temp->template p2p<prp1,prp2>();
+   }
+    // template<unsigned int prp, typename particles_type>
+    // void DrawKernel(particles_type &particles, int k) {
+    //     auto dcpse_temp = (Dcpse_type<particlesFrom_type::dims, particlesFrom_type, particlesTo_type> *) dcpse;
+    //     dcpse_temp->template DrawKernel<prp>(particles, k);
+    // }
+    // template<unsigned int prp, typename particles_type>
+    // void DrawKernelNN(particles_type &particles, int k) {
+    //     auto dcpse_temp = (Dcpse_type<particlesFrom_type::dims, particlesFrom_type,particlesTo_type> *) dcpse;
+    //     dcpse_temp->template DrawKernelNN<prp>(particles, k);
+    // }
+    // template<typename particles_type>
+    // void checkMomenta(particles_type &particles) {
+    //     auto dcpse_temp = (Dcpse_type<particles_type::dims, particlesFrom_type, particlesTo_type> *) dcpse;
+    //     dcpse_temp->checkMomenta(particles);
+    // }
+    /*! \brief Method for Updating the DCPSE Operator by recomputing DCPSE Kernels.
+     *
+     *
+     * \param parts particle set
+     */
+    void update() {
+        auto dcpse_temp = (Dcpse<particlesFrom_type::dims, particlesFrom_type, particlesTo_type> *) dcpse;
+        dcpse_temp->initializeUpdate(particlesFrom,particlesTo);
+    }
diff --git a/src/DCPSE/Monomial.cuh b/src/DCPSE/Monomial.cuh
new file mode 100644
index 00000000..f3b892e5
--- /dev/null
+++ b/src/DCPSE/Monomial.cuh
@@ -0,0 +1,204 @@
+// Created by Serhii
+#include "Space/Shape/Point.hpp"
+template<unsigned int dim>
+class Monomial_gpu
+    unsigned int sum = 0;
+    unsigned int exponents[dim];
+    unsigned int scalar = 1;
+    __host__ __device__ Monomial_gpu();
+    __host__ __device__ Monomial_gpu(const Monomial_gpu<dim> &other);
+    __host__ __device__ Monomial_gpu(const Monomial<dim> &other);
+    __host__ __device__ explicit Monomial_gpu(const Point<dim, unsigned int> &other, unsigned int scalar = 1);
+    __host__ __device__ explicit Monomial_gpu(const Point<dim, long int> &other, unsigned int scalar = 1);
+    __host__ __device__ explicit Monomial_gpu(const unsigned int other[dim]);
+    __host__ __device__ Monomial_gpu<dim> &operator=(const Monomial_gpu<dim> &other);
+    __host__ __device__ Monomial_gpu<dim> &operator=(const Monomial<dim> &other);
+    __host__ __device__ bool operator==(const Monomial_gpu<dim> &other) const;
+    __host__ __device__ void swap(const Monomial_gpu<dim> &other);
+    __host__ __device__ unsigned int order() const;
+    __host__ __device__ unsigned int getExponent(unsigned int i) const;
+    __host__ __device__ void setExponent(unsigned int i, unsigned int value);
+    __host__ __device__ Monomial_gpu<dim> getDerivative(const Point<dim, unsigned int> differentialOrder) const;
+    __host__ __device__ unsigned int getScalar() const { return scalar; }
+    template<typename T> __host__ __device__ T evaluate(const Point<dim, T> x) const;
+    template<typename T> __host__ __device__ T evaluate(const T (&x)[dim]) const;
+    __host__ __device__ void updateSum();
+template<unsigned int dim>
+__host__ __device__ Monomial_gpu<dim>::Monomial_gpu()
+    for (size_t i = 0; i < dim; ++i) exponents[i] = 0;
+    sum = 0;
+template<unsigned int dim>
+__host__ __device__ Monomial_gpu<dim>::Monomial_gpu(const Point<dim, unsigned int> &other, unsigned int scalar) : scalar(scalar)
+    for (size_t i = 0; i < other.nvals; ++i)
+        exponents[i] = other.value(i);
+    updateSum();
+template<unsigned int dim>
+__host__ __device__ Monomial_gpu<dim>::Monomial_gpu(const Point<dim, long int> &other, unsigned int scalar) : scalar(scalar)
+    for (size_t i = 0; i < other.nvals; ++i)
+        exponents[i] = other.value(i);
+    updateSum();
+template<unsigned int dim>
+__host__ __device__ Monomial_gpu<dim>::Monomial_gpu(const unsigned int other[dim]) : Monomial_gpu(Point<dim, unsigned int>(other))
+    for (size_t i = 0; i < dim; ++i)
+       exponents[i] = other[i];
+    updateSum();
+template<unsigned int dim>
+__host__ __device__ Monomial_gpu<dim>::Monomial_gpu(const Monomial_gpu<dim> &other)
+        : sum(other.sum), scalar(other.scalar) 
+    for (size_t i = 0; i < dim; ++i)
+       exponents[i] = other.exponents[i];
+template<unsigned int dim>
+__host__ __device__ Monomial_gpu<dim>::Monomial_gpu(const Monomial<dim> &other)
+        : sum(other.order()), scalar(other.getScalar())
+    for (size_t i = 0; i < dim; ++i)
+       exponents[i] = other.getExponent(i);
+template<unsigned int dim>
+__host__ __device__ Monomial_gpu<dim> &Monomial_gpu<dim>::operator=(const Monomial_gpu<dim> &other)
+    for (size_t i = 0; i < dim; ++i)
+       exponents[i] = other.exponents[i];
+    sum = other.sum;
+    scalar = other.scalar;
+    return *this;
+template<unsigned int dim>
+__host__ __device__ Monomial_gpu<dim> &Monomial_gpu<dim>::operator=(const Monomial<dim> &other)
+    for (size_t i = 0; i < dim; ++i)
+       exponents[i] = other.getExponent(i);
+    sum = other.order();
+    scalar = other.getScalar();
+    return *this;
+template<unsigned int dim>
+__host__ __device__ void Monomial_gpu<dim>::updateSum()
+    sum = 0;
+    for (unsigned int i = 0; i < dim; ++i)
+        sum += exponents[i];
+template<unsigned int dim>
+__host__ __device__ unsigned int Monomial_gpu<dim>::order() const
+    return sum;
+template<unsigned int dim>
+__host__ __device__ unsigned int Monomial_gpu<dim>::getExponent(unsigned int i) const
+    return exponents[i];
+template<unsigned int dim>
+__host__ __device__ void Monomial_gpu<dim>::setExponent(unsigned int i, unsigned int value)
+    exponents[i] = value;
+    updateSum();
+template<unsigned int dim>
+__host__ __device__ bool Monomial_gpu<dim>::operator==
+        (const Monomial_gpu<dim> &other) const
+    bool EQ = true;
+    for (size_t i = 0; i < dim; ++i)
+        if (exponents[i] != other[i])
+            EQ = false;
+    return EQ && (scalar == other.scalar);
+template<unsigned int dim>
+template<typename T>
+__host__ __device__ T Monomial_gpu<dim>::evaluate(const Point<dim, T> x) const
+    T res = scalar;
+    for (unsigned int i = 0; i < dim; ++i)
+        res *= pow(x[i], getExponent(i));
+    return res;
+template<unsigned int dim>
+template<typename T>
+__host__ __device__ T Monomial_gpu<dim>::evaluate(const T (& x) [dim]) const
+    T res = scalar;
+    for (unsigned int i = 0; i < dim; ++i)
+        res *= pow(x[i], getExponent(i));
+    return res;
+template<unsigned int dim>
+__host__ __device__ Monomial_gpu<dim> Monomial_gpu<dim>::getDerivative(const Point<dim, unsigned int> differentialOrder) const
+    unsigned int s = scalar;
+    Point<dim, unsigned int> e(exponents);
+    for (unsigned int i = 0; i < dim; ++i)
+    {
+        unsigned int origExp = e.value(i);
+        int targetExp = static_cast<int>(origExp) - static_cast<int>(differentialOrder.value(i));
+        for (int k = origExp; k > targetExp && k >= 0; --k)
+        {
+            s *= k;
+        }
+        e.get(i) = static_cast<unsigned int>((targetExp < 0) ? 0 : targetExp);
+    }
+    return Monomial_gpu(e, s);
+template<unsigned int dim>
+__host__ __device__ void Monomial_gpu<dim>::swap(const Monomial_gpu<dim> &other)
+    sum = other.sum;
+    scalar = other.scalar;
+    for (size_t i = 0; i < dim; ++i)
+       exponents[i] = other.exponents[i];
diff --git a/src/DCPSE/Monomial.hpp b/src/DCPSE/Monomial.hpp
index 6d9b19ff..11370a5d 100644
--- a/src/DCPSE/Monomial.hpp
+++ b/src/DCPSE/Monomial.hpp
@@ -55,6 +55,7 @@ public:
         return lhs << rhs.scalar << " : " << rhs.exponents.toString();
+    __host__ __device__ unsigned int getScalar() const { return scalar; }
     void updateSum();
diff --git a/src/DCPSE/MonomialBasis.hpp b/src/DCPSE/MonomialBasis.hpp
index c9390a9d..381b975a 100644
--- a/src/DCPSE/MonomialBasis.hpp
+++ b/src/DCPSE/MonomialBasis.hpp
@@ -5,45 +5,51 @@
-#include <vector>
+#include "Vector/map_vector.hpp"
 #include <Grid/grid_sm.hpp>
 #include <Grid/iterators/grid_key_dx_iterator_sub_bc.hpp>
 #include "Monomial.hpp"
+#include "Monomial.cuh"
-template<unsigned int dim>
+template<unsigned int dim, typename T = Monomial<dim>, template<typename, template<typename...> class...> class vector_type = openfpm::vector_std, template<typename...> class... Args>
 class MonomialBasis
-    std::vector<Monomial<dim>> basis;
+    vector_type<T, Args...> basis;
-    MonomialBasis(const std::vector<unsigned int> &degrees, unsigned int convergenceOrder);
+    MonomialBasis() {}
+    MonomialBasis(const vector_type<unsigned int, Args...> &degrees, unsigned int convergenceOrder);
     MonomialBasis(unsigned int degrees[dim], unsigned int convergenceOrder);
 //    explicit MonomialBasis(Point<dim, unsigned int> degrees, unsigned int convergenceOrder);
-    explicit MonomialBasis(const std::vector<Monomial<dim>> &basis) : basis(basis) {}
+    __host__ __device__ explicit MonomialBasis(const vector_type<T, Args...> &basis) : basis(basis) {}
-    MonomialBasis(const MonomialBasis &other);
+    __host__ __device__ MonomialBasis(const MonomialBasis &other);
-    MonomialBasis &operator=(const MonomialBasis &other);
+    __host__ __device__ MonomialBasis &operator=(const MonomialBasis &other);
-    unsigned int size() const;
+    __host__ __device__ unsigned int size() const;
-    const Monomial<dim> &getElement(unsigned int i) const;
+    __host__ __device__ const T &getElement(size_t i) const;
-    Monomial<dim> &getElement(unsigned int i);
+    __host__ __device__ T &getElement(size_t i);
-    const std::vector<Monomial<dim>> &getElements() const;
+    __host__ __device__ const vector_type<T, Args...> &getElements() const;
-    MonomialBasis<dim> getDerivative(Point<dim, unsigned int> differentialOrder) const;
+    __host__ __device__ MonomialBasis<dim, T, vector_type, Args...> getDerivative(Point<dim, unsigned int> differentialOrder) const;
-    bool operator==(const MonomialBasis &other) const;
+    __host__ __device__ bool operator==(const MonomialBasis &other) const;
+    __host__ __device__ vector_type<T, Args...>& getBasis() { return basis; }
     template<typename charT, typename traits>
     friend std::basic_ostream<charT, traits> &
-    operator<<(std::basic_ostream<charT, traits> &lhs, MonomialBasis<dim> const &rhs)
+    operator<<(std::basic_ostream<charT, traits> &lhs, MonomialBasis<dim, T, vector_type, Args...> const &rhs)
         lhs << "MonomialBasis: size=" << rhs.size() << ", elements={ ";
         for (const auto &el : rhs.getElements())
@@ -55,58 +61,60 @@ public:
-    void generateBasis(std::vector<unsigned int> m, unsigned int r);
+    void generateBasis(vector_type<unsigned int, Args...> m, unsigned int r);
 //// Definitions below
-template<unsigned int dim>
-MonomialBasis<dim>::MonomialBasis(const std::vector<unsigned int> &degrees, unsigned int convergenceOrder)
+template<unsigned int dim, typename T, template<typename, template<typename...> class...> class vector_type, template<typename...> class... Args>
+__host__ __device__ MonomialBasis<dim, T, vector_type, Args...>::MonomialBasis(const vector_type<unsigned int, Args...> &degrees, unsigned int convergenceOrder)
     generateBasis(degrees, convergenceOrder);
-template<unsigned int dim>
-MonomialBasis<dim>::MonomialBasis(unsigned int *degrees, unsigned int convergenceOrder)
-        : MonomialBasis(std::vector<unsigned int>(degrees, degrees + dim), convergenceOrder) {}
+template<unsigned int dim, typename T, template<typename, template<typename...> class...> class vector_type, template<typename...> class... Args>
+__host__ __device__ MonomialBasis<dim, T, vector_type, Args...>::MonomialBasis(unsigned int *degrees, unsigned int convergenceOrder)
+        : MonomialBasis(vector_type<unsigned int, Args...>(degrees, degrees + dim), convergenceOrder) {}
-template<unsigned int dim>
-MonomialBasis<dim>::MonomialBasis(const MonomialBasis &other)
+template<unsigned int dim, typename T, template<typename, template<typename...> class...> class vector_type, template<typename...> class... Args>
+__host__ __device__ MonomialBasis<dim, T, vector_type, Args...>::MonomialBasis(const MonomialBasis &other)
-    basis = other.basis; // Here it works because both std::vector and Monomial perform a deep copy.
+    basis = other.basis; // Here it works because both vector_type and Monomial perform a deep copy.
-template<unsigned int dim>
-MonomialBasis<dim> &MonomialBasis<dim>::operator=(const MonomialBasis &other)
+template<unsigned int dim, typename T, template<typename, template<typename...> class...> class vector_type, template<typename...> class... Args>
+__host__ __device__ MonomialBasis<dim, T, vector_type, Args...> &MonomialBasis<dim, T, vector_type, Args...>::operator=(const MonomialBasis &other)
-    basis = other.basis; // Here it works because both std::vector and Monomial perform a deep copy.
+    basis = other.basis; // Here it works because both vector_type and Monomial perform a deep copy.
     return *this;
-template<unsigned int dim>
-unsigned int MonomialBasis<dim>::size() const
+template<unsigned int dim, typename T, template<typename, template<typename...> class...> class vector_type, template<typename...> class... Args>
+__host__ __device__ unsigned int MonomialBasis<dim, T, vector_type, Args...>::size() const
     return basis.size();
-template<unsigned int dim>
-const Monomial<dim> &MonomialBasis<dim>::getElement(unsigned int i) const
+template<unsigned int dim, typename T, template<typename, template<typename...> class...> class vector_type, template<typename...> class... Args>
+__host__ __device__ const T &MonomialBasis<dim, T, vector_type, Args...>::getElement(size_t i) const
-    return basis[i];
+    return basis.get(i);
-template<unsigned int dim>
-Monomial<dim> &MonomialBasis<dim>::getElement(unsigned int i)
+template<unsigned int dim, typename T, template<typename, template<typename...> class...> class vector_type, template<typename...> class... Args>
+__host__ __device__ T &MonomialBasis<dim, T, vector_type, Args...>::getElement(size_t i)
-    return basis[i];
+    return basis.get(i);
-template<unsigned int dim>
-void MonomialBasis<dim>::generateBasis(std::vector<unsigned int> m, unsigned int r)
+template<unsigned int dim, typename T, template<typename, template<typename...> class...> class vector_type, template<typename...> class... Args>
+void MonomialBasis<dim, T, vector_type, Args...>::generateBasis(vector_type<unsigned int, Args...> m, unsigned int r)
     // Compute the vector of actual dimensions to iterate over
     // NOTE: each index can go up to sum(m)+r
-    unsigned int mSum = std::accumulate(m.begin(), m.end(), 0U);
+    unsigned int mSum = 0U;
+    for (size_t i = 0; i < m.size(); ++i) mSum += m.get(i);
     unsigned int orderLimit = mSum + r;
     size_t dimensions[dim];
     std::fill(dimensions, dimensions + dim, orderLimit);
@@ -125,47 +133,55 @@ void MonomialBasis<dim>::generateBasis(std::vector<unsigned int> m, unsigned int
     // Finally compute alpha_min
     unsigned char alphaMin = static_cast<unsigned char>(!(mSum % 2)); // if mSum is even, alpha_min must be 1
+    if(mSum==0)
+    {
+        alphaMin = 0;
+    }
     //std::cout<<"AlphaMin: "<<alphaMin<<std::endl;
     //unsigned char alphaMin = 0; // we want to always have 1 in the basis
     while (it.isNext())
         Point<dim, long int> p = it.get().get_k();
-        Monomial<dim> candidateBasisElement(p);
+        T candidateBasisElement(p);
         // Filter out the elements which don't fullfil the theoretical condition for being in the vandermonde matrix
         if (candidateBasisElement.order() < orderLimit && candidateBasisElement.order() >= alphaMin)
-            basis.push_back(candidateBasisElement);
+            basis.add(candidateBasisElement);
-template<unsigned int dim>
-const std::vector<Monomial<dim>> &MonomialBasis<dim>::getElements() const
+template<unsigned int dim, typename T, template<typename, template<typename...> class...> class vector_type, template<typename...> class... Args>
+__host__ __device__ const vector_type<T, Args...> &MonomialBasis<dim, T, vector_type, Args...>::getElements() const
     return basis;
-template<unsigned int dim>
-MonomialBasis<dim> MonomialBasis<dim>::getDerivative(const Point<dim, unsigned int> differentialOrder) const
+template<unsigned int dim, typename T, template<typename, template<typename...> class...> class vector_type, template<typename...> class... Args>
+__host__ __device__ MonomialBasis<dim, T, vector_type, Args...> MonomialBasis<dim, T, vector_type, Args...>::getDerivative(const Point<dim, unsigned int> differentialOrder) const
-    std::vector<Monomial<dim>> derivatives;
-    for (const auto &monomial : getElements())
+    vector_type<T, Args...> derivatives;
+    for (size_t i = 0; i < basis.size(); ++i)
-        derivatives.push_back(monomial.getDerivative(differentialOrder));
+        // used insted of rhs ref as it does swap internally (not supported by Monomial)
+        T d = basis.get(i).getDerivative(differentialOrder);
+        derivatives.add(d);
-    return MonomialBasis<dim>(derivatives);
+    return MonomialBasis<dim, T, vector_type, Args...>(derivatives);
-template<unsigned int dim>
-bool MonomialBasis<dim>::operator==(const MonomialBasis &other) const
+template<unsigned int dim, typename T, template<typename, template<typename...> class...> class vector_type, template<typename...> class... Args>
+__host__ __device__ bool MonomialBasis<dim, T, vector_type, Args...>::operator==(const MonomialBasis &other) const
     return basis == other.basis;
-//template<unsigned int dim>
-//MonomialBasis<dim>::MonomialBasis(Point<dim, unsigned int> degrees, unsigned int convergenceOrder)
+//template<unsigned int dim, typename T, template<typename, template<typename...> class...> class vector_type, template<typename...> class... Args>
+// __host__ __device__ //MonomialBasis<dim, T, vector_type, Args...>::MonomialBasis(Point<dim, unsigned int> degrees, unsigned int convergenceOrder)
 //        : MonomialBasis(degrees.asArray(), convergenceOrder) {}
diff --git a/src/DCPSE/Support.hpp b/src/DCPSE/Support.hpp
index bcb5bc66..49282ab8 100644
--- a/src/DCPSE/Support.hpp
+++ b/src/DCPSE/Support.hpp
@@ -18,17 +18,22 @@ class Support
     size_t referencePointKey;
-    std::vector<size_t> keys;
+    openfpm::vector_std<size_t> keys;
     Support() {};
-    Support(const size_t &referencePoint, const std::vector<size_t> &keys)
+    Support(const size_t &referencePoint, const openfpm::vector_std<size_t> &keys)
+    Support(const size_t &referencePoint, const std::vector<size_t> &keys)
+            :referencePointKey(referencePoint),
+              keys(keys.begin(), keys.end())
+              {}
     Support(const Support &other)
     : referencePointKey(other.referencePointKey),
@@ -44,10 +49,44 @@ public:
         return referencePointKey;
-    const std::vector<size_t> &getKeys() const
+    const openfpm::vector_std<size_t> &getKeys() const
 	    return keys;
+    openfpm::vector_std<size_t> &getKeys()
+	{
+	    return keys;
+	}
+    static bool pack()
+    {
+        return true;
+    }
+    static bool packRequest()
+    {
+        return true;
+    }
+    template<int ... prp> inline void packRequest(size_t & req) const
+    {
+        req += sizeof(size_t);
+        keys.packRequest(req);
+    }
+    template<int ... prp> inline void pack(ExtPreAlloc<HeapMemory> & mem, Pack_stat & sts) const
+    {
+        Packer<size_t,HeapMemory>::pack(mem,referencePointKey,sts);
+        keys.template pack<prp ...>(mem,sts);
+    }
+    template<unsigned int ... prp, typename MemType> inline void unpack(ExtPreAlloc<MemType> & mem, Unpack_stat & ps)
+    {
+        Unpacker<size_t,MemType>::unpack(mem,referencePointKey,ps);
+        keys.template unpack<prp ...>(mem,ps);
+    }
diff --git a/src/DCPSE/SupportBuilder.cuh b/src/DCPSE/SupportBuilder.cuh
new file mode 100644
index 00000000..363741b7
--- /dev/null
+++ b/src/DCPSE/SupportBuilder.cuh
@@ -0,0 +1,146 @@
+// Created by Serhii
+#include <Space/Shape/Point.hpp>
+#include <Vector/vector_dist.hpp>
+#include "Support.hpp"
+#include <utility>
+#include "SupportBuilder.hpp"
+template <unsigned int dim>
+__device__ __host__ bool nextCell(size_t (&offset)[dim], size_t maxOffset) {
+    size_t i = 0;
+    while (i < dim) {
+        if ((++offset[i++])/maxOffset)
+            for (size_t j = 0; j < i; ++j)
+                offset[j] = 0;
+        else
+            return true;
+    }
+    return false;
+template<unsigned int dim, typename T, typename particles_type, typename CellList_type, typename supportSize_type>
+__global__ void gatherSupportSize_gpu(
+    particles_type particles, CellList_type cl, supportSize_type supportSize, T rCut) {
+    auto p_key = GET_PARTICLE(particles);
+    Point<dim, T> pos = particles.getPos(p_key);
+    auto cell = cl.getCellGrid(pos);
+    size_t grSize[dim]; cl.getGrid().getSize(grSize);
+    size_t offset[dim]; for (int i = 0; i < dim; ++i) offset[i] = 0;    
+    grid_key_dx<dim> middle; for (int i = 0; i < dim; ++i) middle.set_d(i,1);     
+    size_t N = 0;
+    do {
+        auto key=grid_key_dx<dim>(offset); key=cell+key-middle;
+        for (size_t i = 0; i < dim; ++i)
+            if (key.value(i) < 0 || key.value(i) >= grSize[i])
+                continue;
+        mem_id id = cl.getGrid().LinId(key);
+        const size_t cellLinId = static_cast<size_t>(id);
+        const size_t elemsInCell = cl.getNelements(cellLinId);
+        for (size_t k = 0; k < elemsInCell; ++k) {
+            size_t el = cl.get(cellLinId, k);
+            if (p_key == el) continue;
+            if (pos.distance(particles.getPosOrig(el)) < rCut) ++N;
+        }
+    } while (nextCell<dim>(offset, 2+1));
+    supportSize.get(p_key) = N;
+template<unsigned int dim, typename T, typename particles_type, typename CellList_type, typename supportKey_type>
+__global__ void assembleSupport_gpu(particles_type particles, CellList_type cl, supportKey_type supportSize, supportKey_type supportKeys1D, T rCut) {
+    auto p_key = GET_PARTICLE(particles);
+    Point<dim, T> pos = particles.getPos(p_key);
+    auto cell = cl.getCellGrid(pos);
+    size_t  supportKeysSize = supportSize.get(p_key+1)-supportSize.get(p_key);
+    size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[supportSize.get(p_key)];
+    size_t grSize[dim]; cl.getGrid().getSize(grSize);
+    size_t offset[dim]; for (int i = 0; i < dim; ++i) offset[i] = 0;    
+    grid_key_dx<dim> middle; for (int i = 0; i < dim; ++i) middle.set_d(i,1);     
+    size_t N = 0;
+    do {
+        auto key=grid_key_dx<dim>(offset); key=cell+key-middle;
+        for (size_t i = 0; i < dim; ++i)
+            if (key.value(i) < 0 || key.value(i) >= grSize[i])
+                continue;
+        mem_id id = cl.getGrid().LinId(key);
+        const size_t cellLinId = static_cast<size_t>(id);
+        const size_t elemsInCell = cl.getNelements(cellLinId);
+        for (size_t k = 0; k < elemsInCell; ++k) {
+            size_t el = cl.get(cellLinId, k);
+            if (p_key == el) continue;
+            if (pos.distance(particles.getPosOrig(el)) < rCut) supportKeys[N++] = el;
+        }
+    } while (nextCell<dim>(offset, 2+1));
+template<typename vector_type>
+class SupportBuilderGPU
+    vector_type &domain;
+    typename vector_type::stype rCut;
+    SupportBuilderGPU(vector_type &domain, typename vector_type::stype rCut)
+        : domain(domain), rCut(rCut) {}
+    void getSupport(size_t N, openfpm::vector_custd<size_t>& kerOffsets, openfpm::vector_custd<size_t>& supportKeys1D, 
+        size_t& maxSupport, size_t& supportKeysTotalN)
+    {
+        domain.hostToDevicePos();
+        auto it = domain.getDomainIteratorGPU(512);
+        typedef CellList_gen<vector_type::dims, typename vector_type::stype, Process_keys_lin, Mem_fast<CudaMemory>, shift<vector_type::dims, typename vector_type::stype>> params;
+        // auto NN = domain.getCellListGPU(rCut);
+        auto NN = domain.template getCellList<params>(rCut);
+        NN.hostToDevice();
+        // +1 to allow getting size from cumulative sum: "size[i+1] - size[i]"
+        kerOffsets.resize(N+1);
+        gatherSupportSize_gpu<vector_type::dims><<<it.wthr,it.thr>>>(domain.toKernel(), NN.toKernel(), kerOffsets.toKernel(), rCut);
+        kerOffsets.template deviceToHost();
+        supportKeysTotalN = 0; maxSupport = 0;
+        for (size_t i = 0; i < N; ++i) {   
+            size_t sz = kerOffsets.get(i);
+            kerOffsets.get(i) = supportKeysTotalN;
+            supportKeysTotalN += sz;
+            if (maxSupport < sz) maxSupport = sz;
+        }
+        kerOffsets.get(N) = supportKeysTotalN;
+        supportKeys1D.resize(supportKeysTotalN);
+        kerOffsets.template hostToDevice();
+        assembleSupport_gpu<vector_type::dims><<<it.wthr,it.thr>>>(domain.toKernel(), NN.toKernel(), kerOffsets.toKernel(), supportKeys1D.toKernel(), rCut);
+        supportKeys1D.template deviceToHost();
+    }
diff --git a/src/DCPSE/SupportBuilder.hpp b/src/DCPSE/SupportBuilder.hpp
index 7702d434..8d83f7f8 100644
--- a/src/DCPSE/SupportBuilder.hpp
+++ b/src/DCPSE/SupportBuilder.hpp
@@ -16,240 +16,232 @@
 enum support_options
+    RADIUS,
+    LOAD,
-template<typename vector_type>
-class SupportBuilder
+template<typename vector_type,typename vector_type2>
+class SupportBuilder {
-    vector_type &domain;
+    vector_type &domainFrom;
+    vector_type2 &domainTo;
     decltype(std::declval<vector_type>().getCellList(0.0)) cellList;
     const Point<vector_type::dims, unsigned int> differentialSignature;
-    typename vector_type::stype rCut;
+    typename vector_type::stype rCut, MinSpacing, AdapFac=1;
+    bool is_interpolation;
-    SupportBuilder(vector_type &domain, Point<vector_type::dims, unsigned int> differentialSignature, typename vector_type::stype rCut);
-    SupportBuilder(vector_type &domain, unsigned int differentialSignature[vector_type::dims], typename vector_type::stype rCut);
+    SupportBuilder(vector_type &domainFrom, vector_type2 &domainTo,
+                   const Point<vector_type::dims, unsigned int> differentialSignature,
+                   typename vector_type::stype rCut,
+                   bool is_interpolation)
+            : domainFrom(domainFrom),
+              domainTo(domainTo),
+              differentialSignature(differentialSignature),
+              rCut(rCut), is_interpolation(is_interpolation) {
+        cellList = domainFrom.getCellList(rCut);
+    }
+    SupportBuilder(vector_type &domainFrom, vector_type2 &domainTo,
+                   unsigned int differentialSignature[vector_type::dims], typename vector_type::stype rCut,
+                   bool is_interpolation)
+            : SupportBuilder(domainFrom, domainTo, Point<vector_type::dims, unsigned int>(differentialSignature),
+                             rCut) {}
     template<typename iterator_type>
-    Support getSupport(iterator_type itPoint, unsigned int requiredSize, support_options opt)
-    {
+    Support getSupport(iterator_type itPoint, unsigned int requiredSize, support_options opt) {
         // Get spatial position from point iterator
         vect_dist_key_dx p = itPoint.get();
         vect_dist_key_dx pOrig = itPoint.getOrig();
-        Point<vector_type::dims, typename vector_type::stype> pos = domain.getPos(p.getKey());
+        Point<vector_type::dims, typename vector_type::stype> pos = domainTo.getPos(p.getKey());
         // Get cell containing current point and add it to the set of cell keys
-        grid_key_dx<vector_type::dims> curCellKey = cellList.getCellGrid(pos); // Here get the key of the cell where the current point is
+        grid_key_dx<vector_type::dims> curCellKey = cellList.getCellGrid(
+                pos); // Here get the key of the cell where the current point is
         std::set<grid_key_dx<vector_type::dims>> supportCells;
         // Make sure to consider a set of cells providing enough points for the support
-        enlargeSetOfCellsUntilSize(supportCells, requiredSize + 1,opt); // NOTE: this +1 is because we then remove the point itself
+        enlargeSetOfCellsUntilSize(supportCells, requiredSize + 1,
+                                   opt); // NOTE: this +1 is because we then remove the point itself
         // Now return all the points from the support into a vector
-        std::vector<size_t> supportKeys = getPointsInSetOfCells(supportCells,p,pOrig,requiredSize,opt);
-        auto p_o = domain.getOriginKey(p.getKey());
-        std::remove(supportKeys.begin(), supportKeys.end(), p_o.getKey());
-        return Support(p_o.getKey(), supportKeys);
-    }
-    size_t getCellLinId(const grid_key_dx<vector_type::dims> &cellKey);
-    size_t getNumElementsInCell(const grid_key_dx<vector_type::dims> &cellKey);
+        std::vector<size_t> supportKeys = getPointsInSetOfCells(supportCells, p, pOrig, requiredSize, opt);
-    size_t getNumElementsInSetOfCells(const std::set<grid_key_dx<vector_type::dims>> &set);
-    void enlargeSetOfCellsUntilSize(std::set<grid_key_dx<vector_type::dims>> &set, unsigned int requiredSize,support_options opt);
-    std::vector<size_t> getPointsInSetOfCells(std::set<grid_key_dx<vector_type::dims>> set, vect_dist_key_dx & p,  vect_dist_key_dx & pOrig, size_t requiredSupportSize, support_options opt);
+        if (is_interpolation == false) {
+            auto p_o = domainFrom.getOriginKey(p.getKey());
+            std::remove(supportKeys.begin(), supportKeys.end(), p_o.getKey());
+        }
-    bool isCellKeyInBounds(grid_key_dx<vector_type::dims> key);
+        auto p_o = domainTo.getOriginKey(p.getKey());
+        return Support(p_o.getKey(), openfpm::vector_std<size_t>(supportKeys.begin(), supportKeys.end()));
+    }
-// Method definitions below
+    typename vector_type::stype getLastMinspacing() {
+        return this->MinSpacing;
+    }
-template<typename vector_type>
-SupportBuilder<vector_type>::SupportBuilder(vector_type &domain, const Point<vector_type::dims, unsigned int> differentialSignature,
-                                            typename vector_type::stype rCut)
- differentialSignature(differentialSignature),
- rCut(rCut)
-    cellList = domain.getCellList(rCut);
+    void setAdapFac(typename vector_type::stype fac) {
+        this->AdapFac=fac;
+    }
-template<typename vector_type>
-size_t SupportBuilder<vector_type>::getNumElementsInCell(const grid_key_dx<vector_type::dims> &cellKey)
-    const size_t curCellId = getCellLinId(cellKey);
-    size_t numElements = cellList.getNelements(curCellId);
-    return numElements;
+    size_t getCellLinId(const grid_key_dx<vector_type::dims> &cellKey) {
+        mem_id id = cellList.getGrid().LinId(cellKey);
+        return static_cast<size_t>(id);
+    }
-template<typename vector_type>
-size_t SupportBuilder<vector_type>::getNumElementsInSetOfCells(const std::set<grid_key_dx<vector_type::dims>> &set)
-    size_t tot = 0;
-    for (const auto cell : set)
-    {
-        tot += getNumElementsInCell(cell);
+    size_t getNumElementsInCell(const grid_key_dx<vector_type::dims> &cellKey) {
+        const size_t curCellId = getCellLinId(cellKey);
+        size_t numElements = cellList.getNelements(curCellId);
+        return numElements;
-    return tot;
-template<typename vector_type>
-void SupportBuilder<vector_type>::enlargeSetOfCellsUntilSize(std::set<grid_key_dx<vector_type::dims>> &set, unsigned int requiredSize,
-        support_options opt)
-    if (opt==support_options::RADIUS){
-        auto cell=*set.begin();
-        grid_key_dx<vector_type::dims> middle;
-        int n=std::ceil(rCut/cellList.getCellBox().getHigh(0));
-        size_t sz[vector_type::dims];
-        for (int i=0;i<vector_type::dims;i++)
-        {
-            sz[i]=2*n+1;
-            middle.set_d(i,n);
-        }
-        grid_sm<vector_type::dims,void> g(sz);
-        grid_key_dx_iterator<vector_type::dims> g_k(g);
-        while(g_k.isNext())
-        {
-            auto key=g_k.get();
-            key=cell+key-middle;
-            if (isCellKeyInBounds(key))
-            {
-                set.insert(key);
-            }
-            ++g_k;
+    size_t getNumElementsInSetOfCells(const std::set<grid_key_dx<vector_type::dims>> &set) {
+        size_t tot = 0;
+        for (const auto cell: set) {
+            tot += getNumElementsInCell(cell);
+        return tot;
-    else{
-        while (getNumElementsInSetOfCells(set) < 5.0*requiredSize) //Why 5*requiredSize? Becasue it can help with adaptive resolutions.
-        {
-            auto tmpSet = set;
-            for (const auto el : tmpSet)
+    void enlargeSetOfCellsUntilSize(std::set<grid_key_dx<vector_type::dims>> &set, unsigned int requiredSize,
+                                    support_options opt) {
+        if (opt == support_options::RADIUS || opt == support_options::ADAPTIVE) {
+            auto cell = *set.begin();
+            grid_key_dx<vector_type::dims> middle;
+            int n = std::ceil(rCut / cellList.getCellBox().getHigh(0));
+            size_t sz[vector_type::dims];
+            for (int i = 0; i < vector_type::dims; i++) {
+                sz[i] = 2 * n + 1;
+                middle.set_d(i, n);
+            }
+            grid_sm<vector_type::dims, void> g(sz);
+            grid_key_dx_iterator<vector_type::dims> g_k(g);
+            while (g_k.isNext()) {
+                auto key = g_k.get();
+                key = cell + key - middle;
+                if (isCellKeyInBounds(key)) {
+                    set.insert(key);
+                }
+                ++g_k;
+            }
+        } else {
+            while (getNumElementsInSetOfCells(set) <
+                   5.0 * requiredSize) //Why 5*requiredSize? Becasue it can help with adaptive resolutions.
-                for (unsigned int i = 0; i < vector_type::dims; ++i)
-                {
-                    const auto pOneEl = el.move(i, +1);
-                    const auto mOneEl = el.move(i, -1);
-                    if (isCellKeyInBounds(pOneEl))
-                    {
-                        set.insert(pOneEl);
-                    }
-                    if (isCellKeyInBounds(mOneEl))
-                    {
-                        set.insert(mOneEl);
+                auto tmpSet = set;
+                for (const auto el: tmpSet) {
+                    for (unsigned int i = 0; i < vector_type::dims; ++i) {
+                        const auto pOneEl = el.move(i, +1);
+                        const auto mOneEl = el.move(i, -1);
+                        if (isCellKeyInBounds(pOneEl)) {
+                            set.insert(pOneEl);
+                        }
+                        if (isCellKeyInBounds(mOneEl)) {
+                            set.insert(mOneEl);
+                        }
-            }
+            }
-template<typename vector_type>
-size_t SupportBuilder<vector_type>::getCellLinId(const grid_key_dx<vector_type::dims> &cellKey)
-    mem_id id = cellList.getGrid().LinId(cellKey);
-    return static_cast<size_t>(id);
-template<typename vector_type>
-std::vector<size_t> SupportBuilder<vector_type>::getPointsInSetOfCells(std::set<grid_key_dx<vector_type::dims>> set,
-																		vect_dist_key_dx & p,
-																		vect_dist_key_dx & pOrig,
-																		size_t requiredSupportSize,
-																		support_options opt)
-    struct reord
-    {
-        typename vector_type::stype dist;
-        size_t offset;
-        bool operator<(const reord & p) const
-        {return this->dist < p.dist;}
-    };
-    openfpm::vector<reord> rp;
-    std::vector<size_t> points;
-    Point<vector_type::dims,typename vector_type::stype> xp = domain.getPos(p);
-    for (const auto cellKey : set)
-    {
-        const size_t cellLinId = getCellLinId(cellKey);
-        const size_t elemsInCell = getNumElementsInCell(cellKey);
-        for (size_t k = 0; k < elemsInCell; ++k)
-        {
-            size_t el = cellList.get(cellLinId, k);
-            if (pOrig.getKey() == el)   {continue;}
-            Point<vector_type::dims,typename vector_type::stype> xq = domain.getPosOrig(el);
-            //points.push_back(el);
-            reord pr;
-            pr.dist = xp.distance(xq);
-            pr.offset = el;
-            rp.add(pr);
+    std::vector<size_t> getPointsInSetOfCells(std::set<grid_key_dx<vector_type::dims>> set,
+                                              vect_dist_key_dx &p,
+                                              vect_dist_key_dx &pOrig,
+                                              size_t requiredSupportSize,
+                                              support_options opt) {
+        struct reord {
+            typename vector_type::stype dist;
+            size_t offset;
+            bool operator<(const reord &p) const { return this->dist < p.dist; }
+        };
+        openfpm::vector<reord> rp;
+        std::vector<size_t> points;
+        Point<vector_type::dims, typename vector_type::stype> xp = domainTo.getPos(p);
+        for (const auto cellKey: set) {
+            const size_t cellLinId = getCellLinId(cellKey);
+            const size_t elemsInCell = getNumElementsInCell(cellKey);
+            for (size_t k = 0; k < elemsInCell; ++k) {
+                size_t el = cellList.get(cellLinId, k);
+                if (pOrig.getKey() == el && is_interpolation == false) { continue; }
+                Point<vector_type::dims, typename vector_type::stype> xq = domainFrom.getPosOrig(el);
+                //points.push_back(el);
+                reord pr;
+                pr.dist = xp.distance(xq);
+                pr.offset = el;
+                rp.add(pr);
+            }
-    }
-    if (opt == support_options::RADIUS)
-    {
-		for (int i = 0 ; i < rp.size() ; i++)
-		{
-			if (rp.get(i).dist < rCut)
-			{
-				points.push_back(rp.get(i).offset);
-			}
-		}
-/*        #ifdef SE_CLASS1
-		if (points.size()<requiredSupportSize)
-        {
-		    std::cerr<<__FILE__<<":"<<__LINE__<<"Note that the DCPSE neighbourhood doesn't have asked no. particles (Increase the rCut or reduce the over_sampling factor)";
-		    std::cout<<"Particels asked (minimum*oversampling_factor): "<<requiredSupportSize<<". Particles Possible with given options:"<<points.size()<<"."<<std::endl;
+        if (opt == support_options::RADIUS) {
+            for (int i = 0; i < rp.size(); i++) {
+                if (rp.get(i).dist < rCut) {
+                    points.push_back(rp.get(i).offset);
+                }
+            }
+            /*      #ifdef SE_CLASS1
+                    if (points.size()<requiredSupportSize)
+                    {
+                        std::cerr<<__FILE__<<":"<<__LINE__<<"Note that the DCPSE neighbourhood doesn't have asked no. particles (Increase the rCut or reduce the over_sampling factor)";
+                        std::cout<<"Particels asked (minimum*oversampling_factor): "<<requiredSupportSize<<". Particles Possible with given options:"<<points.size()<<"."<<std::endl;
+                    }
+                    #endif*/
-        #endif*/
-    }
-    else
-    {   rp.sort();
-        for (int i = 0 ; i < requiredSupportSize ; i++)
-		{
-			points.push_back(rp.get(i).offset);
-		}
-    }
-    return points;
+        else if(opt == support_options::ADAPTIVE) {
+            MinSpacing = std::numeric_limits<double>::max();
+            for (int i = 0; i < rp.size(); i++) {
+                if (MinSpacing > rp.get(i).dist && rp.get(i).dist != 0) {
+                    MinSpacing = rp.get(i).dist;
+                }
+            }
+#ifdef SE_CLASS1
+        assert(MinSpacing !=0 && "You have multiple particles on the same position.");
+            for (int i = 0; i < rp.size(); i++) {
+                if (rp.get(i).dist < AdapFac * MinSpacing) {
+                    points.push_back(rp.get(i).offset);
+                }
+            }
+        }
+        else {
+            rp.sort();
+            for (int i = 0; i < requiredSupportSize; i++) {
+                    points.push_back(rp.get(i).offset);
+                }
+            }
-template<typename vector_type>
-SupportBuilder<vector_type>::SupportBuilder(vector_type &domain, unsigned int *differentialSignature, typename vector_type::stype rCut)
-        : SupportBuilder(domain, Point<vector_type::dims, unsigned int>(differentialSignature), rCut) {}
+        //MinSpacing=MinSpacing/requiredSupportSize
+        return points;
+    }
-template<typename vector_type>
-bool SupportBuilder<vector_type>::isCellKeyInBounds(grid_key_dx<vector_type::dims> key)
-    const size_t *cellGridSize = cellList.getGrid().getSize();
-    for (size_t i = 0; i < vector_type::dims; ++i)
+    bool isCellKeyInBounds(grid_key_dx<vector_type::dims> key)
-        if (key.value(i) < 0 || key.value(i) >= cellGridSize[i])
+        const size_t *cellGridSize = cellList.getGrid().getSize();
+        for (size_t i = 0; i < vector_type::dims; ++i)
-            return false;
+            if (key.value(i) < 0 || key.value(i) >= cellGridSize[i])
+            {
+                return false;
+            }
+        return true;
-    return true;
diff --git a/src/DCPSE/Vandermonde.hpp b/src/DCPSE/Vandermonde.hpp
index f88f6224..c527200f 100644
--- a/src/DCPSE/Vandermonde.hpp
+++ b/src/DCPSE/Vandermonde.hpp
@@ -1,6 +1,6 @@
 // Created by tommaso on 21/03/19.
+// Edited by Abhinav Singh on 24/01/2022
@@ -14,22 +14,24 @@ class Vandermonde
     const Point<dim, T> point;
-    std::vector<Point<dim, T>> offsets;
+    openfpm::vector_std<Point<dim, T>> offsets;
     const MonomialBasis<dim> monomialBasis;
-    T eps;
+    T eps,HOverEpsilon,minSpacing;
 /*    Vandermonde(const Point<dim, T> &point, const std::vector<Point<dim, T>> &neighbours,
                 const MonomialBasis<dim> &monomialBasis);*/
-    template<typename vector_type>
+    template<typename vector_type,
+             typename vector_type2>
     Vandermonde(const Support &support,
                 const MonomialBasis<dim> &monomialBasis,
-                const vector_type & particles)
-    : point(particles.getPosOrig(support.getReferencePointKey())),
-                  monomialBasis(monomialBasis)
+                const vector_type & particlesFrom,
+                const vector_type2 & particlesTo,T HOverEpsilon=0.5)    //0.5 for the test
+    : point(particlesTo.getPosOrig(support.getReferencePointKey())),
+                  monomialBasis(monomialBasis),HOverEpsilon(HOverEpsilon)
-        initialize(support,particles);
+        initialize(support,particlesFrom,particlesTo);
@@ -38,8 +40,11 @@ public:
         // Build the Vandermonde matrix, row-by-row
         VandermondeRowBuilder<dim, T> vrb(monomialBasis);
         unsigned int row = 0;
-        for (auto &offset : offsets)
+        size_t N = offsets.size();
+        for (size_t i = 0; i < N; ++i)
+            const auto& offset = offsets.get(i);
             vrb.buildRow(M, row, offset, eps);
@@ -50,6 +55,10 @@ public:
         return eps;
+    T getMinSpacing()
+    {
+        return minSpacing;
+    }
@@ -57,12 +66,20 @@ private:
     void computeEps(T factor)
         T avgNeighbourSpacing = 0;
-        for (auto &offset : offsets)
+        minSpacing=std::numeric_limits<T>::max();
+        size_t N = offsets.size();
+        for (size_t i = 0; i < N; ++i)
+            const auto& offset = offsets.get(i);
+            double dist=norm(offset);
             avgNeighbourSpacing += computeAbsSum(offset);
+            if(minSpacing>dist)
+            {
+                minSpacing=dist;
+            }
         avgNeighbourSpacing /= offsets.size();
-        eps = factor * avgNeighbourSpacing;
+        eps = avgNeighbourSpacing/factor;
         assert(eps != 0);
@@ -76,16 +93,16 @@ private:
         return absSum;
-    template<typename vector_type>
-    void initialize(const Support &sup, const vector_type & particles)
+    template<typename vector_type, typename vector_type2>
+    void initialize(const Support &sup, const vector_type & particlesFrom, vector_type2 &particlesTo)
     	auto & keys = sup.getKeys();
     	for (int i = 0 ; i < keys.size() ; i++)
-    		Point<dim,T> p = particles.getPosOrig(sup.getReferencePointKey());
-    		p -= particles.getPosOrig(keys[i]);
-    		offsets.push_back(p);
+    		Point<dim,T> p = particlesTo.getPosOrig(sup.getReferencePointKey());
+            p -= particlesFrom.getPosOrig(keys.get(i));
+            offsets.add(p);
         // First check that the number of points given is enough for building the Vandermonde matrix
@@ -95,7 +112,7 @@ private:
         // Compute eps for this point
         //factor here. This is C factor.
-        computeEps(2);
+        computeEps(HOverEpsilon);
diff --git a/src/DCPSE/VandermondeRowBuilder.hpp b/src/DCPSE/VandermondeRowBuilder.hpp
index 5b893aea..2de39fa0 100644
--- a/src/DCPSE/VandermondeRowBuilder.hpp
+++ b/src/DCPSE/VandermondeRowBuilder.hpp
@@ -1,5 +1,6 @@
 // Created by tommaso on 22/03/19.
+// Modified by Serhii
@@ -7,31 +8,32 @@
 #include "MonomialBasis.hpp"
-template <unsigned int dim, typename T>
+template <unsigned int dim, typename T, typename MonomialBasis_type = MonomialBasis<dim>>
 class VandermondeRowBuilder
-    const MonomialBasis<dim> monomialBasis;
+    const MonomialBasis_type& monomialBasis;
-    VandermondeRowBuilder(const MonomialBasis<dim> &monomialBasis) : monomialBasis(monomialBasis) {}
+    VandermondeRowBuilder(const MonomialBasis_type &monomialBasis) : monomialBasis(monomialBasis) {}
     template <typename MatrixType>
     void buildRow(MatrixType &M, unsigned int row, Point<dim, T> x, T eps);
-template<unsigned int dim, typename T>
+template<unsigned int dim, typename T,typename MonomialBasis_type>
 template <typename MatrixType>
-void VandermondeRowBuilder<dim, T>::buildRow(MatrixType &M, unsigned int row, Point<dim, T> x, T eps)
+void VandermondeRowBuilder<dim, T, MonomialBasis_type>::buildRow(MatrixType &M, unsigned int row, Point<dim, T> x, T eps)
-    unsigned int col = 0;
-    for (auto& basisElement : monomialBasis.getElements())
+    auto& basisElements = monomialBasis.getElements();
+    for (size_t col = 0; col < basisElements.size(); ++col)
-        Monomial<dim> m = monomialBasis.getElement(col);
+        Monomial<dim> m = basisElements.get(col);
         M(row, col) = m.evaluate(x);
         M(row, col) /= openfpm::math::intpowlog(eps, m.order());
-        ++col;
diff --git a/src/DCPSE/tests/Support_unit_tests.cpp b/src/DCPSE/tests/Support_unit_tests.cpp
index 86fc3a5c..cb892c0b 100644
--- a/src/DCPSE/tests/Support_unit_tests.cpp
+++ b/src/DCPSE/tests/Support_unit_tests.cpp
@@ -53,18 +53,12 @@ BOOST_AUTO_TEST_SUITE(Support_tests)
         // Get spatial position from point iterator
         vect_dist_key_dx p = itPoint.get();
         const auto pos = domain.getPos(p.getKey());
-        //std::cout << "p=(" << pos[0] << "," << pos[1] << ")" << std::endl;
-//        BOOST_REQUIRE_CLOSE(pos[0], 0, 1e-16);
-//        BOOST_REQUIRE_CLOSE(pos[1], 0, 1e-16);
-        // Now that domain is built and populated, let's test SupportBuilder
-        // We use (0,0) as initial point
-        SupportBuilder<vector_dist<2, double, aggregate<double>>> supportBuilder(domain, {1,0}, 2*spacing[0]);
+        typedef vector_dist<2, double, aggregate<double>> vector_dist_type;
+        SupportBuilder<vector_dist_type,vector_dist_type> supportBuilder(domain, domain, {1,0}, 2*spacing[0],false);
         auto support = supportBuilder.getSupport(itPoint, 6, support_options::N_PARTICLES);
-//        for (const auto &off : support.getOffsets())
-//        {
-//            std::cout << off.toString() << std::endl;
-//        }
         BOOST_REQUIRE_GE(support.size(), 6);
@@ -113,25 +107,15 @@ BOOST_AUTO_TEST_SUITE(Support_tests)
         // Get spatial position from point iterator
         vect_dist_key_dx p = itPoint.get();
         const auto pos = domain.getPos(p.getKey());
-        //std::cout << "p=(" << pos[0] << "," << pos[1] << ")" << std::endl;
-//        BOOST_REQUIRE_CLOSE(pos[0], 0, 1e-16);
-//        BOOST_REQUIRE_CLOSE(pos[1], 0, 1e-16);
-        // Now that domain is built and populated, let's test SupportBuilder
-        // We use (0,0) as initial point
-        SupportBuilder<vector_dist<2, double, aggregate<double>>> supportBuilder(domain, {2,2}, 2*spacing[0]);
+        typedef vector_dist<2, double, aggregate<double>> vector_dist_type;
+        SupportBuilder<vector_dist_type,vector_dist_type> supportBuilder(domain, domain, {2,2}, 2*spacing[0],false);
         auto supportPoints = supportBuilder.getSupport(itPoint, 20, support_options::N_PARTICLES);
-//        for (const auto &k : supportPoints)
-//        {
-//            Point<2, double> pt = domain.getPos(k);
-//            std::cout << pt.toString() << std::endl;
-//        }
         BOOST_REQUIRE_GE(supportPoints.size(), 20);
-//    BOOST_AUTO_TEST_CASE(Support_CopyConstructor_test)
-//    {
-//    }
diff --git a/src/DCPSE/tests/Vandermonde_unit_tests.cpp b/src/DCPSE/tests/Vandermonde_unit_tests.cpp
index 295b5505..dd133d66 100644
--- a/src/DCPSE/tests/Vandermonde_unit_tests.cpp
+++ b/src/DCPSE/tests/Vandermonde_unit_tests.cpp
@@ -109,7 +109,7 @@ BOOST_AUTO_TEST_SUITE(Vandermonde_tests)
         Support s(0,keys);
         // ...and get the matrix V
-        Vandermonde<2, double, EMatrix<double, Eigen::Dynamic, Eigen::Dynamic>> vandermonde(s, mb, parts);
+        Vandermonde<2, double, EMatrix<double, Eigen::Dynamic, Eigen::Dynamic>> vandermonde(s, mb, parts,parts);
         // Now build the matrix of expected values
@@ -201,7 +201,7 @@ BOOST_AUTO_TEST_SUITE(Vandermonde_tests)
         Support s(0,keys);
         // ...and get the matrix V
-        Vandermonde<2, double, EMatrix<double, Eigen::Dynamic, Eigen::Dynamic>> vandermonde(s, mb, parts);
+        Vandermonde<2, double, EMatrix<double, Eigen::Dynamic, Eigen::Dynamic>> vandermonde(s, mb, parts,parts);
         // Now build the matrix of expected values
diff --git a/src/FiniteDifference/FD_expressions.hpp b/src/FiniteDifference/FD_expressions.hpp
index 89b6b822..d3499a02 100644
--- a/src/FiniteDifference/FD_expressions.hpp
+++ b/src/FiniteDifference/FD_expressions.hpp
@@ -8,9 +8,34 @@
+template<typename T, typename Sfinae = void>
+struct has_getGrid: std::false_type {};
+template<typename T>
+struct has_getGrid<T, typename Void<decltype(std::declval<T>().getGrid())>::type > : std::true_type
 namespace FD
+	template<bool cond, typename exp1, typename exp2>
+	struct first_or_second
+	{
+		static auto getGrid(const exp1 & o1, const exp2 & o2) -> decltype(o2.getGrid())
+		{
+			return o2.getGrid();
+		}
+	};
+	template<typename exp1, typename exp2>
+	struct first_or_second<true,exp1,exp2>
+	{
+		static auto getGrid(const exp1 & o1, const exp2 & o2) -> decltype(o1.getGrid())
+		{
+			return o1.getGrid();
+		}
+	};
 	constexpr int NORM_EXPRESSION = 0;
 	constexpr int STAG_EXPRESSION = 1;
 	constexpr int GRID_COMP = 2;
@@ -26,22 +51,22 @@ namespace FD
 	struct grid_dist_expression_value_impl_func_scal
 		template<unsigned int prp, typename base_type, typename gtype>
-		static void inte(gtype & g, grid_dist_key_dx<gtype::dims> & k, comb<gtype::dims> & c_where, comb<gtype::dims> & c_o1, base_type & inte_out, int & c, int comp)
+		static void inte(gtype & g, grid_dist_key_dx<gtype::dims> & k, comb<gtype::dims> & c_where, comb<gtype::dims> & c_o1, base_type & inte_out, int & c)
 			if (c_where[i] != c_o1[i])
 				int sign = (c_where[i] > c_o1[i])?1:-1;
-				grid_dist_expression_value_impl_func_scal<i-1>::template inte<prp,base_type>(g,k,c_where,c_o1,inte_out,c,comp);
+				grid_dist_expression_value_impl_func_scal<i-1>::template inte<prp,base_type>(g,k,c_where,c_o1,inte_out,c);
 				long int x0 = k.getKeyRef().get(i);
 				k.getKeyRef().set_d(i, x0 + sign);
-				grid_dist_expression_value_impl_func_scal<i-1>::template inte<prp,base_type>(g,k,c_where,c_o1,inte_out,c,comp);
+				grid_dist_expression_value_impl_func_scal<i-1>::template inte<prp,base_type>(g,k,c_where,c_o1,inte_out,c);
 				k.getKeyRef().set_d(i, x0);
-				grid_dist_expression_value_impl_func_scal<i-1>::template inte<prp,base_type>(g,k,c_where,c_o1,inte_out,c,comp);
+				grid_dist_expression_value_impl_func_scal<i-1>::template inte<prp,base_type>(g,k,c_where,c_o1,inte_out,c);
@@ -50,7 +75,7 @@ namespace FD
 	struct grid_dist_expression_value_impl_func_scal<0>
 		template<unsigned int prp, typename base_type, typename gtype>
-		static void inte(gtype & g, grid_dist_key_dx<gtype::dims> & k, comb<gtype::dims> & c_where, comb<gtype::dims> & c_o1, base_type & inte_out , int & c , int comp)
+		static void inte(gtype & g, grid_dist_key_dx<gtype::dims> & k, comb<gtype::dims> & c_where, comb<gtype::dims> & c_o1, base_type & inte_out , int & c)
 			if (c_where[0] != c_o1[0])
@@ -91,12 +116,37 @@ namespace FD
 			return inte;
+		template<unsigned int prp, typename gtype>
+		static base_type inte(gtype & g, grid_dist_key_dx<gtype::dims> & k, comb<gtype::dims> & c_where, comb<gtype::dims> & c_o1)
+		{
+			int c = 0;
+			base_type inte = 0;
+			grid_dist_expression_value_impl_func_scal<gtype::dims-1>::template inte<prp,base_type>(g,k,c_where,c_o1,inte,c);
+        	inte /= c;
+			return inte;
+		}
+		template<unsigned int prp, typename gtype>
+		static base_type value_n(gtype & g, const grid_dist_key_dx<gtype::dims> & k)
+		{
+        	return g.template getProp<prp>(k);
+		}
 		template<unsigned int prp, typename gtype>
 		static base_type value_n(gtype & g, const grid_dist_key_dx<gtype::dims> & k, int comp)
         	return g.template getProp<prp>(k);
+		template<unsigned int prp, typename gtype>
+		static auto value_ref(gtype & g, const grid_dist_key_dx<gtype::dims> & k) -> decltype(g.template getProp<prp>(k))
+		{
+        	return g.template getProp<prp>(k);
+		}
 		template<unsigned int prp, typename gtype>
 		static auto value_ref(gtype & g, const grid_dist_key_dx<gtype::dims> & k, int comp) -> decltype(g.template getProp<prp>(k))
@@ -109,7 +159,7 @@ namespace FD
 	struct grid_dist_expression_value_impl_func_vec
 		template<unsigned int prp, typename base_type, typename gtype>
-		static void inte(gtype & g, grid_dist_key_dx<gtype::dims> & k, comb<gtype::dims> & c_where, comb<gtype::dims> & c_o1, base_type & inte_out, int & c, int comp)
+		static void inte(gtype & g, grid_dist_key_dx<gtype::dims> & k, comb<gtype::dims> & c_where, comb<gtype::dims> & c_o1, base_type & inte_out, int & c, const int (& comp)[1])
 			if (c_where[i] != c_o1[i])
@@ -133,24 +183,24 @@ namespace FD
 	struct grid_dist_expression_value_impl_func_vec<0>
 		template<unsigned int prp, typename base_type, typename gtype>
-		static void inte(gtype & g, grid_dist_key_dx<gtype::dims> & k, comb<gtype::dims> & c_where, comb<gtype::dims> & c_o1, base_type & inte_out , int & c , int comp)
+		static void inte(gtype & g, grid_dist_key_dx<gtype::dims> & k, comb<gtype::dims> & c_where, comb<gtype::dims> & c_o1, base_type & inte_out , int & c , const int (& comp)[1])
 			if (c_where[0] != c_o1[0])
 				int sign = (c_where[0] > c_o1[0])?1:-1;
-				inte_out += g.template getProp<prp>(k)[comp];
+				inte_out += g.template getProp<prp>(k)[comp[0]];
 				long int x0 = k.getKeyRef().get(0);
 				k.getKeyRef().set_d(0, x0 + sign);
-				inte_out += g.template getProp<prp>(k)[comp];
+				inte_out += g.template getProp<prp>(k)[comp[0]];
 				k.getKeyRef().set_d(0, x0);
 				c += 2;
-				inte_out += g.template getProp<prp>(k)[comp];
+				inte_out += g.template getProp<prp>(k)[comp[0]];
 				c += 1;
@@ -162,7 +212,7 @@ namespace FD
 		typedef base_type type;
 		template<unsigned int prp, typename gtype>
-		static base_type inte(gtype & g, grid_dist_key_dx<gtype::dims> & k, comb<gtype::dims> & c_where, comb<gtype::dims> & c_o1, int comp)
+		static base_type inte(gtype & g, grid_dist_key_dx<gtype::dims> & k, comb<gtype::dims> & c_where, comb<gtype::dims> & c_o1, const int (& comp)[1])
 			int c = 0;
 			base_type inte = 0;
@@ -178,15 +228,142 @@ namespace FD
 		template<unsigned int prp, typename gtype>
-		static base_type value_n(gtype & g, const grid_dist_key_dx<gtype::dims> & k, int comp)
+		static base_type value_n(gtype & g, const grid_dist_key_dx<gtype::dims> & k)
+		{
+			printf("Error wrong expression please check the components");
+        	return g.template getProp<prp>(k)[0];
+		}
+		template<unsigned int prp, typename gtype>
+		static base_type value_n(gtype & g, const grid_dist_key_dx<gtype::dims> & k, const int (& comp)[1])
+		{
+        	return g.template getProp<prp>(k)[comp[0]];
+		}
+		template<unsigned int prp, typename gtype>
+		static auto value_ref(gtype & g, const grid_dist_key_dx<gtype::dims> & k) -> decltype(g.template getProp<prp>(k)[0])
+		{
+			printf("Error wrong expression please check the components");
+        	return g.template getProp<prp>(k)[0];
+		}
+		template<unsigned int prp, typename gtype>
+		static auto value_ref(gtype & g, const grid_dist_key_dx<gtype::dims> & k, const int (& comp)[1]) -> decltype(g.template getProp<prp>(k)[comp[0]])
+		{
+        	return g.template getProp<prp>(k)[comp[0]];
+		}
+	};
+	template<typename base_type, unsigned int N1,unsigned int N2>
+	struct grid_dist_expression_value_impl<base_type[N1][N2]>
+	{
+		typedef base_type type;
+		template<unsigned int prp, typename gtype>
+		static base_type inte(gtype & g, grid_dist_key_dx<gtype::dims> & k, comb<gtype::dims> & c_where, comb<gtype::dims> & c_o1, const int (& comp)[2])
+		{
+			int c = 0;
+			base_type inte = 0;
+        	grid_dist_expression_value_impl_func_vec<gtype::dims-1>::template inte<prp,base_type>(g,k,c_where,c_o1,inte,c,comp);
+        	if (c != 0)
+        	{inte /= c;}
+        	else
+        	{inte = g.template getProp<prp>(k)[comp[0]][comp[1]];}
+			return inte;
+		}
+		template<unsigned int prp, typename gtype>
+		static base_type value_n(gtype & g, const grid_dist_key_dx<gtype::dims> & k)
+		{
+			printf("Error wrong expression please check the components");
+        	return g.template getProp<prp>(k)[0][0];
+		}
+		template<unsigned int prp, typename gtype>
+		static base_type value_n(gtype & g, const grid_dist_key_dx<gtype::dims> & k, const int (& comp)[2])
+		{
+        	return g.template getProp<prp>(k)[comp[0]][comp[1]];
+		}
+		template<unsigned int prp, typename gtype>
+		static auto value_ref(gtype & g, const grid_dist_key_dx<gtype::dims> & k) -> decltype(g.template getProp<prp>(k)[0][0])
+		{
+			printf("Error wrong expression please check the components");
+        	return g.template getProp<prp>(k)[0][0];
+		}
+		template<unsigned int prp, typename gtype>
+		static auto value_ref(gtype & g, const grid_dist_key_dx<gtype::dims> & k, const int (& comp)[2]) -> decltype(g.template getProp<prp>(k)[0][0])
+		{
+        	return g.template getProp<prp>(k)[comp[0]][comp[1]];
+		}
+	};
+	template<typename base_type, unsigned int N1,unsigned int N2, unsigned int N3>
+	struct grid_dist_expression_value_impl<base_type[N1][N2][N3]>
+	{
+		typedef base_type type;
+		template<unsigned int prp, typename gtype>
+		static base_type inte(gtype & g, grid_dist_key_dx<gtype::dims> & k, comb<gtype::dims> & c_where, comb<gtype::dims> & c_o1, const int (& comp)[3])
+		{
+			int c = 0;
+			base_type inte = 0;
+        	grid_dist_expression_value_impl_func_vec<gtype::dims-1>::template inte<prp,base_type>(g,k,c_where,c_o1,inte,c,comp);
+        	if (c != 0)
+        	{inte /= c;}
+        	else
+        	{inte = g.template getProp<prp>(k)[comp[0]][comp[1]][comp[2]];}
+			return inte;
+		}
+		template<unsigned int prp, typename gtype>
+		static base_type value_n(gtype & g, const grid_dist_key_dx<gtype::dims> & k)
+		{
+			printf("Error wrong expression please check the components");
+        	return g.template getProp<prp>(k)[0][0][0];
+		}
+		template<unsigned int prp, typename gtype>
+		static base_type value_n(gtype & g, const grid_dist_key_dx<gtype::dims> & k, const int (& comp)[2])
+		{
+			printf("Error wrong expression please check the components");
+        	return g.template getProp<prp>(k)[0][comp[0]][comp[1]];
+		}
+		template<unsigned int prp, typename gtype>
+		static base_type value_n(gtype & g, const grid_dist_key_dx<gtype::dims> & k, const int (& comp)[3])
+		{
+        	return g.template getProp<prp>(k)[comp[0]][comp[1]][comp[2]];
+		}
+		template<unsigned int prp, typename gtype>
+		static auto value_ref(gtype & g, const grid_dist_key_dx<gtype::dims> & k) -> decltype(g.template getProp<prp>(k)[0][0][0])
+		{
+			printf("Error wrong expression please check the components");
+        	return g.template getProp<prp>(k)[0][0][0];
+		}
+		template<unsigned int prp, typename gtype>
+		static auto value_ref(gtype & g, const grid_dist_key_dx<gtype::dims> & k, const int (& comp)[2]) -> decltype(g.template getProp<prp>(k)[0][0][0])
-        	return g.template getProp<prp>(k)[comp];
+			printf("Error wrong expression please check the components");
+        	return g.template getProp<prp>(k)[0][comp[1]][comp[0]];
 		template<unsigned int prp, typename gtype>
-		static auto value_ref(gtype & g, const grid_dist_key_dx<gtype::dims> & k, int comp) -> decltype(g.template getProp<prp>(k)[comp])
+		static auto value_ref(gtype & g, const grid_dist_key_dx<gtype::dims> & k, const int (& comp)[3]) -> decltype(g.template getProp<prp>(k)[0][0][0])
-        	return g.template getProp<prp>(k)[comp];
+        	return g.template getProp<prp>(k)[comp[0]][comp[1]][comp[2]];
@@ -196,8 +373,10 @@ namespace FD
 		typedef base_type type;
 		template<unsigned int prp, typename gtype>
-		static base_type inte(gtype & g, grid_dist_key_dx<gtype::dims> & k, comb<gtype::dims> & c_where, comb<gtype::dims> & c_o1, int comp)
+		static base_type inte(gtype & g, const grid_dist_key_dx<gtype::dims> & k, comb<gtype::dims> & c_where, comb<gtype::dims> & c_o1)
+			int comp[1];
+			printf("Error wrong expression please check the components");
 			int c = 0;
 			base_type inte = 0;
@@ -206,21 +385,53 @@ namespace FD
         	if (c != 0)
         	{inte /= c;}
-        	{inte = g.template getProp<prp>(k)[comp];}
+        	{inte = g.template getProp<prp>(k)[0];}
 			return inte;
 		template<unsigned int prp, typename gtype>
-		static base_type value_n(gtype & g, const grid_dist_key_dx<gtype::dims> & k, int comp)
+		static base_type inte(gtype & g, const grid_dist_key_dx<gtype::dims> & k, comb<gtype::dims> & c_where, comb<gtype::dims> & c_o1, const int (& comp)[1])
-        	return g.template getProp<prp>(k)[comp];
+			int c = 0;
+			base_type inte = 0;
+			grid_dist_key_dx<gtype::dims> k_ = k;
+			grid_dist_expression_value_impl_func_vec<gtype::dims-1>::template inte<prp,base_type>(g,k_,c_where,c_o1,inte,c,comp);
+        	if (c != 0)
+        	{inte /= c;}
+        	else
+        	{inte = g.template getProp<prp>(k)[comp[0]];}
+			return inte;
 		template<unsigned int prp, typename gtype>
-		static auto value_ref(gtype & g, const grid_dist_key_dx<gtype::dims> & k, int comp) -> decltype(g.template getProp<prp>(k)[comp])
+		static base_type value_n(gtype & g, const grid_dist_key_dx<gtype::dims> & k)
-        	return g.template getProp<prp>(k)[comp];
+			printf("Error wrong expression please check the components");
+        	return g.template getProp<prp>(k)[0];
+		}
+		template<unsigned int prp, typename gtype>
+		static base_type value_n(gtype & g, const grid_dist_key_dx<gtype::dims> & k, const int (& comp)[1])
+		{
+        	return g.template getProp<prp>(k)[comp[0]];
+		}
+		template<unsigned int prp, typename gtype>
+		static auto value_ref(gtype & g, const grid_dist_key_dx<gtype::dims> & k, const int (& comp)[1]) -> decltype(g.template getProp<prp>(k)[comp[0]])
+		{
+        	return g.template getProp<prp>(k)[comp[0]];
+		}
+		template<unsigned int prp, typename gtype>
+		static auto value_ref(gtype & g, const grid_dist_key_dx<gtype::dims> & k) -> decltype(g.template getProp<prp>(k)[0])
+		{
+			printf("Error wrong expression please check the components");
+        	return g.template getProp<prp>(k)[0];
@@ -369,10 +580,22 @@ namespace FD
 		 * \return the result of the expression
-		inline auto value(const grid_dist_key_dx<grid::dims> & k, comb<grid::dims> & c_where, int comp = 0) const -> decltype(grid_dist_expression_value_impl<type_proc>::template value_n<prp>(g,k,comp))
+		inline auto value(const grid_dist_key_dx<grid::dims> & k, comb<grid::dims> & c_where) const -> decltype(grid_dist_expression_value_impl<type_proc>::template value_n<prp>(g,k))
+		{
+			return grid_dist_expression_value_impl<type_proc>::template value_n<prp>(g,k);
+		}
+		/*! \brief Evaluate the expression
+		 *
+		 * \param k where to evaluate the expression
+		 *
+		 * \return the result of the expression
+		 *
+		 */
+		template<unsigned int nc>
+		inline auto value(const grid_dist_key_dx<grid::dims> & k, comb<grid::dims> & c_where, const int (& comp)[nc]) const -> decltype(grid_dist_expression_value_impl<type_proc>::template value_n<prp>(g,k,comp))
 			return grid_dist_expression_value_impl<type_proc>::template value_n<prp>(g,k,comp);
-//			return g.template getProp<prp>(k);
 		/*! \brief Evaluate the expression
@@ -382,10 +605,22 @@ namespace FD
 		 * \return the result of the expression
-		inline auto value_ref(const grid_dist_key_dx<grid::dims> & k, comb<grid::dims> & c_where, int comp = 0) const -> decltype(grid_dist_expression_value_impl<type_proc>::template value_ref<prp>(g,k,comp))
+		inline auto value_ref(const grid_dist_key_dx<grid::dims> & k, comb<grid::dims> & c_where) const -> decltype(grid_dist_expression_value_impl<type_proc>::template value_ref<prp>(g,k))
+		{
+			return grid_dist_expression_value_impl<type_proc>::template value_ref<prp>(g,k);
+		}
+		/*! \brief Evaluate the expression
+		 *
+		 * \param k where to evaluate the expression
+		 *
+		 * \return the result of the expression
+		 *
+		 */
+		template<unsigned int nc>
+		inline auto value_ref(const grid_dist_key_dx<grid::dims> & k, comb<grid::dims> & c_where, const int (& comp)[nc]) const -> decltype(grid_dist_expression_value_impl<type_proc>::template value_ref<prp>(g,k,comp))
 			return grid_dist_expression_value_impl<type_proc>::template value_ref<prp>(g,k,comp);
-//			return g.template getProp<prp>(k);
 		/*! \brief Fill the grid property with the evaluated expression
@@ -395,12 +630,12 @@ namespace FD
 		 * \return itself
-		template<unsigned int prp2> grid & operator=(const grid_dist_expression<prp2,grid,NORM_EXPRESSION> & g_exp)
+		template<unsigned int prp2,typename grid_type> grid & operator=(const grid_dist_expression<prp2,grid_type,NORM_EXPRESSION> & g_exp)
 			comb<grid::dims> s_pos;
 			auto it = g.getDomainIterator();
@@ -575,7 +810,21 @@ namespace FD
 		 * \return the result of the expression
-		inline auto value_ref(const grid_dist_key_dx<grid::dims> & k, comb<grid::dims> & c_where,int comp = 0) const -> decltype(grid_dist_expression_value_impl<type_proc>::template value_ref<prp>(g,k,comp))
+		inline auto value_ref(const grid_dist_key_dx<grid::dims> & k, comb<grid::dims> & c_where) const -> decltype(grid_dist_expression_value_impl<type_proc>::template value_ref<prp>(g,k))
+		{
+			return grid_dist_expression_value_impl<type_proc>::template value_ref<prp>(g,k);
+			//return g.template getProp<prp>(k);
+		}
+		/*! \brief Evaluate the expression
+		 *
+		 * \param k where to evaluate the expression
+		 *
+		 * \return the result of the expression
+		 *
+		 */
+		template<unsigned int nc>
+		inline auto value_ref(const grid_dist_key_dx<grid::dims> & k, comb<grid::dims> & c_where, const int (& comp)[nc]) const -> decltype(grid_dist_expression_value_impl<type_proc>::template value_ref<prp>(g,k,comp))
 			return grid_dist_expression_value_impl<type_proc>::template value_ref<prp>(g,k,comp);
 			//return g.template getProp<prp>(k);
@@ -588,11 +837,27 @@ namespace FD
 		 * \return the result of the expression
-		inline auto value(grid_dist_key_dx<grid::dims> & k, comb<grid::dims> & c_where, int comp = 0) const -> decltype(grid_dist_expression_value_impl<type_proc>::template inte<prp>(g,k,c_where,c_where,comp))
+		inline auto value(grid_dist_key_dx<grid::dims> & k, comb<grid::dims> & c_where) const -> decltype(grid_dist_expression_value_impl<type_proc>::template inte<prp>(g,k,c_where,c_where))
-			comb<grid::dims> c_o1 = g.getStagPositions()[prp].get(comp);
+			comb<grid::dims> c_o1 = g.getStagPositions()[prp].get(0);
+			return grid_dist_expression_value_impl<type_proc>::template inte<prp>(g,k,c_where,c_o1);
+		}
+		/*! \brief Evaluate the expression
+		 *
+		 * \param k where to evaluate the expression
+		 *
+		 * \return the result of the expression
+		 *
+		 */
+		template<unsigned int nc>
+		inline auto value(const grid_dist_key_dx<grid::dims> & k, comb<grid::dims> & c_where, const int (& comp)[nc]) const -> decltype(grid_dist_expression_value_impl<type_proc>::template inte<prp>(g,k,c_where,c_where,comp))
+		{
+			comb<grid::dims> c_o1 = g.getStagPositions()[prp].get(comp[0]);
 			return grid_dist_expression_value_impl<type_proc>::template inte<prp>(g,k,c_where,c_o1,comp);
+//			return g.template getProp<prp>(k);
 		/*! \brief Fill the grid property with the evaluated expression
@@ -1136,9 +1401,9 @@ namespace FD
 		 * \return the grid
-		gtype & getGrid()
+		auto getGrid() -> decltype(first_or_second<has_getGrid<exp1>::value,exp1,exp2>::getGrid(o1,o2))
-			return o1.getGrid();
+			return first_or_second<has_getGrid<exp1>::value,exp1,exp2>::getGrid(o1,o2);
 		/*! \brief Return the grid on which is acting
@@ -1148,9 +1413,9 @@ namespace FD
 		* \return the grid
-		const gtype & getGrid() const
+		auto getGrid() const -> decltype(first_or_second<has_getGrid<exp1>::value,exp1,exp2>::getGrid(o1,o2))
-			return o1.getGrid();
+			return first_or_second<has_getGrid<exp1>::value,exp1,exp2>::getGrid(o1,o2);
 		template<typename Sys_eqs, typename gmap_type, typename unordered_map_type>
@@ -1220,17 +1485,27 @@ namespace FD
 		template<typename exp_type>
 		static int get(exp_type & o1, grid_dist_key_dx<exp_type::gtype::dims> & key, comb<exp_type::gtype::dims> & c_where, const int (& comp)[1])
+			printf("ERROR: Slicer, the expression is incorrect, please check it\n");
 			return 0;
+		template<typename exp_type>
+		static auto get_ref(exp_type & o1, grid_dist_key_dx<exp_type::gtype::dims> & key, comb<exp_type::gtype::dims> & c_where, const int (& comp)[1]) -> decltype(o1.value_ref(key,c_where))
+		{
+			printf("ERROR: Slicer, the expression is incorrect, please check it\n");
+			return o1.value_ref(key,c_where);
+		}
 		template<unsigned int prop, typename exp_type, typename grid_type>
 		inline static void assign(exp_type & o1, grid_type & g, const grid_dist_key_dx<exp_type::gtype::dims> & key)
+			printf("ERROR: Slicer, the expression is incorrect, please check it\n");
 		template<unsigned int prop, typename grid_type>
 		inline static void assign_double(double d, grid_type & g, const grid_dist_key_dx<grid_type::dims> & key)
+			printf("ERROR: Slicer, the expression is incorrect, please check it\n");
@@ -1238,15 +1513,15 @@ namespace FD
 	struct get_grid_dist_expression_op<1,true>
 		template<typename exp_type>
-		static auto get(exp_type & o1, grid_dist_key_dx<exp_type::gtype::dims> & key, comb<exp_type::gtype::dims> & c_where, const int (& comp)[1]) -> decltype(o1.value(key,c_where,comp[0]))
+		static auto get(exp_type & o1, grid_dist_key_dx<exp_type::gtype::dims> & key, comb<exp_type::gtype::dims> & c_where, const int (& comp)[1]) -> decltype(o1.value(key,c_where,comp) )
-			return o1.value(key,c_where,comp[0]);
+			return o1.value(key,c_where,comp);
 		template<typename exp_type>
-		static auto get_ref(exp_type & o1, grid_dist_key_dx<exp_type::gtype::dims> & key, comb<exp_type::gtype::dims> & c_where, const int (& comp)[1]) -> decltype(o1.value_ref(key,c_where,comp[0]))
+		static auto get_ref(exp_type & o1, grid_dist_key_dx<exp_type::gtype::dims> & key, comb<exp_type::gtype::dims> & c_where, const int (& comp)[1]) -> decltype(o1.value_ref(key,c_where,comp) )
-			return o1.value_ref(key,c_where,comp[0]);
+			return o1.value_ref(key,c_where,comp);
 		template<unsigned int prop,typename exp_type, typename grid_type>
@@ -1262,19 +1537,57 @@ namespace FD
+	template<>
+	struct get_grid_dist_expression_op<2,false>
+	{
+		template<typename exp_type>
+		static auto get(exp_type & o1, grid_dist_key_dx<exp_type::gtype::dims> & key, comb<exp_type::gtype::dims> & c_where, const int (& comp)[2]) -> decltype(o1.value(key,c_where,comp) )
+		{
+			printf("ERROR: Slicer, the expression is incorrect, please check it\n");
+			return o1.value(key,c_where,comp);
+		}
+		template<typename exp_type>
+		static auto get_ref(exp_type & o1, grid_dist_key_dx<exp_type::gtype::dims> & key, comb<exp_type::gtype::dims> & c_where, const int (& comp)[2]) -> decltype(o1.value_ref(key,c_where,comp) )
+		{
+			printf("ERROR: Slicer, the expression is incorrect, please check it\n");
+			return o1.value_ref(key,c_where,comp);
+		}
+		template<unsigned int prop,typename exp_type, typename grid_type>
+		inline static void assign(exp_type & o1, grid_type & g, grid_dist_key_dx<grid_type::dims> & key, comb<exp_type::gtype::dims> & c_where, const int (& comp)[2])
+		{
+			printf("ERROR: Slicer, the expression is incorrect, please check it\n");
+			pos_or_propL<grid_type,prop>::value(g,key)[comp[0]][comp[1]] = o1.value(key,c_where);
+		}
+		template<unsigned int prop, typename grid_type>
+		inline static void assign_double(double d, grid_type & g, const grid_dist_key_dx<grid_type::dims> & key, const int (& comp)[2])
+		{
+			printf("ERROR: Slicer, the expression is incorrect, please check it\n");
+			pos_or_propL<grid_type,prop>::value(g,key)[comp[0]][comp[1]] = d;
+		}
+	};
 	struct get_grid_dist_expression_op<2,true>
 		template<typename exp_type>
-		static auto get(exp_type & o1, grid_dist_key_dx<exp_type::gtype::dims> & key, comb<exp_type::gtype::dims> & c_where, const int (& comp)[2]) -> decltype(o1.value(key,c_where)[0][0])
+		static auto get(exp_type & o1, grid_dist_key_dx<exp_type::gtype::dims> & key, comb<exp_type::gtype::dims> & c_where, const int (& comp)[2]) -> decltype(o1.value(key,c_where,comp) )
+		{
+			return o1.value(key,c_where,comp);
+		}
+		template<typename exp_type>
+		static auto get_ref(exp_type & o1, grid_dist_key_dx<exp_type::gtype::dims> & key, comb<exp_type::gtype::dims> & c_where, const int (& comp)[2]) -> decltype(o1.value_ref(key,c_where,comp) )
-			return o1.value(key)[comp[0]][comp[1]];
+			return o1.value_ref(key,c_where,comp);
 		template<unsigned int prop,typename exp_type, typename grid_type>
-		inline static void assign(exp_type & o1, grid_type & g, const grid_dist_key_dx<grid_type::dims> & key, const int (& comp)[2])
+		inline static void assign(exp_type & o1, grid_type & g, grid_dist_key_dx<grid_type::dims> & key, comb<exp_type::gtype::dims> & c_where, const int (& comp)[2])
-			pos_or_propL<grid_type,prop>::value(g,key)[comp[0]][comp[1]] = o1.value(key);
+			pos_or_propL<grid_type,prop>::value(g,key)[comp[0]][comp[1]] = o1.value(key,c_where);
 		template<unsigned int prop, typename grid_type>
@@ -1284,6 +1597,34 @@ namespace FD
+	template<>
+	struct get_grid_dist_expression_op<3,true>
+	{
+		template<typename exp_type>
+		static auto get(exp_type & o1, grid_dist_key_dx<exp_type::gtype::dims> & key, comb<exp_type::gtype::dims> & c_where, const int (& comp)[3]) -> decltype(o1.value(key,c_where,comp) )
+		{
+			return o1.value(key,c_where,comp);
+		}
+		template<typename exp_type>
+		static auto get_ref(exp_type & o1, grid_dist_key_dx<exp_type::gtype::dims> & key, comb<exp_type::gtype::dims> & c_where, const int (& comp)[3]) -> decltype(o1.value_ref(key,c_where,comp) )
+		{
+			return o1.value_ref(key,c_where,comp);
+		}
+		template<unsigned int prop,typename exp_type, typename grid_type>
+		inline static void assign(exp_type & o1, grid_type & g, grid_dist_key_dx<grid_type::dims> & key, comb<exp_type::gtype::dims> & c_where, const int (& comp)[3])
+		{
+			pos_or_propL<grid_type,prop>::value(g,key)[comp[0]][comp[1]][comp[2]] = o1.value(key,c_where);
+		}
+		template<unsigned int prop, typename grid_type>
+		inline static void assign_double(double d, grid_type & g, const grid_dist_key_dx<grid_type::dims> & key, const int (& comp)[3])
+		{
+			pos_or_propL<grid_type,prop>::value(g,key)[comp[0]][comp[1]][comp[2]] = d;
+		}
+	};
 	/*! \brief it take an expression and create the negatove of this expression
@@ -1307,7 +1648,7 @@ namespace FD
-	        typedef std::false_type is_ker;
+	    typedef std::false_type is_ker;
 		typedef typename exp1::gtype gtype;
@@ -1423,7 +1764,7 @@ namespace FD
 	        o1.template value_nz<Sys_eqs>(g_map,key,gs,spacing,cols,coeff,comp_ + var_id + comp[0],c_where);
-	    inline grid_dist_expression_op<exp1,boost::mpl::int_<2>,g_comp> operator[](int comp_)
+	    inline grid_dist_expression_op<exp1,boost::mpl::int_<n+1>,g_comp> operator[](int comp_)
 	    	int comp_n[n+1];
@@ -1431,7 +1772,7 @@ namespace FD
 	    	{comp_n[i] = comp[i];}
 	    	comp_n[n] = comp_;
-	    	grid_dist_expression_op<exp1,boost::mpl::int_<2>,g_comp> v_exp(o1,comp_n,var_id);
+	    	grid_dist_expression_op<exp1,boost::mpl::int_<n+1>,g_comp> v_exp(o1,comp_n,var_id);
 	    	return v_exp;
@@ -1452,7 +1793,7 @@ namespace FD
 		 * \return itself
-		template<unsigned int prp2, unsigned int impl> gtype & operator=(const grid_dist_expression<prp2,gtype,impl> & v_exp)
+	  template<unsigned int prp2, typename gtype2, unsigned int impl> gtype & operator=(const grid_dist_expression<prp2,gtype2,impl> & v_exp)
@@ -1460,11 +1801,14 @@ namespace FD
 			auto it = g.getDomainIterator();
+			comb<gtype::dims> c_where;
 			while (it.isNext())
 				auto key = it.get();
-				get_grid_dist_expression_op<n,n == rank_gen<property_act>::type::value>::template assign<exp1::prop>(v_exp,g,key,comp);
+				get_grid_dist_expression_op<n,n == rank_gen<property_act>::type::value>::template assign<exp1::prop>(v_exp,g,key,c_where,comp);
@@ -1561,8 +1905,368 @@ namespace FD
 		return exp_g;
+////// Specialization for temporal FD_expressions
+	template<unsigned int dim>
+	struct gdb_ext_plus_g_info
+	{
+		grid_sm<dim,void> & ginfo_v;
+		openfpm::vector<GBoxes<dim>> & gdb_ext;
+		bool operator==(const gdb_ext_plus_g_info & tmp)
+		{
+			bool is_equal = gdb_ext.size() == tmp.gdb_ext.size();
+			for (int i = 0 ; i < gdb_ext.size() ; i++)
+			{
+				is_equal &= gdb_ext.get(i) == tmp.gdb_ext.get(i);
+			}
+			is_equal &= ginfo_v == tmp.ginfo_v;
+			return is_equal;
+		}
+	};
+	template<unsigned int dim>
+	class grid_dist_expression_iterator_to_make_algebra_work
+	{
+		//! Grid informations object without type
+		grid_sm<dim,void> & ginfo_v;
+		//! The grid
+		openfpm::vector<grid_cpu<dim,aggregate<double>>> & loc_grid;
+		openfpm::vector<GBoxes<dim>> & gdb_ext;
+		typedef grid_cpu<dim,aggregate<double>> device_grid;
+	public:
+		static constexpr unsigned int dims = dim;
+		grid_dist_expression_iterator_to_make_algebra_work(openfpm::vector<grid_cpu<dim,aggregate<double>>> & loc_grid,
+															openfpm::vector<GBoxes<dim>> & gdb_ext,
+															grid_sm<dim,void> & ginfo_v)
+		:loc_grid(loc_grid),gdb_ext(gdb_ext),ginfo_v(ginfo_v)
+		{}
+		gdb_ext_plus_g_info<dim> size()
+		{
+			return gdb_ext_plus_g_info<dim>{ginfo_v,gdb_ext};
+		}
+        //Need more treatment for staggered (c_where based on exp)
+		template<unsigned int prp>
+        inline auto get(grid_dist_key_dx<dim> & key) -> decltype(loc_grid.get(key.getSub()).template get<0>(key.getKey()))
+        {
+            return loc_grid.get(key.getSub()).template get<0>(key.getKey());
+        }
+		/*! \brief Return the number of local grid
+		*
+		* \return the number of local grid
+		*
+		*/
+		size_t getN_loc_grid() const
+		{
+			return loc_grid.size();
+		}
+		/*! \brief Get the i sub-domain grid
+		*
+		* \param i sub-domain
+		*
+		* \return local grid
+		*
+		*/
+		device_grid & get_loc_grid(size_t i)
+		{
+			return loc_grid.get(i);
+		}
+		/*! \brief Get the i sub-domain grid
+		*
+		* \param i sub-domain
+		*
+		* \return local grid
+		*
+		*/
+		const device_grid & get_loc_grid(size_t i) const
+		{
+			return loc_grid.get(i);
+		}
+		/*! \brief Get an object containing the grid informations without type
+		*
+		* \return an information object about this grid
+		*
+		*/
+		const grid_sm<dim,void> & getGridInfoVoid() const
+		{
+			return ginfo_v;
+		}
+		/*! \brief It return the informations about the local grids
+		*
+		* \return The information about the local grids
+		*
+		*/
+		const openfpm::vector<GBoxes<device_grid::dims>> & getLocalGridsInfo() const
+		{
+			return gdb_ext;
+		}
+		void resize(const gdb_ext_plus_g_info<dim> & input)
+		{
+			size_t Nloc_grid = input.gdb_ext.size();
+			loc_grid.resize(Nloc_grid);
+			for (int i = 0 ; i < Nloc_grid; i++)
+			{
+				size_t sz[dim];
+				for (int j = 0 ; j < dim ; j++)	{sz[j] = input.gdb_ext.get(i).GDbox.getKP2().get(j) + 1;}
+				loc_grid.get(i).resize(sz);
+			}
+			gdb_ext = input.gdb_ext;
+			ginfo_v = input.ginfo_v;
+		}
+		grid_dist_iterator<dim,device_grid,
+					   decltype(device_grid::type_of_subiterator()),FREE> getIterator()
+		{
+			grid_key_dx<dim> stop(ginfo_v.getSize());
+			grid_key_dx<dim> one;
+			stop = stop - one;
+			grid_dist_iterator<dim,device_grid,
+								decltype(device_grid::type_of_subiterator()),
+								FREE> it(loc_grid,gdb_ext,stop);
+			return it;
+		}
+	};
+	template<typename patches>
+	struct grid_patches
+	{
+		static constexpr unsigned int dims = patches::dims;
+		openfpm::vector<patches> loc_grid;
+	};
+	/*! \brief Main class that encapsulate a grid properties operand to be used for expressions construction
+	 *
+	 * \tparam prp property involved
+	 * \tparam grid involved
+	 *
+	 */
+	template<unsigned int dim>
+	class grid_dist_expression<0,grid_patches<grid_cpu<dim,aggregate<double>>>,NORM_EXPRESSION>
+	{
+		//! The grid
+		mutable  grid_patches<grid_cpu<dim,aggregate<double>>> data;
+		mutable openfpm::vector<GBoxes<dim>> gdb_ext;
+		//! Grid informations object without type
+		mutable grid_sm<dim,void> ginfo_v;
+		typedef double type_proc;
+		template<typename super_general>
+		void operator_equal(super_general & g_exp)
+		{
+			g_exp.init();
+			resize(g_exp.getGrid());
+			comb<dim> s_pos;
+			auto it = this->getVector().getIterator();
+			while (it.isNext())
+			{
+				auto key = it.get();
+				data.loc_grid.get(key.getSub()).template get<0>(key.getKey()) = g_exp.value(key,s_pos);
+				++it;
+			}
+		}
+	public:
+		static constexpr unsigned int dims = dim;
+		typedef grid_dist_key_dx<dim,grid_key_dx<dim>> index_type;
+		//! The type of the internal grid
+		typedef grid_dist_expression_iterator_to_make_algebra_work<dim> gtype;
+		//! Property id of the point
+		static const unsigned int prop = 0;
+		grid_dist_expression()
+		{}
+		gdb_ext_plus_g_info<dim> size() const
+		{
+			return gdb_ext_plus_g_info<dim>{ginfo_v,gdb_ext};
+		}
+		//! constructor for an external grid
+		template<typename grid>
+		grid_dist_expression(grid & g)
+		{
+			resize(g);
+		}
+		template<typename grid>
+		void resize(grid & g)
+		{
+			size_t Nloc_grid = g.getN_loc_grid();
+			data.loc_grid.resize(Nloc_grid);
+			for (int i = 0 ; i < Nloc_grid; i++)
+			{
+				data.loc_grid.get(i).resize(g.get_loc_grid(i).getGrid().getSize());
+			}
+			gdb_ext = g.getLocalGridsInfo();
+			ginfo_v = g.getGridInfoVoid();
+		}
+		grid_dist_expression_iterator_to_make_algebra_work<dim> getVector() const
+		{
+			return grid_dist_expression_iterator_to_make_algebra_work<dim>(data.loc_grid,gdb_ext,ginfo_v);
+		}
+		/*! \brief Return the grid on which is acting
+		 *
+		 * It return the grid used in getVExpr, to get this object
+		 *
+		 * \return the grid
+		 *
+		 */
+		grid_dist_expression_iterator_to_make_algebra_work<dim> getGrid()
+		{
+			return getVector();
+		}
+		/*! \brief Return the grid on which is acting
+		*
+		* It return the grid used in getVExpr, to get this object
+		*
+		* \return the grid
+		*
+		*/
+		const grid_dist_expression_iterator_to_make_algebra_work<dim> getGrid() const
+		{
+			return getVector();
+		}
+		/*! \brief This function must be called before value
+		 *
+		 * it initialize the expression if needed
+		 *
+		 */
+		inline void init() const
+		{}
+		/*! \brief Evaluate the expression
+		 *
+		 * \param k where to evaluate the expression
+		 *
+		 * \return the result of the expression
+		 *
+		 */
+		inline double value(const grid_dist_key_dx<dim> & k, const comb<dim> & c_where = comb<dim>()) const
+		{
+			return data.loc_grid.get(k.getSub()).template get<0>(k.getKey());
+		}
+		/*! \brief Evaluate the expression
+		 *
+		 * \param k where to evaluate the expression
+		 *
+		 * \return the result of the expression
+		 *
+		 */
+		// template<unsigned int nc>
+		// inline auto value(const grid_dist_key_dx<grid::dims> & k, comb<grid::dims> & c_where, const int (& comp)[nc]) const -> decltype(grid_dist_expression_value_impl<type_proc>::template value_n<prp>(g,k,comp))
+		// {
+		// 	return loc_grid.get(k.getSub()).template get<0>(k.getKey());
+		// }
+		/*! \brief Evaluate the expression
+		 *
+		 * \param k where to evaluate the expression
+		 *
+		 * \return the result of the expression
+		 *
+		 */
+		inline double & value_ref(const grid_dist_key_dx<dim> & k, const comb<dim> & c_where = comb<dim>())
+		{
+			return data.loc_grid.get(k.getSub()).template get<0>(k.getKey());
+		}
+		/*! \brief Fill the grid property with the evaluated expression
+		 *
+		 * \param v_exp expression to evaluate
+		 *
+		 * \return itself
+		 *
+		 */
+		template<unsigned int prp2, typename grid> const grid & operator=(const grid_dist_expression<prp2,grid,NORM_EXPRESSION> & g_exp)
+		{
+			operator_equal(g_exp);
+			return g_exp.getGrid();
+		}
+		/*! \brief Fill the grid property with the evaluated expression
+		 *
+		 * \param v_exp expression to evaluate
+		 *
+		 * \return itself
+		 *
+		 */
+		template<typename exp1, typename exp2, typename op> auto operator=(const grid_dist_expression_op<exp1,exp2,op> & g_exp) -> decltype(g_exp.getGrid())
+		{
+			operator_equal(g_exp);
+			return g_exp.getGrid();
+		}
+        //Need more treatment for staggered (c_where based on exp)
+        inline double get(grid_dist_key_dx<dim> & key)
+        {
+            comb<dim> c_where;
+  ;
+            return this->value(key,c_where);
+        }
+		int isConstant(){
+		    return false;
+		}
+	};
+template<unsigned int dim, typename T> using texp_g = FD::grid_dist_expression<0,FD::grid_patches<grid_cpu<dim,aggregate<T>>>,FD::NORM_EXPRESSION>;
 /* \brief sum two distributed grid expression
  * \param ga grid expression one
diff --git a/src/FiniteDifference/FD_op_Tests.cpp b/src/FiniteDifference/FD_op_Tests.cpp
index 1ca8c556..a5abf939 100644
--- a/src/FiniteDifference/FD_op_Tests.cpp
+++ b/src/FiniteDifference/FD_op_Tests.cpp
@@ -114,6 +114,105 @@ BOOST_AUTO_TEST_SUITE(fd_op_suite_tests)
+    BOOST_AUTO_TEST_CASE(fd_op_tests_vec_mat) {
+        size_t edgeSemiSize = 80;
+        const size_t sz[2] = {2 * edgeSemiSize+1, 2 * edgeSemiSize+1};
+        Box<2, double> box({0, 0}, {2 * M_PI, 2 * M_PI});
+        periodicity<2> bc({NON_PERIODIC, NON_PERIODIC});
+        double spacing[2];
+        spacing[0] = 2 * M_PI / (sz[0] - 1);
+        spacing[1] = 2 * M_PI / (sz[1] - 1);
+        Ghost<2, long int> ghost(1);
+        //std::cout << "Spacing: " << spacing[0] << " " << spacing[1] << std::endl;
+        grid_dist_id<2, double, aggregate<double, double, double,double[2],double[2][2],double[2][2][2]>> domain(sz, box,ghost,bc);
+        BOOST_TEST_MESSAGE("Init domain...");
+        auto it = domain.getDomainIterator();
+        while (it.isNext())
+        {
+            auto key_l = it.get();
+            auto key = it.getGKey(key_l);
+            mem_id i = key.get(0);
+            double x = i * spacing[0];
+            mem_id j = key.get(1);
+            double y = j * spacing[1];
+            // Here fill the function value P
+            domain.template getProp<3>(key_l)[0] = sin(x);
+            domain.template getProp<3>(key_l)[1] = sin(x);
+            domain.template getProp<1>(key_l) = 0;
+            // Here fill the validation value for Df/Dx in property 3
+            domain.template getProp<2>(key_l) = cos(x);
+            ++it;
+        }
+        domain.ghost_get<0,3>();
+        FD::Derivative_x Dx;
+        FD::Derivative_y Dy;
+        auto v = FD::getV<1>(domain);
+        auto P = FD::getV<0>(domain);
+        auto vec = FD::getV<3>(domain);
+        auto Mat = FD::getV<4>(domain);
+        auto Mat3 = FD::getV<5>(domain);
+        Mat[0][1] = Dx(vec[0]);
+        Mat[1][0] = vec[0];
+        domain.ghost_get<4>();
+        Mat[0][0] = Dx(Mat[1][0]);
+        Mat3[0][0][0] = Dx(vec[0]);
+        Mat3[0][1][0] = Dx(Mat[1][0]);
+        auto it2 = domain.getDomainIterator();
+        double worst = 0.0;
+        while (it2.isNext()) 
+        {
+            auto p = it2.get();
+            if (fabs(domain.getProp<4>(p)[0][1] - domain.getProp<2>(p)) > worst) 
+            {
+                worst = fabs(domain.getProp<4>(p)[0][1] - domain.getProp<2>(p));
+            }
+            if (fabs(domain.getProp<4>(p)[1][0] - domain.getProp<3>(p)[0]) > worst) 
+            {
+                worst = fabs(domain.getProp<4>(p)[1][0] - domain.getProp<3>(p)[0]);
+            }
+            if (fabs(domain.getProp<4>(p)[0][0] - domain.getProp<2>(p)) > worst)
+            {
+                worst = fabs(domain.getProp<4>(p)[0][0] - domain.getProp<2>(p));
+            }
+            /////////////////////////// Mat 3
+            if (fabs(domain.getProp<5>(p)[0][0][0] - domain.getProp<2>(p)) > worst)
+            {
+                worst = fabs(domain.getProp<5>(p)[0][0][0] - domain.getProp<2>(p));
+            }
+            if (fabs(domain.getProp<5>(p)[0][1][0] - domain.getProp<2>(p)) > worst)
+            {
+                worst = fabs(domain.getProp<5>(p)[0][1][0] - domain.getProp<2>(p));
+            }
+            ++it2;
+        }
+        BOOST_REQUIRE(worst < 0.003);
+    }
     BOOST_AUTO_TEST_CASE(lalpacian_test) {
         size_t edgeSemiSize = 80;
         const size_t sz[2] = {2 * edgeSemiSize+1, 2 * edgeSemiSize+1};
diff --git a/src/Matrix/SparseMatrix_petsc.hpp b/src/Matrix/SparseMatrix_petsc.hpp
index 55533924..24cba2a2 100644
--- a/src/Matrix/SparseMatrix_petsc.hpp
+++ b/src/Matrix/SparseMatrix_petsc.hpp
@@ -196,7 +196,7 @@ private:
+        PETSC_SAFE_CALL(MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY));
 		m_created = true;
@@ -223,6 +223,7 @@ public:
+        PETSC_SAFE_CALL(MatSetFromOptions(mat));
 		Vcluster<> & v_cl = create_vcluster();
@@ -245,7 +246,9 @@ public:
+        PETSC_SAFE_CALL(MatSetType(mat,MATMPIAIJ));
+        PETSC_SAFE_CALL(MatSetFromOptions(mat));
diff --git a/src/OdeIntegrators/OdeIntegrators.hpp b/src/OdeIntegrators/OdeIntegrators.hpp
index 1aad7092..29334842 100644
--- a/src/OdeIntegrators/OdeIntegrators.hpp
+++ b/src/OdeIntegrators/OdeIntegrators.hpp
@@ -16,10 +16,12 @@ struct has_state_vector: std::false_type {};
 template<typename T>
 struct has_state_vector<T, typename Void< typename T::is_state_vector>::type> : std::true_type
 namespace boost{
     template<class T,class Enabler=typename std::enable_if<has_state_vector<T>::value>::type>
-    inline size_t
-    size(const T& rng)
+    inline auto
+    size(const T& rng) -> decltype(rng.size())
         return rng.size();
@@ -27,7 +29,59 @@ namespace boost{
 #include <boost/numeric/odeint.hpp>
 #include "Operators/Vector/vector_dist_operators.hpp"
-#include "OdeIntegrators/boost_vector_algebra_ofp.hpp"
+#include "FiniteDifference/FD_expressions.hpp"
+#include "OdeIntegrators/vector_algebra_ofp.hpp"
+#ifdef __NVCC__
+#include "OdeIntegrators/vector_algebra_ofp_gpu.hpp"
+/*! \brief A 1d Odeint and Openfpm compatible structure.
+ *
+ *  Use the method<d>() to refer to property of all the particles in the dimension d.
+ *
+ * d starts with 0.
+ *
+ */
+struct state_type_1d_ofp_ker{
+    state_type_1d_ofp_ker(){
+    }
+    typedef decltype(std::declval<texp_v_gpu<double>>().getVector().toKernel()) state_kernel;
+    typedef size_t size_type;
+    typedef int is_state_vector;
+    aggregate<state_kernel> data;
+    __host__ __device__ size_t size() const
+    { return data.get<0>().size(); }
+/*! \brief A 1d Odeint and Openfpm compatible structure.
+ *
+ *  Use the method<d>() to refer to property of all the particles in the dimension d.
+ *
+ * d starts with 0.
+ *
+ */
+struct state_type_1d_ofp_gpu{
+    state_type_1d_ofp_gpu(){
+    }
+    typedef size_t size_type;
+    typedef int is_state_vector;
+    aggregate<texp_v_gpu<double>> data;
+    size_t size() const
+    { return data.get<0>().size(); }
+    void resize(size_t n)
+    {
+        data.get<0>().resize(n);
+    }
+    state_type_1d_ofp_ker toKernel() const
+    {
+        state_type_1d_ofp_ker s1_ker;
+        return s1_ker;
+    }
 namespace boost { namespace numeric { namespace odeint {
@@ -51,6 +105,7 @@ struct state_type_1d_ofp{
     typedef size_t size_type;
+    typedef size_t index_type;
     typedef int is_state_vector;
     aggregate<texp_v<double>> data;
@@ -74,6 +129,7 @@ struct state_type_2d_ofp{
     typedef size_t size_type;
+    typedef size_t index_type;
     typedef int is_state_vector;
     aggregate<texp_v<double>,texp_v<double>> data;
@@ -98,6 +154,7 @@ struct state_type_3d_ofp{
     typedef size_t size_type;
+    typedef size_t index_type;
     typedef int is_state_vector;
     aggregate<texp_v<double>,texp_v<double>,texp_v<double>> data;
@@ -123,6 +180,7 @@ struct state_type_4d_ofp{
     typedef size_t size_type;
+    typedef size_t index_type;
     typedef int is_state_vector;
     aggregate<texp_v<double>,texp_v<double>,texp_v<double>,texp_v<double>> data;
@@ -149,6 +207,7 @@ struct state_type_5d_ofp{
     typedef size_t size_type;
+    typedef size_t index_type;
     typedef int is_state_vector;
     aggregate<texp_v<double>,texp_v<double>,texp_v<double>,texp_v<double>,texp_v<double>> data;
@@ -166,15 +225,61 @@ struct state_type_5d_ofp{
+template<int counter, typename state_type, typename ... list>
+struct state_type_ofpm_add_elements
+//    typedef aggregate<list ..., texp_v<double>> one_more;
+    typedef typename state_type_ofpm_add_elements<counter-1,state_type, state_type,list ...>::type type;
+template<typename state_type, typename ... list>
+struct state_type_ofpm_add_elements<0,state_type,list ...>
+   typedef aggregate<list ...> type;
+template<int n_state, typename state_type>
+struct state_type_ofpm_impl
+    typedef FD::gdb_ext_plus_g_info<state_type::dims> size_type;
+    typedef typename state_type::index_type index_type;
+    typedef int is_state_vector;
+    typedef typename state_type_ofpm_add_elements<n_state-1,state_type, state_type>::type type_data;
+    type_data data;
+    FD::gdb_ext_plus_g_info<state_type::dims> size() const
+    {
+        return data.template get<0>().size();
+    }
+    void resize(const FD::gdb_ext_plus_g_info<state_type::dims> & rsz_obj)
+    {
+        // to fill
+    }
 namespace boost {
     namespace numeric {
         namespace odeint {
+            // FOR particles
             struct is_resizeable<state_type_1d_ofp> {
             typedef boost::true_type type;
             static const bool value = type::value;
+#ifdef __NVCC__
+            template<>
+            struct is_resizeable<state_type_1d_ofp_gpu> {
+                typedef boost::true_type type;
+                static const bool value = type::value;
+            };
             struct is_resizeable<state_type_2d_ofp> {
                 typedef boost::true_type type;
@@ -229,6 +334,98 @@ namespace boost {
                 typedef double result_type;
+            // For GRIDs
+            template<typename state_type>
+            struct is_resizeable<state_type_ofpm_impl<1,state_type> > {
+            typedef boost::true_type type;
+            static const bool value = type::value;
+            };
+            template<typename state_type>
+            struct is_resizeable<state_type_ofpm_impl<2,state_type> > {
+            typedef boost::true_type type;
+            static const bool value = type::value;
+            };
+            template<typename state_type>
+            struct is_resizeable<state_type_ofpm_impl<3,state_type> > {
+            typedef boost::true_type type;
+            static const bool value = type::value;
+            };
+            template<typename state_type>
+            struct is_resizeable<state_type_ofpm_impl<4,state_type> > {
+            typedef boost::true_type type;
+            static const bool value = type::value;
+            };
+            template<typename state_type>
+            struct is_resizeable<state_type_ofpm_impl<5,state_type> > {
+            typedef boost::true_type type;
+            static const bool value = type::value;
+            };
+/*            template<>
+            struct is_resizeable<state_type_2d_ofp> {
+                typedef boost::true_type type;
+                static const bool value = type::value;
+            };
+            template<>
+            struct is_resizeable<state_type_3d_ofp> {
+                typedef boost::true_type type;
+                static const bool value = type::value;
+            };
+            template<>
+            struct is_resizeable<state_type_4d_ofp> {
+                typedef boost::true_type type;
+                static const bool value = type::value;
+            };
+            template<>
+            struct is_resizeable<state_type_5d_ofp> {
+                typedef boost::true_type type;
+                static const bool value = type::value;
+            };*/
+/*      //      template<unsigned int nprp, typename state_type>
+            struct vector_space_norm_inf<state_type_ofpm_impl<nprp,state_type>>
+            {
+                typedef double result_type;
+            };*/
+            template<typename state_type>
+            struct vector_space_norm_inf<state_type_ofpm_impl<1,state_type>>
+            {
+                typedef double result_type;
+            };
+            template<typename state_type>
+            struct vector_space_norm_inf<state_type_ofpm_impl<2,state_type>>
+            {
+                typedef double result_type;
+            };
+            template<typename state_type>
+            struct vector_space_norm_inf<state_type_ofpm_impl<3,state_type>>
+            {
+                typedef double result_type;
+            };
+            template<typename state_type>
+            struct vector_space_norm_inf<state_type_ofpm_impl<4,state_type>>
+            {
+                typedef double result_type;
+            };
+            template<typename state_type>
+            struct vector_space_norm_inf<state_type_ofpm_impl<5,state_type>>
+            {
+                typedef double result_type;
+            };
diff --git a/src/OdeIntegrators/tests/OdeIntegrator_grid_tests.cpp b/src/OdeIntegrators/tests/OdeIntegrator_grid_tests.cpp
new file mode 100644
index 00000000..a365a46a
--- /dev/null
+++ b/src/OdeIntegrators/tests/OdeIntegrator_grid_tests.cpp
@@ -0,0 +1,576 @@
+// Created by foggia on 19th Jan 2022
+// It's a modification of Abhinav's test, adapted for grids
+#include <iostream>
+#include <boost/test/unit_test.hpp>
+#include "config.h"
+#include "Grid/grid_dist_id.hpp"
+#include "OdeIntegrators/OdeIntegrators.hpp"
+#include "FiniteDifference/FD_op.hpp"
+#include "util/util_debug.hpp"
+#include "util/common.hpp"
+const double a = 2.8e-4;
+const double b = 5e-3;
+const double tau = .1;
+const double k = .005;
+const int dim = 2;
+void *gridGlobal;
+typedef grid_dist_id<2,double,aggregate<double,double,double,double,double,double>> grid_type;
+// State types for systems with different number of ODEs
+typedef state_type_ofpm_impl<1,texp_g<dim,double>> state_type_1ode;
+typedef state_type_ofpm_impl<2,texp_g<dim,double>> state_type_2ode;
+typedef state_type_ofpm_impl<3,texp_g<dim,double>> state_type_3ode;
+template<typename DX, typename DY>
+struct Fitz {
+  DX & ddx;
+  DY & ddy;
+  //Constructor
+  Fitz(DX & m_ddx, DY & m_ddy) :
+    ddx(m_ddx),
+    ddy(m_ddy) {}
+  void operator()(const state_type_2ode & x,
+		  state_type_2ode & dxdt,
+		  const double t) const {
+    grid_type & temp = *(grid_type *) gridGlobal;
+    auto u{FD::getV<4>(temp)};
+    auto v{FD::getV<5>(temp)};
+    u =<0>();
+    v =<1>();
+    temp.ghost_get<4,5>();
+<0>() = ddx(u) + ddy(u) + (1.0);
+<1>() = ddx(v) + ddy(v) + (2.0);
+    // One point stay fixed
+    auto key2 = temp.getDomainIterator().get();
+    if (create_vcluster().rank() == 0) {
+<0>().value_ref(key2) = 0.0;
+<1>().value_ref(key2) = 0.0;
+    }
+    double u_max{0.0};
+    double v_max{0.0};
+    auto it = temp.getDomainIterator();
+    while (it.isNext())
+    {
+      auto key = it.get();
+      if (u_max <<0>().value(key))
+        u_max =<0>().value(key);
+      if (v_max <<1>().value(key))
+        v_max =<1>().value(key);
+      ++it;
+    }
+  }
+void Exponential_struct_ofp2(const state_type_3ode & x,
+			     state_type_3ode & dxdt,
+			     const double t) {
+  // sytem: dx1/dt = x1 --> solution: x1(t) = exp(t)
+  // sytem: dx2/dt = 2*x2 --> solution: x2(t) = exp(2t)
+<0>() =<0>();
+<1>() = 2.0 *<1>();
+<2>() =<0>();
+void Exponential(const state_type_1ode & x,
+		 state_type_1ode & dxdt,
+		 const double t) {
+  // sytem: dx/dt = x --> solution: x(t) = exp(t)
+  dxdt = x;
+// void sigmoid(const state_type_1ode & x,
+// 	     state_type_1ode & dxdt,
+// 	     const double t) {
+//   dxdt = x * (1.0 - x);
+// }
+BOOST_AUTO_TEST_CASE(odeint_grid_test_exponential) {
+  size_t edgeSemiSize{40};
+  const size_t sz[dim] = {edgeSemiSize,edgeSemiSize};
+  Box<dim,double> box{{0.0,0.0}, {1.0,1.0}};
+  periodicity<dim> bc{{NON_PERIODIC,NON_PERIODIC}};
+  double spacing[dim];
+  spacing[0] = 1.0 / (sz[0] - 1);
+  spacing[1] = 1.0 / (sz[1] - 1);
+  Ghost<dim,long int> ghost{2};
+  BOOST_TEST_MESSAGE("Test: exponential");
+  BOOST_TEST_MESSAGE("Init grid_dist_id ...");
+  grid_dist_id<dim,double,aggregate<double,double,double>> grid{sz,box,ghost,bc};
+  auto it{grid.getDomainIterator()};
+  while (it.isNext()) {
+    auto key = it.get();
+    grid.template get<0>(key) = std::exp(0);   // Initial state
+    grid.template get<1>(key) = std::exp(0.4); // Analytical solution
+    ++it;
+  }
+  grid.ghost_get<0>();
+  auto Init{FD::getV<0>(grid)};   // Initial state
+  auto Sol{FD::getV<1>(grid)};    // Analytical solution
+  auto OdeSol{FD::getV<2>(grid)}; // Numerical solution
+  state_type_1ode x0;
+<0>() = Init;
+  double t{0.0};
+  double tf{0.4};
+  const double dt{0.1};
+  boost::numeric::odeint::runge_kutta4<state_type_1ode,double,
+  				       state_type_1ode,double,
+  				       boost::numeric::odeint::vector_space_algebra_ofp> rk4; // Time integrator
+  size_t steps{boost::numeric::odeint::integrate_const(rk4,Exponential,x0,0.0,tf,dt)};
+  OdeSol =<0>(); // Numerical solution
+  // Error
+  auto it2{grid.getDomainIterator()};
+  double worst{0.0};
+  while (it2.isNext()) {
+    auto p{it2.get()};
+    if (std::fabs(grid.template get<1>(p) - grid.template get<2>(p)) > worst) {
+      worst = std::fabs(grid.template get<1>(p) - grid.template get<2>(p));
+    }
+    ++it2;
+  }
+  std::cout << worst << std::endl;
+  BOOST_REQUIRE(worst < 1e-6);
+  // Another way
+<0>() = Init;
+  while (t < tf) {
+    rk4.do_step(Exponential,x0,t,dt);
+    OdeSol =<0>();
+    t += dt;
+  }
+  OdeSol =<0>();
+  // Error
+  auto it3{grid.getDomainIterator()};
+  double worst2{0.0};
+  while (it3.isNext()) {
+    auto p{it3.get()};
+    if (std::fabs(grid.template get<1>(p) - grid.template get<2>(p)) > worst2) {
+      worst2 = fabs(grid.template get<1>(p) - grid.template get<2>(p));
+    }
+    ++it3;
+  }
+  std::cout << worst2 << std::endl;
+  BOOST_REQUIRE(worst2 < 1e-6);
+  BOOST_REQUIRE_EQUAL(worst,worst2);
+BOOST_AUTO_TEST_CASE(odeint_grid_test_STRUCT_exponential) {
+  size_t edgeSemiSize{40};
+  const size_t sz[dim] = {edgeSemiSize,edgeSemiSize};
+  Box<dim, double> box{{0.0,0.0},{1.0,1.0}};
+  periodicity<dim> bc{{NON_PERIODIC,NON_PERIODIC}};
+  double spacing[dim];
+  spacing[0] = 1.0 / (sz[0] - 1);
+  spacing[1] = 1.0 / (sz[1] - 1);
+  Ghost<dim,long int> ghost{2};
+  BOOST_TEST_MESSAGE("Test: exponential");
+  BOOST_TEST_MESSAGE("Init grid_dist_id ...");
+  grid_dist_id<dim,double,aggregate<double,double,double,double,double,double>> grid{sz,box,ghost,bc};
+  auto it{grid.getDomainIterator()};
+  while (it.isNext()) {
+    auto key = it.get();
+    grid.get<0>(key) = std::exp(0);
+    grid.template get<0>(key) = std::exp(0.0); // Initial state 1
+    grid.template get<1>(key) = std::exp(0.4); // Analytical solution 1
+    grid.template get<2>(key) = std::exp(0.0); // Initial state 2
+    grid.template get<3>(key) = std::exp(0.8); // Analytical solution 2
+    ++it;
+  }
+  grid.ghost_get<0>();
+  auto Init1{FD::getV<0>(grid)};   // Initial state 1
+  auto Sol1{FD::getV<1>(grid)};    // Analytical solution 1
+  auto Init2{FD::getV<2>(grid)};   // Initial state 2
+  auto Sol2{FD::getV<3>(grid)};    // Analytical solution 2
+  auto OdeSol1{FD::getV<4>(grid)}; // Numerical solution 1
+  auto OdeSol2{FD::getV<5>(grid)}; // Numerical solution 2
+  state_type_3ode x0;
+<0>() = Init1;
+<1>() = Init2;
+<2>() = Init1;
+  double t{0};
+  double tf{0.4};
+  const double dt{0.1};
+  // size_t steps{boost::numeric::odeint::integrate(Exponential_struct,x0,0.0,tf,dt)};
+  // size_t steps{boost::numeric::odeint::integrate_const(boost::numeric::odeint::runge_kutta4<state_type_3ode,double,state_type_3ode,double,boost::numeric::odeint::vector_space_algebra_ofp>(),
+  // 						       Exponential_struct_ofp,x0,0.0,tf,dt)};
+  typedef boost::numeric::odeint::controlled_runge_kutta<boost::numeric::odeint::runge_kutta_cash_karp54<state_type_3ode,double,state_type_3ode,double,boost::numeric::odeint::vector_space_algebra_ofp>> stepper_type;
+  integrate_adaptive(stepper_type(),Exponential_struct_ofp2,x0,t,tf,dt);
+  OdeSol1 =<0>();
+  OdeSol2 =<1>();
+  // Error
+  auto it2{grid.getDomainIterator()};
+  double worst{0.0};
+  double worst2{0.0};
+  while (it2.isNext()) {
+    auto p{it2.get()};
+    if (std::fabs(grid.getProp<1>(p) - grid.getProp<4>(p)) > worst)
+      worst = std::fabs(grid.getProp<1>(p) - grid.getProp<4>(p));
+    if (std::fabs(grid.getProp<3>(p) - grid.getProp<5>(p)) > worst2)
+      worst2 = std::fabs(grid.getProp<3>(p) - grid.getProp<5>(p));
+    ++it2;
+  }
+  std::cout << worst << " " << worst2 << std::endl;
+  BOOST_REQUIRE(worst < 1e-6);
+  BOOST_REQUIRE(worst2 < 1e-6);
+  // A different way
+<0>() = Init1;
+<1>() = Init2;
+<2>() = Init1;
+  boost::numeric::odeint::runge_kutta4<state_type_3ode,double,state_type_3ode,double,boost::numeric::odeint::vector_space_algebra_ofp> rk4;
+  while (t < tf) {
+    rk4.do_step(Exponential_struct_ofp2,x0,t,dt);
+    t+=dt;
+  }
+  OdeSol1 =<0>();
+  OdeSol2 =<1>();
+  // Error
+  auto it3{grid.getDomainIterator()};
+  double worst3{0.0};
+  double worst4{0.0};
+  while (it3.isNext()) {
+    auto p{it3.get()};
+    if (std::fabs(grid.getProp<1>(p) - grid.getProp<4>(p)) > worst3)
+      worst3 = std::fabs(grid.getProp<1>(p) - grid.getProp<4>(p));
+    if (std::fabs(grid.getProp<3>(p) - grid.getProp<5>(p)) > worst4)
+      worst4 = std::fabs(grid.getProp<3>(p) - grid.getProp<5>(p));
+    ++it3;
+  }
+  std::cout << worst3 << " " << worst4 << std::endl;
+  BOOST_REQUIRE(worst3 < 1e-6);
+  BOOST_REQUIRE(worst4 < 5e-5);
+BOOST_AUTO_TEST_CASE(odeint_grid_test2_exponential) {
+  size_t edgeSemiSize{40};
+  const size_t sz[dim] = {edgeSemiSize,edgeSemiSize};
+  Box<dim,double> box{{0.0,0.0}, {1.0,1.0}};
+  periodicity<dim> bc{{NON_PERIODIC,NON_PERIODIC}};
+  double spacing[dim];
+  spacing[0] = 1.0 / (sz[0] - 1);
+  spacing[1] = 1.0 / (sz[1] - 1);
+  Ghost<dim,long int> ghost{2};
+  BOOST_TEST_MESSAGE("Test: exponential");
+  BOOST_TEST_MESSAGE("Init grid_dist_id ...");
+  grid_dist_id<dim,double,aggregate<double,double,double>> grid{sz,box,ghost,bc};
+  double t{0.0};
+  double tf{0.5};
+  const double dt{0.1};
+  auto it{grid.getDomainIterator()};
+  while (it.isNext()) {
+    auto key = it.get();
+    grid.template get<0>(key) = std::exp(t);  // Initial state
+    grid.template get<1>(key) = std::exp(tf); // Analytical solution
+    ++it;
+  }
+  grid.ghost_get<0>();
+  auto Init{FD::getV<0>(grid)};   // Initial state
+  auto Sol{FD::getV<1>(grid)};    // Analytical solution
+  auto OdeSol{FD::getV<2>(grid)}; // Numerical solution
+  state_type_1ode x0;
+<0>() = Init;
+  typedef boost::numeric::odeint::controlled_runge_kutta<boost::numeric::odeint::runge_kutta_cash_karp54<state_type_1ode,double,state_type_1ode,double,boost::numeric::odeint::vector_space_algebra_ofp>> stepper_type;
+  integrate_adaptive(stepper_type(),Exponential,x0,t,tf,dt);
+  OdeSol =<0>(); // Numerical solution
+  // Error
+  auto it2{grid.getDomainIterator()};
+  double worst{0.0};
+  while (it2.isNext()) {
+    auto p{it2.get()};
+    if (std::fabs(grid.template get<1>(p) - grid.template get<2>(p)) > worst) {
+      worst = std::fabs(grid.template get<1>(p) - grid.template get<2>(p));
+    }
+    ++it2;
+  }
+  std::cout << worst << std::endl;
+  BOOST_REQUIRE(worst < 1e-6);
+  // Another way
+  boost::numeric::odeint::runge_kutta4<state_type_1ode,double,state_type_1ode,double,boost::numeric::odeint::vector_space_algebra_ofp> rk4;
+<0>() = Init;
+  for (size_t i = 0; i < int(tf/dt); ++i, t += dt) {
+    rk4.do_step(Exponential,x0,t,dt);
+    t += dt;
+  }
+  OdeSol =<0>();
+  // Error
+  auto it3{grid.getDomainIterator()};
+  double worst2{0.0};
+  while (it3.isNext()) {
+    auto p{it3.get()};
+    if (std::fabs(grid.template get<1>(p) - grid.template get<2>(p)) > worst2) {
+      worst2 = fabs(grid.template get<1>(p) - grid.template get<2>(p));
+    }
+    ++it3;
+  }
+  std::cout << worst2 << std::endl;
+  BOOST_REQUIRE(worst2 < 1e-6);
+  // Yet another way
+  //<0>() = Init;
+  // integrate(rk4,Exponential,x0,t,tf,dt);
+  // OdeSol =<0>();
+  // // Error
+  // auto it4{grid.getDomainIterator()};
+  // double worst3{0.0};
+  // while (it4.isNext()) {
+  //   auto p{it4.get()};
+  //   if (std::fabs(grid.template get<1>(p) - grid.template get<2>(p)) > worst3) {
+  //     worst3 = fabs(grid.template get<1>(p) - grid.template get<2>(p));
+  //   }
+  //   ++it4;
+  // }
+  // std::cout << worst3 << std::endl;
+  // BOOST_REQUIRE(worst3 < 1e-6);
+  // BOOST_REQUIRE_EQUAL(worst,worst2);
+  // BOOST_REQUIRE_EQUAL(worst2,worst3);
+// BOOST_AUTO_TEST_CASE(odeint_base_test3) 
+// {
+//     size_t edgeSemiSize = 40;
+//     const size_t sz[2] = {edgeSemiSize,edgeSemiSize };
+//     Box<2, double> box({ 0, 0 }, { 1.0, 1.0 });
+//     size_t bc[2] = {NON_PERIODIC,NON_PERIODIC};
+//     double spacing[2];
+//     spacing[0] = 1.0 / (sz[0] - 1);
+//     spacing[1] = 1.0 / (sz[1] - 1);
+//     double rCut = 3.9 * spacing[0];
+//     Ghost<2, double> ghost(rCut);
+//     BOOST_TEST_MESSAGE("Init vector_dist...");
+//     vector_dist<2, double, aggregate<double, double,double>> Particles(0, box, bc, ghost);
+//     double t=0.0,tf=0.5;
+//     const double dt=0.1;
+//     auto it = Particles.getGridIterator(sz);
+//     while (it.isNext())
+//     {
+//         Particles.add();
+//         auto key = it.get();
+//         mem_id k0 = key.get(0);
+//         double xp0 = k0 * spacing[0];
+//         Particles.getLastPos()[0] = xp0;
+//         mem_id k1 = key.get(1);
+//         double yp0 = k1 * spacing[1];
+//         Particles.getLastPos()[1] = yp0;
+//         Particles.getLastProp<0>() = 1.0/(1.0+exp(-t)); // Carefull in putting the constant, f = A*sigmoid does not respect f' = f*(1.0-f) but f*(1.0-f/A), for simplicity I remove the constant
+//         Particles.getLastProp<1>() = 1.0/(1.0+exp(-tf)); // Carefull in putting the constant, f = A*sigmoid does not respect f' = f*(1.0-f)  but f*(1.0-f/A), for simplicity I remove the constant
+//         ++it;
+//     }
+//     Particles.ghost_get<0>();
+//     auto Init = getV<0>(Particles);
+//     auto Sol = getV<1>(Particles);
+//     auto OdeSol = getV<2>(Particles);
+//     state_type x0;
+//     x0=Init;
+//     // The rhs of x' = f(x)
+//     //size_t steps=boost::numeric::odeint::integrate(sigmoid,x0,0.0,tf,dt);
+//     //typedef boost::numeric::odeint::controlled_runge_kutta< boost::numeric::odeint::runge_kutta_cash_karp54< state_type > > stepper_type;
+//     //integrate_adaptive( stepper_type() , sigmoid , x0 , t , tf , dt);
+//     size_t steps=boost::numeric::odeint::integrate_const( boost::numeric::odeint::runge_kutta4< state_type >(),sigmoid,x0,0.0,tf,dt);
+//     OdeSol=x0;
+//     auto it2 = Particles.getDomainIterator();
+//     double worst = 0.0;
+//     while (it2.isNext()) {
+//         auto p = it2.get();
+//         if (fabs(Particles.getProp<1>(p) - Particles.getProp<2>(p)) > worst) {
+//             worst = fabs(Particles.getProp<1>(p) - Particles.getProp<2>(p));
+//         }
+//         ++it2;
+//     }
+//     BOOST_REQUIRE(worst < 1e-8);
+//     x0=Init;
+//     boost::numeric::odeint::runge_kutta4< state_type > rk4;
+//     for( size_t i=0 ; i<int(tf/dt) ; ++i,t+=dt )
+//     {
+//         rk4.do_step(sigmoid,x0,t,dt);
+//         t+=dt;
+//     }
+//     OdeSol=x0;
+//     auto it3 = Particles.getDomainIterator();
+//     double worst2 = 0.0;
+//     while (it3.isNext()) {
+//         auto p = it3.get();
+//         if (fabs(Particles.getProp<1>(p) - Particles.getProp<2>(p)) > worst2) {
+//             worst2 = fabs(Particles.getProp<1>(p) - Particles.getProp<2>(p));
+//         }
+//         ++it3;
+//     }
+//     //std::cout<<worst2<<std::endl;
+//     BOOST_REQUIRE(worst < 1e-6);
+//     BOOST_REQUIRE_EQUAL(worst,worst2);
+// }
+#ifdef HAVE_EIGEN
+BOOST_AUTO_TEST_CASE(dcpse_op_react_diff_test) {
+  size_t edgeSemiSize{5};
+  const size_t sz[dim] = {2 * edgeSemiSize+1, 2 * edgeSemiSize+1};
+  Box<dim,double> box{{0.0, 0.0},{1.0, 1.0}};
+  periodicity<dim> bc{{PERIODIC,PERIODIC}};
+  double spacing[dim];
+  spacing[0] = 1.0 / (sz[0]);
+  spacing[1] = 1.0 / (sz[1]);
+  Ghost<dim,double> ghost{spacing[0] * 3};
+  BOOST_TEST_MESSAGE("Test: reaction diffusion");
+  BOOST_TEST_MESSAGE("Init grid_dist_id ...");
+  double sigma2 = spacing[0] * spacing[1] / (2 * 4);
+  // properties: u, v, du, dv
+  grid_dist_id<dim,double,aggregate<double,double,double,double,double,double>> domain{sz,box,ghost,bc};
+  auto it{domain.getDomainIterator()};
+  while (it.isNext()) {
+    auto key{it.get()};
+    domain.get<0>(key) = 0.0; // u
+    domain.get<1>(key) = 0.0; // v
+    domain.get<2>(key) = 0.0; // du/dt
+    domain.get<3>(key) = 0.0; // dv/dt
+    auto gkey = it.getGKey(key);
+    if (gkey.get(0)==sz[0] / 2 && gkey.get(1) == sz[1]/2)
+    {
+      domain.get<0>(key) = 1.0;
+      domain.get<1>(key) = 1.0;
+    }
+    ++it;
+  }
+  domain.ghost_get<0>();
+  FD::Derivative<0,2,2,FD::CENTRAL> ddx;
+  FD::Derivative<1,2,2,FD::CENTRAL> ddy;
+  gridGlobal=(void *) & domain;
+  auto u{FD::getV<0>(domain)};
+  auto v{FD::getV<1>(domain)};
+  auto fu{FD::getV<2>(domain)};
+  auto fv{FD::getV<3>(domain)};
+  Fitz<decltype(ddx),decltype(ddy)> system(ddx,ddy);
+  state_type_2ode x0;
+<0>() = u;
+<1>() = v;
+  double dt{0.001};
+  double t{0.0};
+  double tf{10.5};
+  //typedef boost::numeric::odeint::controlled_runge_kutta< boost::numeric::odeint::runge_kutta_cash_karp54< state_type_2d_ofp,double,state_type_2d_ofp,double,boost::numeric::odeint::vector_space_algebra_ofp>> stepper_type;
+  typedef boost::numeric::odeint::runge_kutta4<state_type_2ode,double,state_type_2ode,double,boost::numeric::odeint::vector_space_algebra_ofp> stepper_type;
+  integrate_adaptive(stepper_type(),system,x0,t,tf,dt);
+  fu =<0>();
+  fv =<1>();
+  domain.ghost_get<2,3>();
+  u = ddx(fu) + ddy(fu);
+  v = ddx(fv) + ddy(fv);
+  auto it2{domain.getDomainIterator()};
+  if (create_vcluster().rank() == 0)
+    ++it2;
+  while (it2.isNext()) {
+    auto p{it2.get()};
+    BOOST_REQUIRE_CLOSE(domain.get<0>(p),-1.0,1);   
+    ++it2;
+  }
\ No newline at end of file
diff --git a/src/OdeIntegrators/tests/OdeIntegratores_base_tests.cpp b/src/OdeIntegrators/tests/OdeIntegratores_base_tests.cpp
index 9ff837a3..af12bcc6 100644
--- a/src/OdeIntegrators/tests/OdeIntegratores_base_tests.cpp
+++ b/src/OdeIntegrators/tests/OdeIntegratores_base_tests.cpp
@@ -15,8 +15,10 @@
 #include "Vector/vector_dist_subset.hpp"
 #include "Decomposition/Distribution/SpaceDistribution.hpp"
 #include "OdeIntegrators/OdeIntegrators.hpp"
+#ifdef HAVE_EIGEN
 #include "DCPSE/DCPSE_op/DCPSE_op.hpp"
-#include "OdeIntegrators/boost_vector_algebra_ofp.hpp"
+#include "OdeIntegrators/vector_algebra_ofp.hpp"
 typedef texp_v<double> state_type;
 const double a = 2.8e-4;
@@ -93,7 +95,7 @@ void sigmoid( const state_type &x , state_type &dxdt , const double t )
         size_t edgeSemiSize = 40;
         const size_t sz[2] = {edgeSemiSize,edgeSemiSize };
@@ -180,7 +182,7 @@ BOOST_AUTO_TEST_CASE(odeint_base_test1)
     size_t edgeSemiSize = 40;
     const size_t sz[2] = {edgeSemiSize,edgeSemiSize };
@@ -293,7 +295,7 @@ BOOST_AUTO_TEST_CASE(odeint_base_test_STRUCT_ofp)
     size_t edgeSemiSize = 40;
     const size_t sz[2] = {edgeSemiSize,edgeSemiSize };
@@ -381,7 +383,7 @@ BOOST_AUTO_TEST_CASE(odeint_base_test2)
     BOOST_REQUIRE(worst2 < 1e-6);
     size_t edgeSemiSize = 40;
     const size_t sz[2] = {edgeSemiSize,edgeSemiSize };
@@ -464,9 +466,7 @@ BOOST_AUTO_TEST_CASE(odeint_base_test3)
 #ifdef HAVE_EIGEN
 BOOST_AUTO_TEST_CASE(dcpse_op_react_diff_test) {
         size_t edgeSemiSize = 5;
         const size_t sz[2] = {2 * edgeSemiSize+1, 2 * edgeSemiSize+1};
@@ -495,7 +495,7 @@ BOOST_AUTO_TEST_CASE(dcpse_op_react_diff_test) {
         size_t pointId = 0;
         size_t counter = 0;
         double minNormOne = 999;
-        while (it.isNext()) 
+        while (it.isNext())
             auto key = it.get();
@@ -558,7 +558,7 @@ BOOST_AUTO_TEST_CASE(dcpse_op_react_diff_test) {
         if (create_vcluster().rank() == 0)
-        while (it2.isNext()) 
+        while (it2.isNext())
             auto p = it2.get();
@@ -568,5 +568,4 @@ BOOST_AUTO_TEST_CASE(dcpse_op_react_diff_test) {
diff --git a/src/OdeIntegrators/tests/ b/src/OdeIntegrators/tests/
new file mode 100644
index 00000000..a1b2771d
--- /dev/null
+++ b/src/OdeIntegrators/tests/
@@ -0,0 +1,104 @@
+// Created by abhinav on 2/28/23.
+#include "config.h"
+#include <type_traits>
+#include <cstring>
+#include "util/common.hpp"
+#include "util/util_debug.hpp"
+#include <boost/test/unit_test.hpp>
+#include <iostream>
+#include "Operators/Vector/vector_dist_operators.hpp"
+#include "OdeIntegrators/OdeIntegrators.hpp"
+//#include "DCPSE/DCPSE_op/DCPSE_op.hpp"
+#ifdef __NVCC__
+typedef state_type_1d_ofp_gpu state_type;
+//const double a = 2.8e-4;
+//const double b = 5e-3;
+//const double tau = .1;
+//const double k = .005;
+void ExponentialGPU( const state_type &x , state_type &dxdt , const double t )
+<0>() =<0>();
+    //<0>().getVector().deviceToHost<0>();
+    //<0>().getVector().deviceToHost<0>();
+        {
+        size_t edgeSemiSize = 512;
+        const size_t sz[2] = {edgeSemiSize,edgeSemiSize };
+        Box<2, double> box({ 0, 0 }, { 1.0, 1.0 });
+        size_t bc[2] = { NON_PERIODIC, NON_PERIODIC };
+        double spacing[2];
+        spacing[0] = 1.0 / (sz[0] - 1);
+        spacing[1] = 1.0 / (sz[1] - 1);
+        double rCut = 3.9 * spacing[0];
+        Ghost<2, double> ghost(rCut);
+        BOOST_TEST_MESSAGE("Init vector_dist...");
+        vector_dist_gpu<2, double, aggregate<double, double,double>> Particles(0, box, bc, ghost);
+        auto it = Particles.getGridIterator(sz);
+        while (it.isNext())
+        {
+            Particles.add();
+            auto key = it.get();
+            mem_id k0 = key.get(0);
+            double xp0 = k0 * spacing[0];
+            Particles.getLastPos()[0] = xp0;
+            mem_id k1 = key.get(1);
+            double yp0 = k1 * spacing[1];
+            Particles.getLastPos()[1] = yp0;
+            Particles.getLastProp<0>() = xp0*yp0*exp(-5);
+            Particles.getLastProp<1>() = xp0*yp0*exp(5);
+            ++it;
+        }
+        Particles.ghost_get<0>();
+        Particles.hostToDeviceProp<0,1,2>();
+        auto Init = getV<0,comp_dev>(Particles);
+        auto Sol = getV<1,comp_dev>(Particles);
+        auto OdeSol = getV<2,comp_dev>(Particles);
+        state_type x0;
+        // The rhs of x' = f(x)
+        double t0=-5,tf=5;
+        const double dt=0.01;
+        //This doesnt work Why?
+        //size_t steps=boost::numeric::odeint::integrate(Exponential,x0,0.0,tf,dt);
+        timer tt;
+        tt.start();
+        size_t steps=boost::numeric::odeint::integrate_const( boost::numeric::odeint::runge_kutta4< state_type, double, state_type, double, boost::numeric::odeint::vector_space_algebra_ofp_gpu,boost::numeric::odeint::ofp_operations>(),ExponentialGPU,x0,t0,tf,dt);
+        tt.stop();
+        Particles.deviceToHostProp<0,1,2>();
+        auto it2 = Particles.getDomainIterator();
+        double worst = 0.0;
+        while (it2.isNext()) {
+            auto p = it2.get();
+            if (fabs(Particles.getProp<1>(p) - Particles.getProp<2>(p)) > worst) {
+                worst = fabs(Particles.getProp<1>(p) - Particles.getProp<2>(p));
+            }
+            ++it2;
+        }
+        std::cout<<"WCT:"<<tt.getwct()<<std::endl;
+        std::cout<<"CPU:"<<tt.getcputime()<<std::endl;
+        std::cout<<worst<<std::endl;
+        BOOST_REQUIRE(worst < 1e-6);
+        }
\ No newline at end of file
diff --git a/src/OdeIntegrators/boost_vector_algebra_ofp.hpp b/src/OdeIntegrators/vector_algebra_ofp.hpp
similarity index 86%
rename from src/OdeIntegrators/boost_vector_algebra_ofp.hpp
rename to src/OdeIntegrators/vector_algebra_ofp.hpp
index 26447bea..5c01e81c 100644
--- a/src/OdeIntegrators/boost_vector_algebra_ofp.hpp
+++ b/src/OdeIntegrators/vector_algebra_ofp.hpp
@@ -2,126 +2,40 @@
 // Created by Abhinav Singh on 18.02.21.
 namespace boost {
     namespace numeric {
         namespace odeint {
- * This class template has to be overload in order to call vector_space_algebra::norm_inf
- */
- //           template< class State, class Enabler = void > struct vector_space_norm_inf;
- * Example: instantiation for sole doubles and complex
- */
-/*            template<>
-            struct vector_space_norm_inf< double >
+            /* It copy one element of the chunk for each property
+        *
+        */
+            template<typename vector_type,typename index_type,typename op_type>
+            struct for_each_prop1
-                typedef double result_type;
-                double operator()( double x ) const
-                {
-                    using std::abs;
-                    return abs(x);
-                }
-            };
-            template<>
-            struct vector_space_norm_inf< float >
-            {
-                typedef float result_type;
-                result_type operator()( float x ) const
+                vector_type &v;
+                index_type &p;
+                op_type &op;
+                /*! \brief constructor
+                 *
+                 *
+                 * \param src source encapsulated object
+                 * \param dst destination encapsulated object
+                 *
+                 */
+                __device__ __host__ inline for_each_prop1(vector_type &v,index_type &p,op_type &op)
+                        :v(v),p(p),op(op)
+                {};
+                //! It call the copy function for each property
+                template<typename T>
+                __device__ __host__ inline void operator()(T& t) const
-                    using std::abs;
-                    return abs(x);
-                }
-            };
-            template< typename T >
-            struct vector_space_norm_inf< std::complex<T> >
-        {
-            typedef T result_type;
-            result_type operator()( std::complex<T> x ) const
-            {
-                using std::abs;
-                return abs( x );
-            }
-        };*/
-        template<typename S1,typename S2>
-        struct for_each_prop_resize{
-            S1 &v1;
-            S2 &v2;
-            /*! \brief constructor
-             *
-             *
-             * \param src source encapsulated object
-             * \param dst destination encapsulated object
-             *
-             */
-            inline for_each_prop_resize(S1 &v1,S2 &v2)
-            :v1(v1),v2(v2)
-            {};
-            //! It call the copy function for each property
-            template<typename T>
-            inline void operator()(T& t) const
-            {
-       get<T::value>().getVector().resize( get<T::value>().getVector().size());
-            }
-        };
-        /* It copy one element of the chunk for each property
-         *
-         */
-        template<typename vector_type,typename index_type,typename op_type>
-        struct for_each_prop1
-        {
-            vector_type &v;
-            index_type &p;
-            op_type &op;
-            /*! \brief constructor
-             *
-             *
-             * \param src source encapsulated object
-             * \param dst destination encapsulated object
-             *
-             */
-            inline for_each_prop1(vector_type &v,index_type &p,op_type &op)
-            :v(v),p(p),op(op)
-            {};
-            //! It call the copy function for each property
-            template<typename T>
-            inline void operator()(T& t) const
-            {
-                op( get<T::value>().getVector().template get<0>(p));
-            }
-        };
-        struct vector_space_algebra_ofp
-        {
-            template< class S1 , class Op >
-            static void for_each1( S1 &s1 , Op op )
-            {
-                // ToDo : build checks, that the +-*/ operators are well defined
-                auto get<0>().getVector().getIterator();
-                while(it.isNext()){
-                    auto p=it.get();
-                    //converting to boost vector ids.
-                    for_each_prop1<S1,size_t,Op> cp(s1,p,op);
-                    //creating an iterator on v_ids[0] [1] [2]
-                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
-                    ++it;
+                    op( get<T::value>().getVector().template get<0>(p));
-            }
+            };
             template<typename S1,typename S2,typename index_type,typename op_type>
             struct for_each_prop2
@@ -137,36 +51,16 @@ namespace boost {
                  * \param dst destination encapsulated object
-                inline for_each_prop2(S1 &v1,S2 &v2,index_type &p,op_type &op)
-                :v1(v1),v2(v2),p(p),op(op)
+                __device__ __host__ inline for_each_prop2(S1 &v1,S2 &v2,index_type &p,op_type &op)
+                        :v1(v1),v2(v2),p(p),op(op)
                 //! It call the copy function for each property
                 template<typename T>
-                inline void operator()(T& t) const
+                __device__ __host__ inline void operator()(T& t) const
                     op( get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p));
-            template< class S1 , class S2 , class Op >
-            static void for_each2( S1 &s1 , S2 &s2 , Op op )
-            {
-                for_each_prop_resize<S1,S2> the_resize(s1,s2);
-                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
-                // get<0>().getVector().resize( get<0>().getVector().size());
-                // ToDo : build checks, that the +-*/ operators are well defined
-                auto get<0>().getVector().getIterator();
-                while(it.isNext()){
-                    auto p=it.get();
-                    //converting to boost vector ids.
-                    for_each_prop2<S1,S2,size_t,Op> cp(s1,s2,p,op);
-                    //creating an iterator on v_ids[0] [1] [2]
-                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
-                    ++it;
-                }
-            }
             template<typename S1,typename S2,typename S3,typename index_type,typename op_type>
             struct for_each_prop3
@@ -184,40 +78,21 @@ namespace boost {
                  * \param dst destination encapsulated object
-                inline for_each_prop3(S1 &v1,S2 &v2,S3 &v3,index_type &p,op_type &op)
-                :v1(v1),v2(v2),v3(v3),p(p),op(op)
+                __device__ __host__ inline for_each_prop3(S1 &v1,S2 &v2,S3 &v3,index_type &p,op_type &op)
+                        :v1(v1),v2(v2),v3(v3),p(p),op(op)
                 //! It call the copy function for each property
                 template<typename T>
-                inline void operator()(T& t) const
+                __device__ __host__ inline void operator()(T& t) const
                     //std::cout<< get<T::value>().getVector().size()<<":"<< get<T::value>().getVector().size()<<":"<< get<T::value>().getVector().size()<<std::endl;
+                    //printf("v2:%f,v3:%f \n", get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p));
+                    //printf("2\n");
                     op( get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p));
-                }
-            };
-            template< class S1 , class S2 , class S3 , class Op >
-            static void for_each3( S1 &s1 , S2 &s2 , S3 &s3 , Op op )
-            {
-                for_each_prop_resize<S1,S2> the_resize(s1,s2);
-                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
-                // ToDo : build checks, that the +-*/ operators are well defined
-                auto get<0>().getVector().getIterator();
-                while(it.isNext()){
-                    auto p=it.get();
-                    //converting to boost vector ids.
-                    for_each_prop3<S1,S2,S3,size_t,Op> cp(s1,s2,s3,p,op);
-                    //creating an iterator on v_ids[0] [1] [2]
-                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+                    //printf("v1:%f, v2:%f,v3:%f \n", get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p));
-                    ++it;
-            }
+            };
             template<typename S1,typename S2,typename S3,typename S4,typename index_type,typename op_type>
             struct for_each_prop4
@@ -236,36 +111,16 @@ namespace boost {
                  * \param dst destination encapsulated object
-                inline for_each_prop4(S1 &v1,S2 &v2,S3 &v3,S4 &v4,index_type &p,op_type &op)
-                :v1(v1),v2(v2),v3(v3),v4(v4),p(p),op(op)
+                __device__ __host__ inline for_each_prop4(S1 &v1,S2 &v2,S3 &v3,S4 &v4,index_type &p,op_type &op)
+                        :v1(v1),v2(v2),v3(v3),v4(v4),p(p),op(op)
                 //! It call the copy function for each property
                 template<typename T>
-                inline void operator()(T& t) const
+                __device__ __host__ inline void operator()(T& t) const
                     op( get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p));
-            template< class S1 , class S2 , class S3 , class S4 , class Op >
-            static void for_each4( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4 , Op op )
-            {
-                for_each_prop_resize<S1,S2> the_resize(s1,s2);
-                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
-                // ToDo : build checks, that the +-*/ operators are well defined
-                auto get<0>().getVector().getIterator();
-                while(it.isNext()){
-                    auto p=it.get();
-                    //converting to boost vector ids.
-                    for_each_prop4<S1,S2,S3,S4,size_t,Op> cp(s1,s2,s3,s4,p,op);
-                    //creating an iterator on v_ids[0] [1] [2]
-                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
-                    ++it;
-                }
-            }
             template<typename S1,typename S2,typename S3,typename S4,typename S5,typename index_type,typename op_type>
             struct for_each_prop5
@@ -285,36 +140,17 @@ namespace boost {
                  * \param dst destination encapsulated object
-                inline for_each_prop5(S1 &v1,S2 &v2,S3 &v3,S4 &v4,S5 &v5,index_type &p,op_type &op)
+                __device__ __host__ inline for_each_prop5(S1 &v1,S2 &v2,S3 &v3,S4 &v4,S5 &v5,index_type &p,op_type &op)
                 //! It call the copy function for each property
                 template<typename T>
-                inline void operator()(T& t) const
+                __device__ __host__ inline void operator()(T& t) const
                     op( get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p));
-            template< class S1 , class S2 , class S3 , class S4,class S5 , class Op >
-            static void for_each5( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5 , Op op )
-            {
-                for_each_prop_resize<S1,S2> the_resize(s1,s2);
-                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
-                // ToDo : build checks, that the +-*/ operators are well defined
-                auto get<0>().getVector().getIterator();
-                while(it.isNext()){
-                    auto p=it.get();
-                    //converting to boost vector ids.
-                    for_each_prop5<S1,S2,S3,S4,S5,size_t,Op> cp(s1,s2,s3,s4,s5,p,op);
-                    //creating an iterator on v_ids[0] [1] [2]
-                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
-                    ++it;
-                }
-            }
             template<typename S1,typename S2,typename S3,typename S4,typename S5,typename S6,typename index_type,typename op_type>
             struct for_each_prop6
@@ -336,34 +172,16 @@ namespace boost {
                  * \param dst destination encapsulated object
-                inline for_each_prop6(S1 &v1,S2 &v2,S3 &v3,S4 &v4,S5 &v5,S6 &v6,index_type &p,op_type &op)
+                __device__ __host__ inline for_each_prop6(S1 &v1,S2 &v2,S3 &v3,S4 &v4,S5 &v5,S6 &v6,index_type &p,op_type &op)
                 //! It call the copy function for each property
                 template<typename T>
-                inline void operator()(T& t) const
+                __device__ __host__ inline void operator()(T& t) const
                     op( get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p));
-            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 , class Op >
-            static void for_each6( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6 , Op op )
-            {
-                for_each_prop_resize<S1,S2> the_resize(s1,s2);
-                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
-                // ToDo : build checks, that the +-*/ operators are well defined
-                auto get<0>().getVector().getIterator();
-                while(it.isNext()){
-                    auto p=it.get();
-                    //converting to boost vector ids.
-                    for_each_prop6<S1,S2,S3,S4,S5,S6,size_t,Op> cp(s1,s2,s3,s4,s5,s6,p,op);
-                    //creating an iterator on v_ids[0] [1] [2]
-                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
-                    ++it;
-                }
-            }
             template<typename S1,typename S2,typename S3,typename S4,typename S5,typename S6,typename S7,typename index_type,typename op_type>
@@ -388,36 +206,17 @@ namespace boost {
                  * \param dst destination encapsulated object
-                inline for_each_prop7(S1 &v1,S2 &v2,S3 &v3,S4 &v4,S5 &v5,S6 &v6,S7 &v7,index_type &p,op_type &op)
+                __device__ __host__ inline for_each_prop7(S1 &v1,S2 &v2,S3 &v3,S4 &v4,S5 &v5,S6 &v6,S7 &v7,index_type &p,op_type &op)
                 //! It call the copy function for each property
                 template<typename T>
-                inline void operator()(T& t) const
+                __device__ __host__ inline void operator()(T& t) const
                     op( get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p));
-            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 ,class S7, class Op >
-            static void for_each7( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6,S7 &s7 , Op op )
-            {
-                for_each_prop_resize<S1,S2> the_resize(s1,s2);
-                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
-                // ToDo : build checks, that the +-*/ operators are well defined
-                auto get<0>().getVector().getIterator();
-                while(it.isNext()){
-                    auto p=it.get();
-                    //converting to boost vector ids.
-                    for_each_prop7<S1,S2,S3,S4,S5,S6,S7,size_t,Op> cp(s1,s2,s3,s4,s5,s6,s7,p,op);
-                    //creating an iterator on v_ids[0] [1] [2]
-                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
-                    ++it;
-                }
-            }
             template<typename S1,typename S2,typename S3,typename S4,typename S5,typename S6,typename S7,typename S8,typename index_type,typename op_type>
             struct for_each_prop8
@@ -441,36 +240,17 @@ namespace boost {
                  * \param dst destination encapsulated object
-                inline for_each_prop8(S1 &v1,S2 &v2,S3 &v3,S4 &v4,S5 &v5,S6 &v6,S7 &v7,S8 &v8,index_type &p,op_type &op)
+                __device__ __host__  inline for_each_prop8(S1 &v1,S2 &v2,S3 &v3,S4 &v4,S5 &v5,S6 &v6,S7 &v7,S8 &v8,index_type &p,op_type &op)
                 //! It call the copy function for each property
                 template<typename T>
-                inline void operator()(T& t) const
+                __device__ __host__ inline void operator()(T& t) const
                     op( get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p));
-            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 ,class S7,class S8, class Op >
-            static void for_each8( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6,S7 &s7,S8 &s8 , Op op )
-            {
-                for_each_prop_resize<S1,S2> the_resize(s1,s2);
-                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
-                // ToDo : build checks, that the +-*/ operators are well defined
-                auto get<0>().getVector().getIterator();
-                while(it.isNext()){
-                    auto p=it.get();
-                    //converting to boost vector ids.
-                    for_each_prop8<S1,S2,S3,S4,S5,S6,S7,S8,size_t,Op> cp(s1,s2,s3,s4,s5,s6,s7,s8,p,op);
-                    //creating an iterator on v_ids[0] [1] [2]
-                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
-                    ++it;
-                }
-            }
             template<typename S1,typename S2,typename S3,typename S4,typename S5,typename S6,typename S7,typename S8, typename S9,typename index_type,typename op_type>
             struct for_each_prop9
@@ -495,38 +275,19 @@ namespace boost {
                  * \param dst destination encapsulated object
-                inline for_each_prop9(S1 &v1,S2 &v2,S3 &v3,S4 &v4,S5 &v5,S6 &v6,S7 &v7,S8 &v8,S9 &v9,index_type &p,op_type &op)
+                __device__ __host__ inline for_each_prop9(S1 &v1,S2 &v2,S3 &v3,S4 &v4,S5 &v5,S6 &v6,S7 &v7,S8 &v8,S9 &v9,index_type &p,op_type &op)
                 //! It call the copy function for each property
                 template<typename T>
-                inline void operator()(T& t) const
+                __device__ __host__ inline void operator()(T& t) const
                     op( get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p));
-            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 ,class S7,class S8, class S9, class Op >
-            static void for_each9( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6,S7 &s7,S8 &s8, S9 &s9 , Op op )
-            {
-                for_each_prop_resize<S1,S2> the_resize(s1,s2);
-                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
-                // ToDo : build checks, that the +-*/ operators are well defined
-                auto get<0>().getVector().getIterator();
-                while(it.isNext()){
-                    auto p=it.get();
-                    //converting to boost vector ids.
-                    for_each_prop9<S1,S2,S3,S4,S5,S6,S7,S8,S9,size_t,Op> cp(s1,s2,s3,s4,s5,s6,s7,s8,s9,p,op);
-                    //creating an iterator on v_ids[0] [1] [2]
-                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
-                    ++it;
-                }
-            }
-            template<typename S1,typename S2,typename S3,typename S4,typename S5,typename S6,typename S7,typename S8, typename S9, typename S10,typename index_type,typename op_type>
-            struct for_each_prop10
+            template<typename S1,typename S2,typename S3,typename S4,typename S5,typename S6,typename S7,typename S8, typename S9, typename S10,typename index_type,typename op_type>
+            struct for_each_prop10
                 S1 &v1;
@@ -550,36 +311,17 @@ namespace boost {
                  * \param dst destination encapsulated object
-                inline for_each_prop10(S1 &v1,S2 &v2,S3 &v3,S4 &v4,S5 &v5,S6 &v6,S7 &v7,S8 &v8,S9 &v9,S10 &v10,index_type &p,op_type &op)
+                __device__ __host__  inline for_each_prop10(S1 &v1,S2 &v2,S3 &v3,S4 &v4,S5 &v5,S6 &v6,S7 &v7,S8 &v8,S9 &v9,S10 &v10,index_type &p,op_type &op)
                 //! It call the copy function for each property
                 template<typename T>
-                inline void operator()(T& t) const
+                __device__ __host__ inline void operator()(T& t) const
                     op( get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p));
-            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 ,class S7,class S8, class S9, class S10, class Op >
-            static void for_each10( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6,S7 &s7,S8 &s8, S9 &s9 , S10 &s10, Op op )
-            {
-                for_each_prop_resize<S1,S2> the_resize(s1,s2);
-                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
-                // ToDo : build checks, that the +-*/ operators are well defined
-                auto get<0>().getVector().getIterator();
-                while(it.isNext()){
-                    auto p=it.get();
-                    //converting to boost vector ids.
-                    for_each_prop10<S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,size_t,Op> cp(s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,p,op);
-                    //creating an iterator on v_ids[0] [1] [2]
-                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
-                    ++it;
-                }
-            }
             template<typename S1,typename S2,typename S3,typename S4,typename S5,typename S6,typename S7,typename S8, typename S9, typename S10, typename S11,typename index_type,typename op_type>
             struct for_each_prop11
@@ -606,36 +348,18 @@ namespace boost {
                  * \param dst destination encapsulated object
-                inline for_each_prop11(S1 &v1,S2 &v2,S3 &v3,S4 &v4,S5 &v5,S6 &v6,S7 &v7,S8 &v8,S9 &v9,S10 &v10, S11 &v11,index_type &p,op_type &op)
+                __device__ __host__ inline for_each_prop11(S1 &v1,S2 &v2,S3 &v3,S4 &v4,S5 &v5,S6 &v6,S7 &v7,S8 &v8,S9 &v9,S10 &v10, S11 &v11,index_type &p,op_type &op)
                 //! It call the copy function for each property
                 template<typename T>
-                inline void operator()(T& t) const
+                __device__ __host__ inline void operator()(T& t) const
                     op( get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p));
-            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 ,class S7,class S8, class S9, class S10, class S11, class Op >
-            static void for_each11( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6,S7 &s7,S8 &s8, S9 &s9 , S10 &s10,S11 &s11, Op op )
-            {
-                for_each_prop_resize<S1,S2> the_resize(s1,s2);
-                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
-                // ToDo : build checks, that the +-*/ operators are well defined
-                auto get<0>().getVector().getIterator();
-                while(it.isNext()){
-                    auto p=it.get();
-                    //converting to boost vector ids.
-                    for_each_prop11<S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,S11,size_t,Op> cp(s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,p,op);
-                    //creating an iterator on v_ids[0] [1] [2]
-                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
-                    ++it;
-                }
-            }
             template<typename S1,typename S2,typename S3,typename S4,typename S5,typename S6,typename S7,typename S8, typename S9, typename S10, typename S11, typename S12,typename index_type,typename op_type>
             struct for_each_prop12
@@ -663,36 +387,18 @@ namespace boost {
                  * \param dst destination encapsulated object
-                inline for_each_prop12(S1 &v1,S2 &v2,S3 &v3,S4 &v4,S5 &v5,S6 &v6,S7 &v7,S8 &v8,S9 &v9,S10 &v10, S11 &v11,S12 &v12,index_type &p,op_type &op)
+                __device__ __host__ inline for_each_prop12(S1 &v1,S2 &v2,S3 &v3,S4 &v4,S5 &v5,S6 &v6,S7 &v7,S8 &v8,S9 &v9,S10 &v10, S11 &v11,S12 &v12,index_type &p,op_type &op)
                         :v1(v1),v2(v2),v3(v3),v4(v4),v5(v5),v6(v6),v7(v7),v8(v8),v9(v9),v10(v10),v11(v11), v12(v12),p(p),op(op)
                 //! It call the copy function for each property
                 template<typename T>
-                inline void operator()(T& t) const
+                __device__ __host__ inline void operator()(T& t) const
                     op( get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p));
-            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 ,class S7,class S8, class S9, class S10, class S11, class S12, class Op >
-            static void for_each12( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6,S7 &s7,S8 &s8, S9 &s9 , S10 &s10,S11 &s11,S12 &s12, Op op )
-            {
-                for_each_prop_resize<S1,S2> the_resize(s1,s2);
-                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
-                // ToDo : build checks, that the +-*/ operators are well defined
-                auto get<0>().getVector().getIterator();
-                while(it.isNext()){
-                    auto p=it.get();
-                    //converting to boost vector ids.
-                    for_each_prop12<S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,S11,S12,size_t,Op> cp(s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,p,op);
-                    //creating an iterator on v_ids[0] [1] [2]
-                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
-                    ++it;
-                }
-            }
             template<typename S1,typename S2,typename S3,typename S4,typename S5,typename S6,typename S7,typename S8, typename S9, typename S10, typename S11, typename S12, typename S13,typename index_type,typename op_type>
             struct for_each_prop13
@@ -721,36 +427,18 @@ namespace boost {
                  * \param dst destination encapsulated object
-                inline for_each_prop13(S1 &v1,S2 &v2,S3 &v3,S4 &v4,S5 &v5,S6 &v6,S7 &v7,S8 &v8,S9 &v9,S10 &v10, S11 &v11,S12 &v12,S13 &v13,index_type &p,op_type &op)
+                __device__ __host__ inline for_each_prop13(S1 &v1,S2 &v2,S3 &v3,S4 &v4,S5 &v5,S6 &v6,S7 &v7,S8 &v8,S9 &v9,S10 &v10, S11 &v11,S12 &v12,S13 &v13,index_type &p,op_type &op)
                         :v1(v1),v2(v2),v3(v3),v4(v4),v5(v5),v6(v6),v7(v7),v8(v8),v9(v9),v10(v10),v11(v11), v12(v12),v13(v13),p(p),op(op)
                 //! It call the copy function for each property
                 template<typename T>
-                inline void operator()(T& t) const
+                __device__ __host__ inline void operator()(T& t) const
                     op( get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p));
-            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 ,class S7,class S8, class S9, class S10, class S11, class S12, class S13, class Op >
-            static void for_each13( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6,S7 &s7,S8 &s8, S9 &s9 , S10 &s10,S11 &s11,S12 &s12,S13 &s13, Op op )
-            {
-                for_each_prop_resize<S1,S2> the_resize(s1,s2);
-                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
-                // ToDo : build checks, that the +-*/ operators are well defined
-                auto get<0>().getVector().getIterator();
-                while(it.isNext()){
-                    auto p=it.get();
-                    //converting to boost vector ids.
-                    for_each_prop13<S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,S11,S12,S13,size_t,Op> cp(s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,p,op);
-                    //creating an iterator on v_ids[0] [1] [2]
-                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
-                    ++it;
-                }
-            }
             template<typename S1,typename S2,typename S3,typename S4,typename S5,typename S6,typename S7,typename S8, typename S9, typename S10, typename S11, typename S12, typename S13, typename S14,typename index_type,typename op_type>
             struct for_each_prop14
@@ -780,36 +468,18 @@ namespace boost {
                  * \param dst destination encapsulated object
-                inline for_each_prop14(S1 &v1,S2 &v2,S3 &v3,S4 &v4,S5 &v5,S6 &v6,S7 &v7,S8 &v8,S9 &v9,S10 &v10, S11 &v11,S12 &v12,S13 &v13,S14 &v14,index_type &p,op_type &op)
+                __device__ __host__ inline for_each_prop14(S1 &v1,S2 &v2,S3 &v3,S4 &v4,S5 &v5,S6 &v6,S7 &v7,S8 &v8,S9 &v9,S10 &v10, S11 &v11,S12 &v12,S13 &v13,S14 &v14,index_type &p,op_type &op)
                         :v1(v1),v2(v2),v3(v3),v4(v4),v5(v5),v6(v6),v7(v7),v8(v8),v9(v9),v10(v10),v11(v11), v12(v12),v13(v13),v14(v14),p(p),op(op)
                 //! It call the copy function for each property
                 template<typename T>
-                inline void operator()(T& t) const
+                __device__ __host__ inline void operator()(T& t) const
                     op( get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p));
-            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 ,class S7,class S8, class S9, class S10, class S11, class S12, class S13, class S14, class Op >
-            static void for_each14( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6,S7 &s7,S8 &s8, S9 &s9, S10 &s10,S11 &s11,S12 &s12,S13 &s13,S14 &s14, Op op )
-            {
-                for_each_prop_resize<S1,S2> the_resize(s1,s2);
-                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
-                // ToDo : build checks, that the +-*/ operators are well defined
-                auto get<0>().getVector().getIterator();
-                while(it.isNext()){
-                    auto p=it.get();
-                    //converting to boost vector ids.
-                    for_each_prop14<S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,S11,S12,S13,S14,size_t,Op> cp(s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,p,op);
-                    //creating an iterator on v_ids[0] [1] [2]
-                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
-                    ++it;
-                }
-            }
             template<typename S1,typename S2,typename S3,typename S4,typename S5,typename S6,typename S7,typename S8, typename S9, typename S10, typename S11, typename S12, typename S13, typename S14, typename S15,typename index_type,typename op_type>
             struct for_each_prop15
@@ -840,17 +510,349 @@ namespace boost {
                  * \param dst destination encapsulated object
-                inline for_each_prop15(S1 &v1,S2 &v2,S3 &v3,S4 &v4,S5 &v5,S6 &v6,S7 &v7,S8 &v8,S9 &v9,S10 &v10, S11 &v11,S12 &v12,S13 &v13,S14 &v14,S15 &v15,index_type &p,op_type &op)
+                __device__ __host__ inline for_each_prop15(S1 &v1,S2 &v2,S3 &v3,S4 &v4,S5 &v5,S6 &v6,S7 &v7,S8 &v8,S9 &v9,S10 &v10, S11 &v11,S12 &v12,S13 &v13,S14 &v14,S15 &v15,index_type &p,op_type &op)
                         :v1(v1),v2(v2),v3(v3),v4(v4),v5(v5),v6(v6),v7(v7),v8(v8),v9(v9),v10(v10),v11(v11), v12(v12),v13(v13),v14(v14),v15(v15),p(p),op(op)
                 //! It call the copy function for each property
                 template<typename T>
-                inline void operator()(T& t) const
+                __device__ __host__ inline void operator()(T& t) const
                     op( get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p), get<T::value>().getVector().template get<0>(p));
+ * This class template has to be overload in order to call vector_space_algebra::norm_inf
+ */
+ //           template< class State, class Enabler = void > struct vector_space_norm_inf;
+ * Example: instantiation for sole doubles and complex
+ */
+/*            template<>
+            struct vector_space_norm_inf< double >
+            {
+                typedef double result_type;
+                double operator()( double x ) const
+                {
+                    using std::abs;
+                    return abs(x);
+                }
+            };
+            template<>
+            struct vector_space_norm_inf< float >
+            {
+                typedef float result_type;
+                result_type operator()( float x ) const
+                {
+                    using std::abs;
+                    return abs(x);
+                }
+            };
+            template< typename T >
+            struct vector_space_norm_inf< std::complex<T> >
+        {
+            typedef T result_type;
+            result_type operator()( std::complex<T> x ) const
+            {
+                using std::abs;
+                return abs( x );
+            }
+        };*/
+        template<typename S1,typename S2>
+        struct for_each_prop_resize{
+            S1 &v1;
+            S2 &v2;
+            /*! \brief constructor
+             *
+             *
+             * \param src source encapsulated object
+             * \param dst destination encapsulated object
+             *
+             */
+            inline for_each_prop_resize(S1 &v1,S2 &v2)
+            :v1(v1),v2(v2)
+            {};
+            //! It call the copy function for each property
+            template<typename T>
+            inline void operator()(T& t) const
+            {
+       get<T::value>().getVector().resize( get<T::value>().getVector().size());
+            }
+        };
+        struct vector_space_algebra_ofp
+        {
+            template< class S1 , class Op >
+            static void for_each1( S1 &s1 , Op op )
+            {
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getIterator();
+                while(it.isNext()){
+                    auto p=it.get();
+                    //converting to boost vector ids.
+                    for_each_prop1<S1,typename S1::index_type,Op> cp(s1,p,op);
+                    //creating an iterator on v_ids[0] [1] [2]
+                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+                    ++it;
+                }
+            }
+            template< class S1 , class S2 , class Op >
+            static void for_each2( S1 &s1 , S2 &s2 , Op op )
+            {
+                for_each_prop_resize<S1,S2> the_resize(s1,s2);
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
+                // get<0>().getVector().resize( get<0>().getVector().size());
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getIterator();
+                while(it.isNext()){
+                    auto p=it.get();
+                    //converting to boost vector ids.
+                    for_each_prop2<S1,S2,typename S1::index_type,Op> cp(s1,s2,p,op);
+                    //creating an iterator on v_ids[0] [1] [2]
+                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+                    ++it;
+                }
+            }
+            template< class S1 , class S2 , class S3 , class Op >
+            static void for_each3( S1 &s1 , S2 &s2 , S3 &s3 , Op op )
+            {
+                for_each_prop_resize<S1,S2> the_resize(s1,s2);
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getIterator();
+                while(it.isNext()){
+                    auto p=it.get();
+                    //converting to boost vector ids.
+                    for_each_prop3<S1,S2,S3,typename S1::index_type,Op> cp(s1,s2,s3,p,op);
+                    //creating an iterator on v_ids[0] [1] [2]
+                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+                    ++it;
+                }
+            }
+            template< class S1 , class S2 , class S3 , class S4 , class Op >
+            static void for_each4( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4 , Op op )
+            {
+                for_each_prop_resize<S1,S2> the_resize(s1,s2);
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getIterator();
+                while(it.isNext()){
+                    auto p=it.get();
+                    //converting to boost vector ids.
+                    for_each_prop4<S1,S2,S3,S4,typename S1::index_type,Op> cp(s1,s2,s3,s4,p,op);
+                    //creating an iterator on v_ids[0] [1] [2]
+                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+                    ++it;
+                }
+            }
+            template< class S1 , class S2 , class S3 , class S4,class S5 , class Op >
+            static void for_each5( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5 , Op op )
+            {
+                for_each_prop_resize<S1,S2> the_resize(s1,s2);
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getIterator();
+                while(it.isNext()){
+                    auto p=it.get();
+                    //converting to boost vector ids.
+                    for_each_prop5<S1,S2,S3,S4,S5,typename S1::index_type,Op> cp(s1,s2,s3,s4,s5,p,op);
+                    //creating an iterator on v_ids[0] [1] [2]
+                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+                    ++it;
+                }
+            }
+            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 , class Op >
+            static void for_each6( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6 , Op op )
+            {
+                for_each_prop_resize<S1,S2> the_resize(s1,s2);
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getIterator();
+                while(it.isNext()){
+                    auto p=it.get();
+                    //converting to boost vector ids.
+                    for_each_prop6<S1,S2,S3,S4,S5,S6,typename S1::index_type,Op> cp(s1,s2,s3,s4,s5,s6,p,op);
+                    //creating an iterator on v_ids[0] [1] [2]
+                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+                    ++it;
+                }
+            }
+            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 ,class S7, class Op >
+            static void for_each7( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6,S7 &s7 , Op op )
+            {
+                for_each_prop_resize<S1,S2> the_resize(s1,s2);
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getIterator();
+                while(it.isNext()){
+                    auto p=it.get();
+                    //converting to boost vector ids.
+                    for_each_prop7<S1,S2,S3,S4,S5,S6,S7,typename S1::index_type,Op> cp(s1,s2,s3,s4,s5,s6,s7,p,op);
+                    //creating an iterator on v_ids[0] [1] [2]
+                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+                    ++it;
+                }
+            }
+            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 ,class S7,class S8, class Op >
+            static void for_each8( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6,S7 &s7,S8 &s8 , Op op )
+            {
+                for_each_prop_resize<S1,S2> the_resize(s1,s2);
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getIterator();
+                while(it.isNext()){
+                    auto p=it.get();
+                    //converting to boost vector ids.
+                    for_each_prop8<S1,S2,S3,S4,S5,S6,S7,S8,typename S1::index_type,Op> cp(s1,s2,s3,s4,s5,s6,s7,s8,p,op);
+                    //creating an iterator on v_ids[0] [1] [2]
+                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+                    ++it;
+                }
+            }
+            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 ,class S7,class S8, class S9, class Op >
+            static void for_each9( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6,S7 &s7,S8 &s8, S9 &s9 , Op op )
+            {
+                for_each_prop_resize<S1,S2> the_resize(s1,s2);
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getIterator();
+                while(it.isNext()){
+                    auto p=it.get();
+                    //converting to boost vector ids.
+                    for_each_prop9<S1,S2,S3,S4,S5,S6,S7,S8,S9,typename S1::index_type,Op> cp(s1,s2,s3,s4,s5,s6,s7,s8,s9,p,op);
+                    //creating an iterator on v_ids[0] [1] [2]
+                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+                    ++it;
+                }
+            }
+            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 ,class S7,class S8, class S9, class S10, class Op >
+            static void for_each10( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6,S7 &s7,S8 &s8, S9 &s9 , S10 &s10, Op op )
+            {
+                for_each_prop_resize<S1,S2> the_resize(s1,s2);
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getIterator();
+                while(it.isNext()){
+                    auto p=it.get();
+                    //converting to boost vector ids.
+                    for_each_prop10<S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,typename S1::index_type,Op> cp(s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,p,op);
+                    //creating an iterator on v_ids[0] [1] [2]
+                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+                    ++it;
+                }
+            }
+            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 ,class S7,class S8, class S9, class S10, class S11, class Op >
+            static void for_each11( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6,S7 &s7,S8 &s8, S9 &s9 , S10 &s10,S11 &s11, Op op )
+            {
+                for_each_prop_resize<S1,S2> the_resize(s1,s2);
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getIterator();
+                while(it.isNext()){
+                    auto p=it.get();
+                    //converting to boost vector ids.
+                    for_each_prop11<S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,S11,typename S1::index_type,Op> cp(s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,p,op);
+                    //creating an iterator on v_ids[0] [1] [2]
+                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+                    ++it;
+                }
+            }
+            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 ,class S7,class S8, class S9, class S10, class S11, class S12, class Op >
+            static void for_each12( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6,S7 &s7,S8 &s8, S9 &s9 , S10 &s10,S11 &s11,S12 &s12, Op op )
+            {
+                for_each_prop_resize<S1,S2> the_resize(s1,s2);
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getIterator();
+                while(it.isNext()){
+                    auto p=it.get();
+                    //converting to boost vector ids.
+                    for_each_prop12<S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,S11,S12,typename S1::index_type,Op> cp(s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,p,op);
+                    //creating an iterator on v_ids[0] [1] [2]
+                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+                    ++it;
+                }
+            }
+            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 ,class S7,class S8, class S9, class S10, class S11, class S12, class S13, class Op >
+            static void for_each13( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6,S7 &s7,S8 &s8, S9 &s9 , S10 &s10,S11 &s11,S12 &s12,S13 &s13, Op op )
+            {
+                for_each_prop_resize<S1,S2> the_resize(s1,s2);
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getIterator();
+                while(it.isNext()){
+                    auto p=it.get();
+                    //converting to boost vector ids.
+                    for_each_prop13<S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,S11,S12,S13,typename S1::index_type,Op> cp(s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,p,op);
+                    //creating an iterator on v_ids[0] [1] [2]
+                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+                    ++it;
+                }
+            }
+            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 ,class S7,class S8, class S9, class S10, class S11, class S12, class S13, class S14, class Op >
+            static void for_each14( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6,S7 &s7,S8 &s8, S9 &s9, S10 &s10,S11 &s11,S12 &s12,S13 &s13,S14 &s14, Op op )
+            {
+                for_each_prop_resize<S1,S2> the_resize(s1,s2);
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getIterator();
+                while(it.isNext()){
+                    auto p=it.get();
+                    //converting to boost vector ids.
+                    for_each_prop14<S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,S11,S12,S13,S14,typename S1::index_type,Op> cp(s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,p,op);
+                    //creating an iterator on v_ids[0] [1] [2]
+                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+                    ++it;
+                }
+            }
             template< class S1 , class S2 , class S3 , class S4,class S5,class S6 ,class S7,class S8, class S9, class S10, class S11, class S12, class S13, class S14, class S15, class Op >
             static void for_each15( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6,S7 &s7,S8 &s8, S9 &s9, S10 &s10,S11 &s11,S12 &s12,S13 &s13,S14 &s14,S15 &s15, Op op )
@@ -862,7 +864,7 @@ namespace boost {
                     auto p=it.get();
                     //converting to boost vector ids.
-                    for_each_prop15<S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,S11,S12,S13,S14,S15,size_t,Op> cp(s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15,p,op);
+                    for_each_prop15<S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,S11,S12,S13,S14,S15,typename S1::index_type,Op> cp(s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15,p,op);
                     //creating an iterator on v_ids[0] [1] [2]
@@ -911,7 +913,7 @@ namespace boost {
                     auto p=it.get();
                     //converting to boost vector ids.
-                    for_each_norm<S,size_t,typename boost::numeric::odeint::vector_space_norm_inf< S >::result_type> cp(s,p,n);
+                    for_each_norm<S,typename S::index_type,typename boost::numeric::odeint::vector_space_norm_inf< S >::result_type> cp(s,p,n);
                     //creating an iterator on v_ids[0] [1] [2]
@@ -939,4 +941,4 @@ namespace boost {
diff --git a/src/OdeIntegrators/vector_algebra_ofp_gpu.hpp b/src/OdeIntegrators/vector_algebra_ofp_gpu.hpp
new file mode 100644
index 00000000..5893bc3b
--- /dev/null
+++ b/src/OdeIntegrators/vector_algebra_ofp_gpu.hpp
@@ -0,0 +1,993 @@
+// Created by Abhinav Singh on 1.03.23.
+namespace boost {
+    namespace numeric {
+        namespace odeint {
+            template<typename S1, typename Op>
+            __global__ void for_each1_ker(S1 s1, Op op)
+            {
+                unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;
+                if (p >=	{return;}
+                for_each_prop1<S1,size_t,Op> cp(s1,p,op);
+                //creating an iterator on v_ids[0] [1] [2]
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+            }
+            template<typename S1,typename S2, typename Op>
+            __global__ void for_each2_ker(S1 s1,S2 s2, Op op)
+            {
+                unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;
+                if (p >= get<0>().size())	{return;}
+                //printf("%f \n", get<0>().getVector().template get<0>(p));
+                //converting to boost vector ids.
+                for_each_prop2<S1,S2,unsigned int,Op> cp(s1,s2,p,op);
+                // get<0>().getVector().template get<0>(p)=1.0* get<0>().getVector().template get<0>(p)+0.05* get<0>().getVector().template get<0>(p);
+                //creating an iterator on v_ids[0] [1] [2]
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+            }
+            template<typename S1,typename S2,typename S3, typename Op>
+            __global__ void for_each3_ker(S1 s1,S2 s2,S3 s3, Op op)
+            {
+                unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;
+                if (p >= get<0>().size())	{return;}
+                //printf("%f \n", get<0>().getVector().template get<0>(p));
+                //converting to boost vector ids.
+                for_each_prop3<S1,S2,S3,unsigned int,Op> cp(s1,s2,s3,p,op);
+                // get<0>().getVector().template get<0>(p)=1.0* get<0>().getVector().template get<0>(p)+0.05* get<0>().getVector().template get<0>(p);
+                //creating an iterator on v_ids[0] [1] [2]
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+            }
+            template<typename S1,typename S2,typename S3,typename S4, typename Op>
+            __global__ void for_each4_ker(S1 s1,S2 s2,S3 s3,S4 s4, Op op)
+            {
+                unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;
+                if (p >= get<0>().size())	{return;}
+                //printf("%f \n", get<0>().getVector().template get<0>(p));
+                //converting to boost vector ids.
+                for_each_prop4<S1,S2,S3,S4,unsigned int,Op> cp(s1,s2,s3,s4,p,op);
+                // get<0>().getVector().template get<0>(p)=1.0* get<0>().getVector().template get<0>(p)+0.05* get<0>().getVector().template get<0>(p);
+                //creating an iterator on v_ids[0] [1] [2]
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+            }
+            template<typename S1,typename S2,typename S3,typename S4,typename S5, typename Op>
+            __global__ void for_each5_ker(S1 s1,S2 s2,S3 s3,S4 s4,S5 s5, Op op)
+            {
+                unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;
+                if (p >= get<0>().size())	{return;}
+                //printf("%f \n", get<0>().getVector().template get<0>(p));
+                //converting to boost vector ids.
+                for_each_prop5<S1,S2,S3,S4,S5,unsigned int,Op> cp(s1,s2,s3,s4,s5,p,op);
+                // get<0>().getVector().template get<0>(p)=1.0* get<0>().getVector().template get<0>(p)+0.05* get<0>().getVector().template get<0>(p);
+                //creating an iterator on v_ids[0] [1] [2]
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+            }
+            template<typename S1,typename S2,typename S3,typename S4,typename S5,typename S6, typename Op>
+            __global__ void for_each6_ker(S1 s1,S2 s2,S3 s3,S4 s4,S5 s5,S6 s6, Op op)
+            {
+                unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;
+                if (p >= get<0>().size())	{return;}
+                //converting to boost vector ids.
+                for_each_prop6<S1,S2,S3,S4,S5,S6,unsigned int,Op> cp(s1,s2,s3,s4,s5,s6,p,op);
+                //creating an iterator on v_ids[0] [1] [2]
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+            }
+            template<typename S1,typename S2,typename S3,typename S4,typename S5,typename S6,typename S7, typename Op>
+            __global__ void for_each7_ker(S1 s1,S2 s2,S3 s3,S4 s4,S5 s5,S6 s6,S7 s7, Op op)
+            {
+                unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;
+                if (p >= get<0>().size())	{return;}
+                //converting to boost vector ids.
+                for_each_prop7<S1,S2,S3,S4,S5,S6,S7,unsigned int,Op> cp(s1,s2,s3,s4,s5,s6,s7,p,op);
+                //creating an iterator on v_ids[0] [1] [2]
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+            }
+            template<typename S1,typename S2,typename S3,typename S4,typename S5,typename S6,typename S7,typename S8, typename Op>
+            __global__ void for_each8_ker(S1 s1,S2 s2,S3 s3,S4 s4,S5 s5,S6 s6,S7 s7,S8 s8, Op op)
+            {
+                unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;
+                if (p >= get<0>().size())	{return;}
+                //converting to boost vector ids.
+                for_each_prop8<S1,S2,S3,S4,S5,S6,S7,S8,unsigned int,Op> cp(s1,s2,s3,s4,s5,s6,s7,s8,p,op);
+                //creating an iterator on v_ids[0] [1] [2]
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+            }
+            template<typename S1,typename S2,typename S3,typename S4,typename S5,typename S6,typename S7,typename S8,typename S9, typename Op>
+            __global__ void for_each9_ker(S1 s1,S2 s2,S3 s3,S4 s4,S5 s5,S6 s6,S7 s7,S8 s8,S9 s9, Op op)
+            {
+                unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;
+                if (p >= get<0>().size())	{return;}
+                //converting to boost vector ids.
+                for_each_prop9<S1,S2,S3,S4,S5,S6,S7,S8,S9,unsigned int,Op> cp(s1,s2,s3,s4,s5,s6,s7,s8,s9,p,op);
+                //creating an iterator on v_ids[0] [1] [2]
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+            }
+            template<typename S1,typename S2,typename S3,typename S4,typename S5,typename S6,typename S7,typename S8,typename S9,typename S10, typename Op>
+            __global__ void for_each10_ker(S1 s1,S2 s2,S3 s3,S4 s4,S5 s5,S6 s6,S7 s7,S8 s8,S9 s9,S10 s10, Op op)
+            {
+                unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;
+                if (p >= get<0>().size())	{return;}
+                //converting to boost vector ids.
+                for_each_prop10<S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,unsigned int,Op> cp(s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,p,op);
+                //creating an iterator on v_ids[0] [1] [2]
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+            }
+            template<typename S1,typename S2,typename S3,typename S4,typename S5,typename S6,typename S7,typename S8,typename S9,typename S10,typename S11, typename Op>
+            __global__ void for_each11_ker(S1 s1,S2 s2,S3 s3,S4 s4,S5 s5,S6 s6,S7 s7,S8 s8,S9 s9,S10 s10,S11 s11, Op op)
+            {
+                unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;
+                if (p >= get<0>().size())	{return;}
+                //converting to boost vector ids.
+                for_each_prop11<S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,S11,unsigned int,Op> cp(s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,p,op);
+                //creating an iterator on v_ids[0] [1] [2]
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+            }
+            template<typename S1,typename S2,typename S3,typename S4,typename S5,typename S6,typename S7,typename S8,typename S9,typename S10,typename S11,typename S12, typename Op>
+            __global__ void for_each12_ker(S1 s1,S2 s2,S3 s3,S4 s4,S5 s5,S6 s6,S7 s7,S8 s8,S9 s9,S10 s10,S11 s11,S12 s12, Op op)
+            {
+                unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;
+                if (p >= get<0>().size())	{return;}
+                //converting to boost vector ids.
+                for_each_prop12<S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,S11,S12,unsigned int,Op> cp(s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,p,op);
+                //creating an iterator on v_ids[0] [1] [2]
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+            }
+            template<typename S1,typename S2,typename S3,typename S4,typename S5,typename S6,typename S7,typename S8,typename S9,typename S10,typename S11,typename S12,typename S13, typename Op>
+            __global__ void for_each13_ker(S1 s1,S2 s2,S3 s3,S4 s4,S5 s5,S6 s6,S7 s7,S8 s8,S9 s9,S10 s10,S11 s11,S12 s12,S13 s13, Op op)
+            {
+                unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;
+                if (p >= get<0>().size())	{return;}
+                //converting to boost vector ids.
+                for_each_prop13<S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,S11,S12,S13,unsigned int,Op> cp(s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,p,op);
+                //creating an iterator on v_ids[0] [1] [2]
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+            }
+            template<typename S1,typename S2,typename S3,typename S4,typename S5,typename S6,typename S7,typename S8,typename S9,typename S10,typename S11,typename S12,typename S13,typename S14, typename Op>
+            __global__ void for_each14_ker(S1 s1,S2 s2,S3 s3,S4 s4,S5 s5,S6 s6,S7 s7,S8 s8,S9 s9,S10 s10,S11 s11,S12 s12,S13 s13,S14 s14, Op op)
+            {
+                unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;
+                if (p >= get<0>().size())	{return;}
+                //converting to boost vector ids.
+                for_each_prop14<S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,S11,S12,S13,S14,unsigned int,Op> cp(s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,p,op);
+                //creating an iterator on v_ids[0] [1] [2]
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+            }
+            template<typename S1,typename S2,typename S3,typename S4,typename S5,typename S6,typename S7,typename S8,typename S9,typename S10,typename S11,typename S12,typename S13,typename S14,typename S15, typename Op>
+            __global__ void for_each15_ker(S1 s1,S2 s2,S3 s3,S4 s4,S5 s5,S6 s6,S7 s7,S8 s8,S9 s9,S10 s10,S11 s11,S12 s12,S13 s13,S14 s14,S15 s15, Op op)
+            {
+                unsigned int p = threadIdx.x + blockIdx.x * blockDim.x;
+                if (p >= get<0>().size())	{return;}
+                //converting to boost vector ids.
+                for_each_prop15<S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,S11,S12,S13,S14,S15,unsigned int,Op> cp(s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15,p,op);
+                //creating an iterator on v_ids[0] [1] [2]
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+            }
+        struct vector_space_algebra_ofp_gpu
+        {
+            template< class S1 , class Op >
+            static void for_each1( S1 &s1 , Op op )
+            {
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getDomainIteratorGPU();
+                CUDA_LAUNCH((for_each1_ker),it,s1,op);
+            }
+            template< class S1 , class S2 , class Op >
+            static void for_each2( S1 &s1 , S2 &s2 , Op op )
+            {
+                for_each_prop_resize<S1,S2> the_resize(s1,s2);
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
+                // get<0>().getVector().resize( get<0>().getVector().size());
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getGPUIterator();
+                CUDA_LAUNCH((for_each2_ker),it,s1.toKernel(),s2.toKernel(),op);
+            }
+            template< class S1 , class S2 , class S3 , class Op >
+            static void for_each3( S1 &s1 , S2 &s2 , S3 &s3 , Op op )
+            {
+                for_each_prop_resize<S1,S2> the_resize(s1,s2);
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getGPUIterator();
+                CUDA_LAUNCH((for_each3_ker),it,s1.toKernel(),s2.toKernel(),s3.toKernel(),op);
+            }
+            template< class S1 , class S2 , class S3 , class S4 , class Op >
+            static void for_each4( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4 , Op op )
+            {
+                for_each_prop_resize<S1,S2> the_resize(s1,s2);
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getGPUIterator();
+                CUDA_LAUNCH((for_each4_ker),it,s1.toKernel(),s2.toKernel(),s3.toKernel(),s4.toKernel(),op);
+            }
+            template< class S1 , class S2 , class S3 , class S4,class S5 , class Op >
+            static void for_each5( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5 , Op op )
+            {
+                for_each_prop_resize<S1,S2> the_resize(s1,s2);
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getGPUIterator();
+                CUDA_LAUNCH((for_each5_ker),it,s1.toKernel(),s2.toKernel(),s3.toKernel(),s4.toKernel(),s5.toKernel(),op);
+            }
+            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 , class Op >
+            static void for_each6( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6 , Op op )
+            {
+                for_each_prop_resize<S1,S2> the_resize(s1,s2);
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getGPUIterator();
+                CUDA_LAUNCH((for_each6_ker),it,s1.toKernel(),s2.toKernel(),s3.toKernel(),s4.toKernel(),s5.toKernel(),s6.toKernel(),op);
+            }
+            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 ,class S7, class Op >
+            static void for_each7( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6,S7 &s7 , Op op )
+            {
+                for_each_prop_resize<S1,S2> the_resize(s1,s2);
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getGPUIterator();
+                CUDA_LAUNCH((for_each7_ker),it,s1.toKernel(),s2.toKernel(),s3.toKernel(),s4.toKernel(),s5.toKernel(),s6.toKernel(),s7.toKernel(),op);
+            }
+            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 ,class S7,class S8, class Op >
+            static void for_each8( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6,S7 &s7,S8 &s8 , Op op )
+            {
+                for_each_prop_resize<S1,S2> the_resize(s1,s2);
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getGPUIterator();
+                CUDA_LAUNCH((for_each8_ker),it,s1.toKernel(),s2.toKernel(),s3.toKernel(),s4.toKernel(),s5.toKernel(),s6.toKernel(),s7.toKernel(),s8.toKernel(),op);
+            }
+            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 ,class S7,class S8, class S9, class Op >
+            static void for_each9( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6,S7 &s7,S8 &s8, S9 &s9 , Op op )
+            {
+                for_each_prop_resize<S1,S2> the_resize(s1,s2);
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getGPUIterator();
+                CUDA_LAUNCH((for_each9_ker),it,s1.toKernel(),s2.toKernel(),s3.toKernel(),s4.toKernel(),s5.toKernel(),s6.toKernel(),s7.toKernel(),s8.toKernel(),s9.toKernel(),op);
+            }
+            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 ,class S7,class S8, class S9, class S10, class Op >
+            static void for_each10( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6,S7 &s7,S8 &s8, S9 &s9 , S10 &s10, Op op )
+            {
+                for_each_prop_resize<S1,S2> the_resize(s1,s2);
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getGPUIterator();
+                CUDA_LAUNCH((for_each10_ker),it,s1.toKernel(),s2.toKernel(),s3.toKernel(),s4.toKernel(),s5.toKernel(),s6.toKernel(),s7.toKernel(),s8.toKernel(),s9.toKernel(),s10.toKernel(),op);
+            }
+            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 ,class S7,class S8, class S9, class S10, class S11, class Op >
+            static void for_each11( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6,S7 &s7,S8 &s8, S9 &s9 , S10 &s10,S11 &s11, Op op )
+            {
+                for_each_prop_resize<S1,S2> the_resize(s1,s2);
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getGPUIterator();
+                CUDA_LAUNCH((for_each11_ker),it,s1.toKernel(),s2.toKernel(),s3.toKernel(),s4.toKernel(),op);
+            }
+            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 ,class S7,class S8, class S9, class S10, class S11, class S12, class Op >
+            static void for_each12( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6,S7 &s7,S8 &s8, S9 &s9 , S10 &s10,S11 &s11,S12 &s12, Op op )
+            {
+                for_each_prop_resize<S1,S2> the_resize(s1,s2);
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getGPUIterator();
+                CUDA_LAUNCH((for_each12_ker),it,s1.toKernel(),s2.toKernel(),s3.toKernel(),s4.toKernel(),s5.toKernel(),s6.toKernel(),s7.toKernel(),s8.toKernel(),s9.toKernel(),s10.toKernel(),s11.toKernel(),s12.toKernel(),op);
+            }
+            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 ,class S7,class S8, class S9, class S10, class S11, class S12, class S13, class Op >
+            static void for_each13( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6,S7 &s7,S8 &s8, S9 &s9 , S10 &s10,S11 &s11,S12 &s12,S13 &s13, Op op )
+            {
+                for_each_prop_resize<S1,S2> the_resize(s1,s2);
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getGPUIterator();
+                CUDA_LAUNCH((for_each13_ker),it,s1.toKernel(),s2.toKernel(),s3.toKernel(),s4.toKernel(),s5.toKernel(),s6.toKernel(),s7.toKernel(),s8.toKernel(),s9.toKernel(),s10.toKernel(),s11.toKernel(),s12.toKernel(),s13.toKernel(),op);
+            }
+            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 ,class S7,class S8, class S9, class S10, class S11, class S12, class S13, class S14, class Op >
+            static void for_each14( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6,S7 &s7,S8 &s8, S9 &s9, S10 &s10,S11 &s11,S12 &s12,S13 &s13,S14 &s14, Op op )
+            {
+                for_each_prop_resize<S1,S2> the_resize(s1,s2);
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
+                // ToDo : build checks, that the +-*/ operators are well defined
+                auto get<0>().getVector().getGPUIterator();
+                CUDA_LAUNCH((for_each14_ker),it,it,s1.toKernel(),s2.toKernel(),s3.toKernel(),s4.toKernel(),s5.toKernel(),s6.toKernel(),s7.toKernel(),s8.toKernel(),s9.toKernel(),s10.toKernel(),s11.toKernel(),s12.toKernel(),s13.toKernel(),s14.toKernel(),op);
+            }
+            template< class S1 , class S2 , class S3 , class S4,class S5,class S6 ,class S7,class S8, class S9, class S10, class S11, class S12, class S13, class S14, class S15, class Op >
+            static void for_each15( S1 &s1 , S2 &s2 , S3 &s3 , S4 &s4,S5 &s5,S6 &s6,S7 &s7,S8 &s8, S9 &s9, S10 &s10,S11 &s11,S12 &s12,S13 &s13,S14 &s14,S15 &s15, Op op )
+            {
+                for_each_prop_resize<S1,S2> the_resize(s1,s2);
+                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(the_resize);
+                auto get<0>().getVector().getGPUIterator();
+                CUDA_LAUNCH((for_each15_ker),it,s1.toKernel(),s2.toKernel(),s3.toKernel(),s4.toKernel(),s5.toKernel(),s6.toKernel(),s7.toKernel(),s8.toKernel(),s9.toKernel(),s10.toKernel(),s11.toKernel(),s12.toKernel(),s13.toKernel(),s14.toKernel(),s15.toKernel(),op);
+            }
+           template<typename vector_type,typename index_type,typename norm_result_type>
+           struct for_each_norm
+           {
+               const vector_type &v;
+               index_type &p;
+               norm_result_type &n;
+               /*! \brief constructor
+                *
+                *
+                * \param src source encapsulated object
+                * \param dst destination encapsulated object
+                *
+                */
+               inline for_each_norm(const vector_type &v,index_type &p,norm_result_type &n)
+               :v(v),p(p),n(n)
+               {};
+               //! It call the copy function for each property
+               template<typename T>
+               inline void operator()(T& t) const
+               {
+                    if(fabs( get<T::value>().getVector().template get<0>(p)) > n)
+                    {
+                        n=fabs( get<T::value>().getVector().template get<0>(p));
+                    }
+               }
+           };
+            template< class S >
+            static typename boost::numeric::odeint::vector_space_norm_inf< S >::result_type norm_inf( const S &s )
+            {
+                typename boost::numeric::odeint::vector_space_norm_inf< S >::result_type n=0;
+                auto get<0>().getVector().getIterator();
+                while(it.isNext()){
+                    auto p=it.get();
+                    //converting to boost vector ids.
+                    for_each_norm<S,size_t,typename boost::numeric::odeint::vector_space_norm_inf< S >::result_type> cp(s,p,n);
+                    //creating an iterator on v_ids[0] [1] [2]
+                    boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(>>(cp);
+                    ++it;
+                }
+                auto &v_cl = create_vcluster();
+                v_cl.max(n);
+                v_cl.execute();
+                //std::max();
+                //std::cout<<n<<std::endl;
+                return n;
+            }
+        };
+#include <algorithm>
+#include <boost/config.hpp>
+#include <boost/array.hpp>
+#include <boost/numeric/odeint/util/unit_helper.hpp>
+ * Notes:
+ *
+ * * the results structs are needed in order to work with fusion_algebra
+ */
+struct ofp_operations
+    template< class Fac1 = double >
+    struct scale
+    {
+        const Fac1 m_alpha1;
+        scale( Fac1 alpha1 ) : m_alpha1( alpha1 ) { }
+        template< class T1 >
+ __device__ __host__       void operator()( T1 &t1 ) const
+        {
+            t1 *= m_alpha1;
+        }
+        typedef void result_type;
+    };
+    template< class Fac1 = double >
+    struct scale_sum1
+    {
+        const Fac1 m_alpha1;
+        scale_sum1( Fac1 alpha1 ) : m_alpha1( alpha1 ) { }
+        template< class T1 , class T2 >
+ __device__ __host__       void operator()( T1 &t1 , const T2 &t2 ) const
+        {
+            t1 = m_alpha1 * t2;
+        }
+        typedef void result_type;
+    };
+    template< class Fac1 = double , class Fac2 = Fac1 >
+    struct scale_sum2
+    {
+        const Fac1 m_alpha1;
+        const Fac2 m_alpha2;
+        scale_sum2( Fac1 alpha1 , Fac2 alpha2 ) : m_alpha1( alpha1 ) , m_alpha2( alpha2 ) { }
+        template< class T1 , class T2 , class T3 >
+ __device__ __host__       void operator()( T1 &t1 , const T2 &t2 , const T3 &t3) const
+        {
+            t1 = m_alpha1 * t2 + m_alpha2 * t3;
+        }
+        typedef void result_type;
+    };
+    template< class Fac1 = double , class Fac2 = Fac1 , class Fac3 = Fac2 >
+    struct scale_sum3
+    {
+        const Fac1 m_alpha1;
+        const Fac2 m_alpha2;
+        const Fac3 m_alpha3;
+        scale_sum3( Fac1 alpha1 , Fac2 alpha2 , Fac3 alpha3 )
+        : m_alpha1( alpha1 ) , m_alpha2( alpha2 ) , m_alpha3( alpha3 ) { }
+        template< class T1 , class T2 , class T3 , class T4 >
+ __device__ __host__       void operator()( T1 &t1 , const T2 &t2 , const T3 &t3 , const T4 &t4 ) const
+        {
+            t1 = m_alpha1 * t2 + m_alpha2 * t3 + m_alpha3 * t4;
+        }
+        typedef void result_type;
+    };
+    template< class Fac1 = double , class Fac2 = Fac1 , class Fac3 = Fac2 , class Fac4 = Fac3 >
+    struct scale_sum4
+    {
+        const Fac1 m_alpha1;
+        const Fac2 m_alpha2;
+        const Fac3 m_alpha3;
+        const Fac4 m_alpha4;
+        scale_sum4( Fac1 alpha1 , Fac2 alpha2 , Fac3 alpha3 , Fac4 alpha4 )
+        : m_alpha1( alpha1 ) , m_alpha2( alpha2 ) , m_alpha3( alpha3 ) , m_alpha4( alpha4 ) { }
+        template< class T1 , class T2 , class T3 , class T4 , class T5 >
+ __device__ __host__       void operator()( T1 &t1 , const T2 &t2 , const T3 &t3 , const T4 &t4 , const T5 &t5) const
+        {
+            t1 = m_alpha1 * t2 + m_alpha2 * t3 + m_alpha3 * t4 + m_alpha4 * t5;
+        }
+        typedef void result_type;
+    };
+    template< class Fac1 = double , class Fac2 = Fac1 , class Fac3 = Fac2 , class Fac4 = Fac3 , class Fac5 = Fac4 >
+    struct scale_sum5
+    {
+        const Fac1 m_alpha1;
+        const Fac2 m_alpha2;
+        const Fac3 m_alpha3;
+        const Fac4 m_alpha4;
+        const Fac5 m_alpha5;
+        scale_sum5( Fac1 alpha1 , Fac2 alpha2 , Fac3 alpha3 , Fac4 alpha4 , Fac5 alpha5 )
+        : m_alpha1( alpha1 ) , m_alpha2( alpha2 ) , m_alpha3( alpha3 ) , m_alpha4( alpha4 ) , m_alpha5( alpha5 ) { }
+        template< class T1 , class T2 , class T3 , class T4 , class T5 , class T6 >
+ __device__ __host__       void operator()( T1 &t1 , const T2 &t2 , const T3 &t3 , const T4 &t4 , const T5 &t5 , const T6 &t6) const
+        {
+            t1 = m_alpha1 * t2 + m_alpha2 * t3 + m_alpha3 * t4 + m_alpha4 * t5 + m_alpha5 * t6;
+        }
+        typedef void result_type;
+    };
+    template< class Fac1 = double , class Fac2 = Fac1 , class Fac3 = Fac2 , class Fac4 = Fac3 , class Fac5 = Fac4 , class Fac6 = Fac5 >
+    struct scale_sum6
+    {
+        const Fac1 m_alpha1;
+        const Fac2 m_alpha2;
+        const Fac3 m_alpha3;
+        const Fac4 m_alpha4;
+        const Fac5 m_alpha5;
+        const Fac6 m_alpha6;
+        scale_sum6( Fac1 alpha1 , Fac2 alpha2 , Fac3 alpha3 , Fac4 alpha4 , Fac5 alpha5 , Fac6 alpha6 )
+        : m_alpha1( alpha1 ) , m_alpha2( alpha2 ) , m_alpha3( alpha3 ) , m_alpha4( alpha4 ) , m_alpha5( alpha5 ) , m_alpha6( alpha6 ){ }
+        template< class T1 , class T2 , class T3 , class T4 , class T5 , class T6 , class T7 >
+ __device__ __host__       void operator()( T1 &t1 , const T2 &t2 , const T3 &t3 , const T4 &t4 , const T5 &t5 , const T6 &t6 ,const T7 &t7) const
+        {
+            t1 = m_alpha1 * t2 + m_alpha2 * t3 + m_alpha3 * t4 + m_alpha4 * t5 + m_alpha5 * t6 + m_alpha6 * t7;
+        }
+        typedef void result_type;
+    };
+    template< class Fac1 = double , class Fac2 = Fac1 , class Fac3 = Fac2 , class Fac4 = Fac3 , class Fac5 = Fac4 , class Fac6 = Fac5 , class Fac7 = Fac6 >
+    struct scale_sum7
+    {
+        const Fac1 m_alpha1;
+        const Fac2 m_alpha2;
+        const Fac3 m_alpha3;
+        const Fac4 m_alpha4;
+        const Fac5 m_alpha5;
+        const Fac6 m_alpha6;
+        const Fac7 m_alpha7;
+        scale_sum7( Fac1 alpha1 , Fac2 alpha2 , Fac3 alpha3 , Fac4 alpha4 ,
+                Fac5 alpha5 , Fac6 alpha6 , Fac7 alpha7 )
+        : m_alpha1( alpha1 ) , m_alpha2( alpha2 ) , m_alpha3( alpha3 ) , m_alpha4( alpha4 ) , m_alpha5( alpha5 ) , m_alpha6( alpha6 ) , m_alpha7( alpha7 ) { }
+        template< class T1 , class T2 , class T3 , class T4 , class T5 , class T6 , class T7 , class T8 >
+ __device__ __host__       void operator()( T1 &t1 , const T2 &t2 , const T3 &t3 , const T4 &t4 , const T5 &t5 , const T6 &t6 , const T7 &t7 , const T8 &t8 ) const
+        {
+            t1 = m_alpha1 * t2 + m_alpha2 * t3 + m_alpha3 * t4 + m_alpha4 * t5 + m_alpha5 * t6 + m_alpha6 * t7 + m_alpha7 * t8;
+        }
+        typedef void result_type;
+    };
+    template< class Fac1 = double , class Fac2 = Fac1 , class Fac3 = Fac2 , class Fac4 = Fac3 , class Fac5 = Fac4 , class Fac6 = Fac5 , class Fac7 = Fac6 , class Fac8 = Fac7 >
+    struct scale_sum8
+    {
+        const Fac1 m_alpha1;
+        const Fac2 m_alpha2;
+        const Fac3 m_alpha3;
+        const Fac4 m_alpha4;
+        const Fac5 m_alpha5;
+        const Fac6 m_alpha6;
+        const Fac7 m_alpha7;
+        const Fac8 m_alpha8;
+        scale_sum8( Fac1 alpha1 , Fac2 alpha2 , Fac3 alpha3 , Fac4 alpha4 ,
+                Fac5 alpha5 , Fac6 alpha6 , Fac7 alpha7 , Fac8 alpha8 )
+        : m_alpha1( alpha1 ) , m_alpha2( alpha2 ) , m_alpha3( alpha3 ) , m_alpha4( alpha4 ) , m_alpha5( alpha5 ) , m_alpha6( alpha6 ) , m_alpha7( alpha7 ) , m_alpha8( alpha8 ) { }
+        template< class T1 , class T2 , class T3 , class T4 , class T5 , class T6 , class T7 , class T8 , class T9 >
+ __device__ __host__       void operator()( T1 &t1 , const T2 &t2 , const T3 &t3 , const T4 &t4 , const T5 &t5 , const T6 &t6 , const T7 &t7 , const T8 &t8 , const T9 &t9 ) const
+        {
+            t1 = m_alpha1 * t2 + m_alpha2 * t3 + m_alpha3 * t4 + m_alpha4 * t5 + m_alpha5 * t6 + m_alpha6 * t7 + m_alpha7 * t8 + m_alpha8 * t9;
+        }
+        typedef void result_type;
+    };
+    template< class Fac1 = double , class Fac2 = Fac1 , class Fac3 = Fac2 , class Fac4 = Fac3 , class Fac5 = Fac4 , class Fac6 = Fac5 , class Fac7 = Fac6 , class Fac8 = Fac7 , class Fac9 = Fac8 >
+    struct scale_sum9
+    {
+        const Fac1 m_alpha1;
+        const Fac2 m_alpha2;
+        const Fac3 m_alpha3;
+        const Fac4 m_alpha4;
+        const Fac5 m_alpha5;
+        const Fac6 m_alpha6;
+        const Fac7 m_alpha7;
+        const Fac8 m_alpha8;
+        const Fac9 m_alpha9;
+        scale_sum9( Fac1 alpha1 , Fac2 alpha2 , Fac3 alpha3 , Fac4 alpha4 ,
+                Fac5 alpha5 , Fac6 alpha6 , Fac7 alpha7 , Fac8 alpha8 , Fac9 alpha9 )
+        : m_alpha1( alpha1 ) , m_alpha2( alpha2 ) , m_alpha3( alpha3 ) , m_alpha4( alpha4 ) , m_alpha5( alpha5 ) , m_alpha6( alpha6 ) , m_alpha7( alpha7 ) , m_alpha8( alpha8 ) , m_alpha9( alpha9 ) { }
+        template< class T1 , class T2 , class T3 , class T4 , class T5 , class T6 , class T7 , class T8 , class T9 , class T10 >
+ __device__ __host__       void operator()( T1 &t1 , const T2 &t2 , const T3 &t3 , const T4 &t4 , const T5 &t5 , const T6 &t6 , const T7 &t7 , const T8 &t8 , const T9 &t9 , const T10 &t10 ) const
+        {
+            t1 = m_alpha1 * t2 + m_alpha2 * t3 + m_alpha3 * t4 + m_alpha4 * t5 + m_alpha5 * t6 + m_alpha6 * t7 + m_alpha7 * t8 + m_alpha8 * t9 + m_alpha9 * t10;
+        }
+        typedef void result_type;
+    };
+    template< class Fac1 = double , class Fac2 = Fac1 , class Fac3 = Fac2 , class Fac4 = Fac3 , class Fac5 = Fac4 , class Fac6 = Fac5 , class Fac7 = Fac6 , class Fac8 = Fac7 , class Fac9 = Fac8 , class Fac10 = Fac9 >
+    struct scale_sum10
+    {
+        const Fac1 m_alpha1;
+        const Fac2 m_alpha2;
+        const Fac3 m_alpha3;
+        const Fac4 m_alpha4;
+        const Fac5 m_alpha5;
+        const Fac6 m_alpha6;
+        const Fac7 m_alpha7;
+        const Fac8 m_alpha8;
+        const Fac9 m_alpha9;
+        const Fac10 m_alpha10;
+        scale_sum10( Fac1 alpha1 , Fac2 alpha2 , Fac3 alpha3 , Fac4 alpha4 ,
+                Fac5 alpha5 , Fac6 alpha6 , Fac7 alpha7 , Fac8 alpha8 , Fac9 alpha9 , Fac10 alpha10 )
+        : m_alpha1( alpha1 ) , m_alpha2( alpha2 ) , m_alpha3( alpha3 ) , m_alpha4( alpha4 ) , m_alpha5( alpha5 ) , m_alpha6( alpha6 ) , m_alpha7( alpha7 ) , m_alpha8( alpha8 ) , m_alpha9( alpha9 ) , m_alpha10( alpha10 ) { }
+        template< class T1 , class T2 , class T3 , class T4 , class T5 , class T6 , class T7 , class T8 , class T9 , class T10 , class T11 >
+ __device__ __host__       void operator()( T1 &t1 , const T2 &t2 , const T3 &t3 , const T4 &t4 , const T5 &t5 , const T6 &t6 , const T7 &t7 , const T8 &t8 , const T9 &t9 , const T10 &t10 , const T11 &t11 ) const
+        {
+            t1 = m_alpha1 * t2 + m_alpha2 * t3 + m_alpha3 * t4 + m_alpha4 * t5 + m_alpha5 * t6 + m_alpha6 * t7 + m_alpha7 * t8 + m_alpha8 * t9 + m_alpha9 * t10 + m_alpha10 * t11;
+        }
+        typedef void result_type;
+    };
+    template< class Fac1 = double , class Fac2 = Fac1 , class Fac3 = Fac2 , class Fac4 = Fac3 , class Fac5 = Fac4 , class Fac6 = Fac5 , class Fac7 = Fac6 , class Fac8 = Fac7 , class Fac9 = Fac8 , class Fac10 = Fac9 , class Fac11 = Fac10 >
+    struct scale_sum11
+    {
+        const Fac1 m_alpha1;
+        const Fac2 m_alpha2;
+        const Fac3 m_alpha3;
+        const Fac4 m_alpha4;
+        const Fac5 m_alpha5;
+        const Fac6 m_alpha6;
+        const Fac7 m_alpha7;
+        const Fac8 m_alpha8;
+        const Fac9 m_alpha9;
+        const Fac10 m_alpha10;
+        const Fac11 m_alpha11;
+        scale_sum11( Fac1 alpha1 , Fac2 alpha2 , Fac3 alpha3 , Fac4 alpha4 ,
+                Fac5 alpha5 , Fac6 alpha6 , Fac7 alpha7 , Fac8 alpha8 , Fac9 alpha9 ,
+                Fac10 alpha10 , Fac11 alpha11 )
+        : m_alpha1( alpha1 ) , m_alpha2( alpha2 ) , m_alpha3( alpha3 ) , m_alpha4( alpha4 ) , m_alpha5( alpha5 ) , m_alpha6( alpha6 ) , m_alpha7( alpha7 ) , m_alpha8( alpha8 ) , m_alpha9( alpha9 ) , m_alpha10( alpha10 ) , m_alpha11( alpha11 ) { }
+        template< class T1 , class T2 , class T3 , class T4 , class T5 , class T6 , class T7 , class T8 , class T9 , class T10 , class T11 , class T12 >
+ __device__ __host__       void operator()( T1 &t1 , const T2 &t2 , const T3 &t3 , const T4 &t4 , const T5 &t5 , const T6 &t6 , const T7 &t7 , const T8 &t8 , const T9 &t9 , const T10 &t10 , const T11 &t11 , const T12 &t12 ) const
+        {
+            t1 = m_alpha1 * t2 + m_alpha2 * t3 + m_alpha3 * t4 + m_alpha4 * t5 + m_alpha5 * t6 + m_alpha6 * t7 + m_alpha7 * t8 + m_alpha8 * t9 + m_alpha9 * t10 + m_alpha10 * t11 + m_alpha11 * t12;
+        }
+        typedef void result_type;
+    };
+    template< class Fac1 = double , class Fac2 = Fac1 , class Fac3 = Fac2 , class Fac4 = Fac3 , class Fac5 = Fac4 , class Fac6 = Fac5 , class Fac7 = Fac6 , class Fac8 = Fac7 , class Fac9 = Fac8 , class Fac10 = Fac9 , class Fac11 = Fac10 , class Fac12 = Fac11 >
+    struct scale_sum12
+    {
+        const Fac1 m_alpha1;
+        const Fac2 m_alpha2;
+        const Fac3 m_alpha3;
+        const Fac4 m_alpha4;
+        const Fac5 m_alpha5;
+        const Fac6 m_alpha6;
+        const Fac7 m_alpha7;
+        const Fac8 m_alpha8;
+        const Fac9 m_alpha9;
+        const Fac10 m_alpha10;
+        const Fac11 m_alpha11;
+        const Fac12 m_alpha12;
+        scale_sum12( Fac1 alpha1 , Fac2 alpha2 , Fac3 alpha3 , Fac4 alpha4 ,
+                Fac5 alpha5 , Fac6 alpha6 , Fac7 alpha7 , Fac8 alpha8 , Fac9 alpha9 ,
+                Fac10 alpha10 , Fac11 alpha11 , Fac12 alpha12 )
+        : m_alpha1( alpha1 ) , m_alpha2( alpha2 ) , m_alpha3( alpha3 ) , m_alpha4( alpha4 ) , m_alpha5( alpha5 ) , m_alpha6( alpha6 ) , m_alpha7( alpha7 ) , m_alpha8( alpha8 ) , m_alpha9( alpha9 ) , m_alpha10( alpha10 ) , m_alpha11( alpha11 ) , m_alpha12( alpha12 ) { }
+        template< class T1 , class T2 , class T3 , class T4 , class T5 , class T6 , class T7 , class T8 , class T9 , class T10 , class T11 , class T12 , class T13 >
+ __device__ __host__       void operator()( T1 &t1 , const T2 &t2 , const T3 &t3 , const T4 &t4 , const T5 &t5 , const T6 &t6 , const T7 &t7 , const T8 &t8 , const T9 &t9 , const T10 &t10 , const T11 &t11 , const T12 &t12 , const T13 &t13 ) const
+        {
+            t1 = m_alpha1 * t2 + m_alpha2 * t3 + m_alpha3 * t4 + m_alpha4 * t5 + m_alpha5 * t6 + m_alpha6 * t7 + m_alpha7 * t8 + m_alpha8 * t9 + m_alpha9 * t10 + m_alpha10 * t11 + m_alpha11 * t12 + m_alpha12 * t13;
+        }
+        typedef void result_type;
+    };
+    template< class Fac1 = double , class Fac2 = Fac1 , class Fac3 = Fac2 , class Fac4 = Fac3 , class Fac5 = Fac4 , class Fac6 = Fac5 , class Fac7 = Fac6 , class Fac8 = Fac7 , class Fac9 = Fac8 , class Fac10 = Fac9 , class Fac11 = Fac10 , class Fac12 = Fac11 , class Fac13 = Fac12 >
+    struct scale_sum13
+    {
+        const Fac1 m_alpha1;
+        const Fac2 m_alpha2;
+        const Fac3 m_alpha3;
+        const Fac4 m_alpha4;
+        const Fac5 m_alpha5;
+        const Fac6 m_alpha6;
+        const Fac7 m_alpha7;
+        const Fac8 m_alpha8;
+        const Fac9 m_alpha9;
+        const Fac10 m_alpha10;
+        const Fac11 m_alpha11;
+        const Fac12 m_alpha12;
+        const Fac13 m_alpha13;
+        scale_sum13( Fac1 alpha1 , Fac2 alpha2 , Fac3 alpha3 , Fac4 alpha4 ,
+                Fac5 alpha5 , Fac6 alpha6 , Fac7 alpha7 , Fac8 alpha8 , Fac9 alpha9 ,
+                Fac10 alpha10 , Fac11 alpha11 , Fac12 alpha12 , Fac13 alpha13 )
+        : m_alpha1( alpha1 ) , m_alpha2( alpha2 ) , m_alpha3( alpha3 ) , m_alpha4( alpha4 ) , m_alpha5( alpha5 ) , m_alpha6( alpha6 ) , m_alpha7( alpha7 ) , m_alpha8( alpha8 ) , m_alpha9( alpha9 ) , m_alpha10( alpha10 ) , m_alpha11( alpha11 ) , m_alpha12( alpha12 ) , m_alpha13( alpha13 ) { }
+        template< class T1 , class T2 , class T3 , class T4 , class T5 , class T6 , class T7 , class T8 , class T9 , class T10 , class T11 , class T12 , class T13 , class T14 >
+ __device__ __host__       void operator()( T1 &t1 , const T2 &t2 , const T3 &t3 , const T4 &t4 , const T5 &t5 , const T6 &t6 , const T7 &t7 , const T8 &t8 , const T9 &t9 , const T10 &t10 , const T11 &t11 , const T12 &t12 , const T13 &t13 , const T14 &t14 ) const
+        {
+            t1 = m_alpha1 * t2 + m_alpha2 * t3 + m_alpha3 * t4 + m_alpha4 * t5 + m_alpha5 * t6 + m_alpha6 * t7 + m_alpha7 * t8 + m_alpha8 * t9 + m_alpha9 * t10 + m_alpha10 * t11 + m_alpha11 * t12 + m_alpha12 * t13 + m_alpha13 * t14;
+        }
+        typedef void result_type;
+    };
+    template< class Fac1 = double , class Fac2 = Fac1 , class Fac3 = Fac2 , class Fac4 = Fac3 , class Fac5 = Fac4 , class Fac6 = Fac5 , class Fac7 = Fac6 , class Fac8 = Fac7 , class Fac9 = Fac8 , class Fac10 = Fac9 , class Fac11 = Fac10 , class Fac12 = Fac11 , class Fac13 = Fac12 , class Fac14 = Fac13 >
+    struct scale_sum14
+    {
+        const Fac1 m_alpha1;
+        const Fac2 m_alpha2;
+        const Fac3 m_alpha3;
+        const Fac4 m_alpha4;
+        const Fac5 m_alpha5;
+        const Fac6 m_alpha6;
+        const Fac7 m_alpha7;
+        const Fac8 m_alpha8;
+        const Fac9 m_alpha9;
+        const Fac10 m_alpha10;
+        const Fac11 m_alpha11;
+        const Fac12 m_alpha12;
+        const Fac13 m_alpha13;
+        const Fac14 m_alpha14;
+        scale_sum14( Fac1 alpha1 , Fac2 alpha2 , Fac3 alpha3 , Fac4 alpha4 ,
+                Fac5 alpha5 , Fac6 alpha6 , Fac7 alpha7 , Fac8 alpha8 , Fac9 alpha9 ,
+                Fac10 alpha10 , Fac11 alpha11 , Fac12 alpha12 , Fac13 alpha13 , Fac14 alpha14 )
+        : m_alpha1( alpha1 ) , m_alpha2( alpha2 ) , m_alpha3( alpha3 ) , m_alpha4( alpha4 ) , m_alpha5( alpha5 ) , m_alpha6( alpha6 ) , m_alpha7( alpha7 ) , m_alpha8( alpha8 ) , m_alpha9( alpha9 ) , m_alpha10( alpha10 ) , m_alpha11( alpha11 ) , m_alpha12( alpha12 ) , m_alpha13( alpha13 ) , m_alpha14( alpha14 ) { }
+        template< class T1 , class T2 , class T3 , class T4 , class T5 , class T6 , class T7 , class T8 , class T9 , class T10 , class T11 , class T12 , class T13 , class T14 , class T15 >
+ __device__ __host__       void operator()( T1 &t1 , const T2 &t2 , const T3 &t3 , const T4 &t4 , const T5 &t5 , const T6 &t6 , const T7 &t7 , const T8 &t8 , const T9 &t9 , const T10 &t10 , const T11 &t11 , const T12 &t12 , const T13 &t13 , const T14 &t14 , const T15 &t15 ) const
+        {
+            t1 = m_alpha1 * t2 + m_alpha2 * t3 + m_alpha3 * t4 + m_alpha4 * t5 + m_alpha5 * t6 + m_alpha6 * t7 + m_alpha7 * t8 + m_alpha8 * t9 + m_alpha9 * t10 + m_alpha10 * t11 + m_alpha11 * t12 + m_alpha12 * t13 + m_alpha13 * t14 + m_alpha14 * t15;
+        }
+        typedef void result_type;
+    };
+    template< class Fac1 = double , class Fac2 = Fac1 >
+    struct scale_sum_swap2
+    {
+        const Fac1 m_alpha1;
+        const Fac2 m_alpha2;
+        scale_sum_swap2( Fac1 alpha1 , Fac2 alpha2 ) : m_alpha1( alpha1 ) , m_alpha2( alpha2 ) { }
+        template< class T1 , class T2 , class T3 >
+ __device__ __host__       void operator()( T1 &t1 , T2 &t2 , const T3 &t3) const
+        {
+            const T1 tmp( t1 );
+            t1 = m_alpha1 * t2 + m_alpha2 * t3;
+            t2 = tmp;
+        }
+        typedef void result_type;
+    };
+    /*
+     * for usage in for_each2
+     *
+     * Works with boost::units by eliminating the unit
+     */
+    template< class Fac1 = double >
+    struct rel_error
+    {
+        const Fac1 m_eps_abs , m_eps_rel , m_a_x , m_a_dxdt;
+        rel_error( Fac1 eps_abs , Fac1 eps_rel , Fac1 a_x , Fac1 a_dxdt )
+        : m_eps_abs( eps_abs ) , m_eps_rel( eps_rel ) , m_a_x( a_x ) , m_a_dxdt( a_dxdt ) { }
+        template< class T1 , class T2 , class T3 >
+ __device__ __host__       void operator()( T3 &t3 , const T1 &t1 , const T2 &t2 ) const
+        {
+            using std::abs;
+            set_unit_value( t3 , abs( get_unit_value( t3 ) ) / ( m_eps_abs + m_eps_rel * ( m_a_x * abs( get_unit_value( t1 ) ) + m_a_dxdt * abs( get_unit_value( t2 ) ) ) ) );
+        }
+        typedef void result_type;
+    };
+    /*
+     * for usage in for_each3
+     *
+     * used in the controller for the rosenbrock4 method
+     *
+     * Works with boost::units by eliminating the unit
+     */
+    template< class Fac1 = double >
+    struct default_rel_error
+    {
+        const Fac1 m_eps_abs , m_eps_rel ;
+        default_rel_error( Fac1 eps_abs , Fac1 eps_rel )
+        : m_eps_abs( eps_abs ) , m_eps_rel( eps_rel ) { }
+        /*
+         * xerr = xerr / ( eps_abs + eps_rel * max( x , x_old ) )
+         */
+        template< class T1 , class T2 , class T3 >
+ __device__ __host__       void operator()( T3 &t3 , const T1 &t1 , const T2 &t2 ) const
+        {
+            BOOST_USING_STD_MAX();
+            using std::abs;
+            Fac1 x1 = abs( get_unit_value( t1 ) ) , x2 = abs( get_unit_value( t2 ) );
+            set_unit_value( t3 , abs( get_unit_value( t3 ) ) / ( m_eps_abs + m_eps_rel * max BOOST_PREVENT_MACRO_SUBSTITUTION ( x1 , x2 ) ) );
+        }
+        typedef void result_type;
+    };
+    /*
+     * for usage in reduce
+     */
+    template< class Value >
+    struct maximum
+    {
+        template< class Fac1 , class Fac2 >
+  __device__ __host__      Value operator()( Fac1 t1 , const Fac2 t2 ) const
+        {
+            using std::abs;
+            Value a1 = abs( get_unit_value( t1 ) ) , a2 = abs( get_unit_value( t2 ) );
+            return ( a1 < a2 ) ? a2 : a1 ;
+        }
+        typedef Value result_type;
+    };
+    template< class Fac1 = double >
+    struct rel_error_max
+    {
+        const Fac1 m_eps_abs , m_eps_rel;
+        rel_error_max( Fac1 eps_abs , Fac1 eps_rel )
+        : m_eps_abs( eps_abs ) , m_eps_rel( eps_rel )
+        { }
+        template< class Res , class T1 , class T2 , class T3 >
+__device__ __host__        Res operator()( Res r , const T1 &x_old , const T2 &x , const T3 &x_err )
+        {
+            BOOST_USING_STD_MAX();
+            using std::abs;
+            Res tmp = abs( get_unit_value( x_err ) ) / ( m_eps_abs + m_eps_rel * max BOOST_PREVENT_MACRO_SUBSTITUTION ( abs( x_old ) , abs( x ) ) );
+            return max BOOST_PREVENT_MACRO_SUBSTITUTION ( r , tmp );
+        }
+    };
+    template< class Fac1 = double >
+    struct rel_error_max2
+    {
+        const Fac1 m_eps_abs , m_eps_rel , m_a_x , m_a_dxdt;
+        rel_error_max2( Fac1 eps_abs , Fac1 eps_rel , Fac1 a_x , Fac1 a_dxdt )
+        : m_eps_abs( eps_abs ) , m_eps_rel( eps_rel ) , m_a_x( a_x ) , m_a_dxdt( a_dxdt )
+        { }
+        template< class Res , class T1 , class T2 , class T3 , class T4 >
+__device__ __host__        Res operator()( Res r , const T1 &x_old , const T2 &/*x*/ , const T3 &dxdt_old , const T4 &x_err )
+        {
+            BOOST_USING_STD_MAX();
+            using std::abs;
+            Res tmp = abs( get_unit_value( x_err ) ) /
+                    ( m_eps_abs + m_eps_rel * ( m_a_x * abs( get_unit_value( x_old ) ) + m_a_dxdt * abs( get_unit_value( dxdt_old ) ) ) );
+            return max BOOST_PREVENT_MACRO_SUBSTITUTION ( r , tmp );
+        }
+    };
+    template< class Fac1 = double >
+    struct rel_error_l2
+    {
+        const Fac1 m_eps_abs , m_eps_rel;
+        rel_error_l2( Fac1 eps_abs , Fac1 eps_rel )
+        : m_eps_abs( eps_abs ) , m_eps_rel( eps_rel )
+        { }
+        template< class Res , class T1 , class T2 , class T3 >
+__device__ __host__        Res operator()( Res r , const T1 &x_old , const T2 &x , const T3 &x_err )
+        {
+            BOOST_USING_STD_MAX();
+            using std::abs;
+            Res tmp = abs( get_unit_value( x_err ) ) / ( m_eps_abs + m_eps_rel * max BOOST_PREVENT_MACRO_SUBSTITUTION ( abs( x_old ) , abs( x ) ) );
+            return r + tmp * tmp;
+        }
+    };
+    template< class Fac1 = double >
+    struct rel_error_l2_2
+    {
+        const Fac1 m_eps_abs , m_eps_rel , m_a_x , m_a_dxdt;
+        rel_error_l2_2( Fac1 eps_abs , Fac1 eps_rel , Fac1 a_x , Fac1 a_dxdt )
+        : m_eps_abs( eps_abs ) , m_eps_rel( eps_rel ) , m_a_x( a_x ) , m_a_dxdt( a_dxdt )
+        { }
+        template< class Res , class T1 , class T2 , class T3 , class T4 >
+__device__ __host__        Res operator()( Res r , const T1 &x_old , const T2 &/*x*/ , const T3 &dxdt_old , const T4 &x_err )
+        {
+            using std::abs;
+            Res tmp = abs( get_unit_value( x_err ) ) /
+                    ( m_eps_abs + m_eps_rel * ( m_a_x * abs( get_unit_value( x_old ) ) + m_a_dxdt * abs( get_unit_value( dxdt_old ) ) ) );
+            return r + tmp * tmp;
+        }
+    };
+    } // odeint
+} // numeric
+} // boost
diff --git a/src/Operators/Vector/cuda/vector_dist_operators_cuda.cuh b/src/Operators/Vector/cuda/vector_dist_operators_cuda.cuh
index 0060b134..cce632d6 100644
--- a/src/Operators/Vector/cuda/vector_dist_operators_cuda.cuh
+++ b/src/Operators/Vector/cuda/vector_dist_operators_cuda.cuh
@@ -26,11 +26,11 @@ struct SubsetSelector_impl<true>
     template<typename particle_type,typename subset_type>
     static void check(particle_type &particles,subset_type &particle_subset){
-        if(particles.getMapCtr()!=particle_subset.getUpdateCtr())
+        //This getMapCtr needs to be created or fixed for cuda!
+       /* if(particles.getMapCtr()!=particle_subset.getUpdateCtr())
             std::cerr<<__FILE__<<":"<<__LINE__<<" Error: You forgot a subset update after map."<<std::endl;
-        }
+        }*/
@@ -80,7 +80,6 @@ struct pos_or_propL_ker
 /*! \brief selector for position or properties left side
  * \tparam vector type of the original vector
@@ -428,7 +427,7 @@ struct vector_dist_op_compute_op<prp,false,comp_host>
+#define NVCC
 #ifdef __NVCC__
 template<unsigned int prp, unsigned int dim ,typename vector, typename expr>
diff --git a/src/Operators/Vector/vector_dist_operators.hpp b/src/Operators/Vector/vector_dist_operators.hpp
index cb1eac3b..25905116 100644
--- a/src/Operators/Vector/vector_dist_operators.hpp
+++ b/src/Operators/Vector/vector_dist_operators.hpp
@@ -9,6 +9,7 @@
 #include "Vector/vector_dist.hpp"
+#include "Vector/vector_dist_subset.hpp"
 #include "lib/pdata.hpp"
 #include "cuda/vector_dist_operators_cuda.cuh"
@@ -781,7 +782,7 @@ public:
 	//! return the result of the expression
-	template<typename r_type=typename std::remove_reference<decltype(-(o1.value(vect_dist_key_dx(0))))>::type > 
+	template<typename r_type=typename std::remove_reference<decltype(-(o1.value(vect_dist_key_dx(0))))>::type >
 	__device__ __host__  inline r_type value(const vect_dist_key_dx & key) const
 		return -(o1.value(key));
@@ -823,6 +824,27 @@ struct vector_dist_expression_comp_sel<comp_dev,false>
 	typedef boost::mpl::int_<-1> type;
+/*! \brief Expression implementation computation selector
+ *
+ */
+template<bool cond>
+struct vector_dist_expression_comp_proxy_sel
+    template<bool cond_, typename v_type, typename exp_type>
+    static void compute(v_type &v,exp_type &v_exp)
+    { vector_dist_op_compute_op<0,false,vector_dist_expression_comp_sel<comp_dev,cond_>::type::value>
+        ::compute_expr(v,v_exp);}
+struct vector_dist_expression_comp_proxy_sel<false>
+    template<bool cond, typename v_type, typename exp_type>
+    static void compute(v_type &v, exp_type &v_exp)
+    {   auto v_ker=v.toKernel();
+        vector_dist_op_compute_op<0,false,vector_dist_expression_comp_sel<comp_dev,cond>::type::value>
+        ::compute_expr(v_ker,v_exp);}
 template<typename vector, bool is_ker = has_vector_kernel<vector>::type::value>
 struct vector_expression_transform
@@ -1040,15 +1062,37 @@ public:
 	 * \return itself
-	template<typename T> vector & operator=(const vector_dist_expression<0,openfpm::vector<aggregate<T>>> & v_exp)
+	template<typename T,typename memory,template <typename> class layout_base > vector & operator=(const vector_dist_expression<0,openfpm::vector<aggregate<T>, memory, layout_base>> & v_exp)
-		vector_dist_op_compute_op<prp,false,vector_dist_expression_comp_sel<comp_host,
-																	   	  has_vector_kernel<vector>::type::value>::type::value>
-		::compute_expr(v.v,v_exp);
+		//vector_dist_op_compute_op<prp,false,vector_dist_expression_comp_sel<comp_host,has_vector_kernel<vector>::type::value>::type::value>
+		//::compute_expr(v.v,v_exp);
+            vector_dist_op_compute_op<prp,false,vector_dist_expression_comp_sel<comp_host,
+                    has_vector_kernel<vector>::type::value>::type::value>
+            ::compute_expr(v.v,v_exp);
 		return v.v;
+    /*! \brief Fill the vector property with the evaluated expression
+ *
+ * \param v_exp expression to evaluate
+ *
+ * \return itself
+ *
+ */
+    template<typename T> vector & operator=(const vector_dist_expression<0,openfpm::vector_gpu<aggregate<T>>> & v_exp)
+    {
+            vector_dist_op_compute_op<prp,false,vector_dist_expression_comp_sel<comp_dev,
+                    has_vector_kernel<vector>::type::value>::type::value>
+            ::compute_expr(v.v,v_exp.getVector().toKernel());
+        //constexpr bool cond=has_vector_kernel<vector>::type::value || std::is_same<vector,openfpm::vector<aggregate<T>,CudaMemory,memory_traits_inte>>::value;
+        //vector_dist_expression_comp_proxy_sel<!std::is_same<vector,openfpm::vector<aggregate<T>,CudaMemory,memory_traits_inte>>::value>::template compute<cond>(v.v,v_exp);
+        return v.v;
+    }
 	/*! \brief Fill the vector property with the evaluated expression
 	 * \param v_exp expression to evaluate
@@ -1139,45 +1183,45 @@ public:
  * \tparam vector involved
-template<typename T>
-class vector_dist_expression<0,openfpm::vector<aggregate<T>> >
+template<typename vector_type>
+class vector_dist_expression_impl
-	//! Internal vector
-	typedef openfpm::vector<aggregate<T>> vector;
-	//! The temporal vector
-	mutable vector v;
+    //! Internal vector
+    typedef vector_type vector;
+    typedef typename boost::mpl::at<typename vector_type::value_type::type,boost::mpl::int_<0>>::type T;
+    //! The temporal vector
+    mutable vector v;
     typedef T * iterator;
     typedef const  T * const_iterator;
-	typedef typename has_vector_kernel<vector>::type is_ker;
+    typedef typename has_vector_kernel<vector>::type is_ker;
-	//! The type of the internal vector
-	typedef vector vtype;
+    //! The type of the internal vector
+    typedef vector vtype;
     //! The type of the internal value
     typedef T value_type;
     //! result for is sort
-	typedef boost::mpl::bool_<false> is_sort;
+    typedef boost::mpl::bool_<false> is_sort;
-	//! NN_type
-	typedef void NN_type;
+    //! NN_type
+    typedef void NN_type;
-	//! Property id of the point
-	static const unsigned int prop = 0;
+    //! Property id of the point
+    static const unsigned int prop = 0;
-	int var_id = 0;
+    int var_id = 0;
-	void setVarId(int var_id)
-	{
-		this->var_id = var_id;
-	}
+    void setVarId(int var_id)
+    {
+        this->var_id = var_id;
+    }
-	///////// BOOST ODEINT interface
+    ///////// BOOST ODEINT interface
     iterator begin()
     { return &v.template get<0>(0); }
@@ -1193,11 +1237,11 @@ public:
     size_t size() const
     { return v.size(); }
-	void resize(size_t n)
+    void resize(size_t n)
-	    // Here
+        // Here
-	    v.resize(n);
+        v.resize(n);
 /*	T * begin() {
@@ -1217,161 +1261,171 @@ public:
     //{ return m_v[n]; }
-	////////////////////////////////////
+    ////////////////////////////////////
-	vector_dist_expression()
-	{}
+    vector_dist_expression_impl()
+    {}
-	template<typename exp1, typename exp2, unsigned int op>
-	vector_dist_expression(const vector_dist_expression_op<exp1,exp2,op> & v_exp)
-	{
-		this->operator=(v_exp);
-	}
+    template<unsigned int prp2, typename vector2>
+    vector_dist_expression_impl(const vector_dist_expression<prp2,vector2> & v_exp)
+    {
+        this->operator=(v_exp);
+    };
-	/*! \brief get the NN object
-	 *
-	 * \return the NN object
-	 *
-	 */
-	inline void * getNN() const
-	{
-		return NULL;
-	}
+    template<typename exp1, typename exp2, unsigned int op>
+    vector_dist_expression_impl(const vector_dist_expression_op<exp1,exp2,op> & v_exp)
+    {
+        this->operator=(v_exp);
+    }
-	/*! \brief Return the vector on which is acting
-	 *
-	 * It return the vector used in getVExpr, to get this object
-	 *
-	 * \return the vector
-	 *
-	 */
-	__device__ __host__ const vector & getVector() const
-	{
-		return v;
-	}
+    /*! \brief get the NN object
+     *
+     * \return the NN object
+     *
+     */
+    inline void * getNN() const
+    {
+        return NULL;
+    }
-	/*! \brief Return the vector on which is acting
-	 *
-	 * It return the vector used in getVExpr, to get this object
-	 *
-	 * \return the vector
-	 *
-	 */
-	__device__ __host__ vector & getVector()
-	{
-		return v;
-	}
+    /*! \brief Return the vector on which is acting
+     *
+     * It return the vector used in getVExpr, to get this object
+     *
+     * \return the vector
+     *
+     */
+    __device__ __host__ const vector & getVector() const
+    {
+        return v;
+    }
-	/*! \brief This function must be called before value
-	 *
-	 * it initialize the expression if needed
-	 *
-	 */
-	inline void init() const
-	{}
+    /*! \brief Return the vector on which is acting
+     *
+     * It return the vector used in getVExpr, to get this object
+     *
+     * \return the vector
+     *
+     */
+    __device__ __host__ vector & getVector()
+    {
+        return v;
+    }
-	/*! \brief Evaluate the expression
-	 *
-	 * \param k where to evaluate the expression
-	 *
-	 * \return the result of the expression
-	 *
-	 */
-	__host__ inline auto value(const vect_dist_key_dx & k) const -> decltype(v.template get<0>(k.getKey()))
-	{
-		return v.template get<0>(k.getKey());
-	}
+    /*! \brief This function must be called before value
+     *
+     * it initialize the expression if needed
+     *
+     */
+    inline void init() const
+    {}
+    /*! \brief Evaluate the expression
+     *
+     * \param k where to evaluate the expression
+     *
+     * \return the result of the expression
+     *
+     */
+    __host__ __device__ inline auto value(const vect_dist_key_dx & k) const -> decltype(v.template get<0>(k.getKey()))
+    {
+        return v.template get<0>(k.getKey());
+    }
-	/*! \brief Fill the vector property with the evaluated expression
-	 *
-	 * \param v_exp expression to evaluate
-	 *
-	 * \return itself
-	 *
-	 */
-	template<unsigned int prp2, typename vector2> vector & operator=(const vector_dist_expression<prp2,vector2> & v_exp)
-	{
+    /*! \brief Fill the vector property with the evaluated expression
+     *
+     * \param v_exp expression to evaluate
+     *
+     * \return itself
+     *
+     */
+    template<unsigned int prp2, typename vector2> vector & operator=(const vector_dist_expression<prp2,vector2> & v_exp)
+    {
         if (v_exp.getVector().isSubset() == true)
-                        std::cout << __FILE__ << ":" << __LINE__ << " error on the right hand side of the expression you have to use non-subset properties" << std::endl;
-                        return v;
+            std::cout << __FILE__ << ":" << __LINE__ << " error on the right hand side of the expression you have to use non-subset properties" << std::endl;
+            return v;
+        constexpr bool cond=has_vector_kernel<vector>::type::value || std::is_same<vector,openfpm::vector<aggregate<T>,CudaMemory,memory_traits_inte>>::value;
+        //std::cout<<cond<<std::endl;
+        //std::cout<< (vector_dist_expression_comp_sel<comp_host,has_vector_kernel<vector>::type::value>::type::value || std::is_same<vector,openfpm::vector<aggregate<T>,CudaMemory,memory_traits_inte>>::value)<<std::endl;
+        //std::cout<<(vector_dist_expression_comp_sel<2,
+           //     has_vector_kernel<vector>::type::value>::type::value || std::is_same<vector,openfpm::vector<aggregate<T>,CudaMemory,memory_traits_inte>>::value)<<std::endl;
+        //std::cout<<has_vector_kernel<vector>::type::value<<std::endl;
+        //std::cout<<vector_dist_expression_comp_sel<2,false>::type::value<<std::endl;
+        //std::cout<<!std::is_same<vector,openfpm::vector<aggregate<T>,CudaMemory,memory_traits_inte>>::value<<std::endl;
+        if (has_vector_kernel<vector>::type::value == false && !std::is_same<vector,openfpm::vector<aggregate<T>,CudaMemory,memory_traits_inte>>::value)
+        {
+            vector_dist_op_compute_op<0,false,vector_dist_expression_comp_sel<comp_host,cond>::type::value>
+            ::compute_expr(v,v_exp);
+        }
+        else
+        {
+            vector_dist_expression_comp_proxy_sel<!std::is_same<vector,openfpm::vector<aggregate<T>,CudaMemory,memory_traits_inte>>::value>::template compute<cond>(v,v_exp);
+        }
-		if (has_vector_kernel<vector>::type::value == false)
-		{
-			vector_dist_op_compute_op<0,false,vector_dist_expression_comp_sel<comp_host,
-																	   	  has_vector_kernel<vector>::type::value>::type::value>
-			::compute_expr(v,v_exp);
-		}
-		else
-		{
-			vector_dist_op_compute_op<0,false,vector_dist_expression_comp_sel<comp_dev,
-		   	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  has_vector_kernel<vector>::type::value>::type::value>
-			::compute_expr(v,v_exp);
-		}
-		return v;
-	}
+        return v;
+    }
-	/*! \brief Fill the vector property with the evaluated expression
-	 *
-	 * \param v_exp expression to evaluate
-	 *
-	 * \return itself
-	 *
-	 */
-	template<typename exp1, typename exp2, unsigned int op>
-	vector & operator=(const vector_dist_expression_op<exp1,exp2,op> & v_exp)
-	{
+    /*! \brief Fill the vector property with the evaluated expression
+     *
+     * \param v_exp expression to evaluate
+     *
+     * \return itself
+     *
+     */
+    template<typename exp1, typename exp2, unsigned int op>
+    vector & operator=(const vector_dist_expression_op<exp1,exp2,op> & v_exp)
+    {
         if (v_exp.getVector().isSubset() == true)
-        	std::cout << __FILE__ << ":" << __LINE__ << " error on the right hand side of the expression you have to use non-subset properties" << std::endl;
+            std::cout << __FILE__ << ":" << __LINE__ << " error on the right hand side of the expression you have to use non-subset properties" << std::endl;
             return v;
-		if (has_vector_kernel<vector>::type::value == false)
-		{
-			vector_dist_op_compute_op<0,
-									  vector_dist_expression_op<exp1,exp2,op>::is_sort::value,
-									  vector_dist_expression_comp_sel<comp_host,
-																	  has_vector_kernel<vector>::type::value>::type::value>
-			::compute_expr(v,v_exp);
-		}
-		else
-		{
-			vector_dist_op_compute_op<0,
-									  vector_dist_expression_op<exp1,exp2,op>::is_sort::value,
-									  vector_dist_expression_comp_sel<comp_dev,
-		   	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  has_vector_kernel<vector>::type::value>::type::value>
-			::compute_expr(v,v_exp);
-		}
+        if (has_vector_kernel<vector>::type::value == false)
+        {
+            vector_dist_op_compute_op<0,
+                    vector_dist_expression_op<exp1,exp2,op>::is_sort::value,
+                    vector_dist_expression_comp_sel<comp_host,
+                            has_vector_kernel<vector>::type::value>::type::value>
+            ::compute_expr(v,v_exp);
+        }
+        else
+        {
+            vector_dist_op_compute_op<0,
+                    vector_dist_expression_op<exp1,exp2,op>::is_sort::value,
+                    vector_dist_expression_comp_sel<comp_dev,
+                            has_vector_kernel<vector>::type::value>::type::value>
+            ::compute_expr(v,v_exp);
+        }
-		return v;
-	}
+        return v;
+    }
-	/*! \brief Fill the vector property with the double
-	 *
-	 * \param d value to fill
-	 *
-	 * \return the internal vector
-	 *
-	 */
-	vector & operator=(double d)
-	{
-		std::cout << __FILE__ << ":" << __LINE__ << " Error: temporal with constants is unsupported" << std::endl;
-	}
+    /*! \brief Fill the vector property with the double
+     *
+     * \param d value to fill
+     *
+     * \return the internal vector
+     *
+     */
+    vector & operator=(double d)
+    {
+        std::cout << __FILE__ << ":" << __LINE__ << " Error: temporal with constants is unsupported" << std::endl;
+    }
     template<typename Sys_eqs, typename pmap_type, typename unordered_map_type, typename coeff_type>
     inline void value_nz(pmap_type & p_map, const vect_dist_key_dx & key, unordered_map_type & cols, coeff_type & coeff, unsigned int comp) const
-    	std::cout << __FILE__ << ":" << __LINE__ << " Error: use of temporal is not supported to construct equations";
+        std::cout << __FILE__ << ":" << __LINE__ << " Error: use of temporal is not supported to construct equations";
     inline vector_dist_expression_op<vector_dist_expression<0,vector>,boost::mpl::int_<1>,VECT_COMP> operator[](int comp)
@@ -1386,8 +1440,89 @@ public:
+/*! \brief Sub class that encapsulate a vector properties operand to be used for expressions construction
+ *  Temporal Expressions
+ * \tparam prp property involved
+ * \tparam vector involved
+ *
+ */
+template<typename T, typename memory,template <typename> class layout_base >
+class vector_dist_expression<0,openfpm::vector<aggregate<T>,memory, layout_base> > : public vector_dist_expression_impl<openfpm::vector<aggregate<T>,memory, layout_base>>
+    typedef openfpm::vector<aggregate<T>,memory, layout_base> vector;
+    typedef vector_dist_expression_impl<vector> base;
+    vector_dist_expression()
+    {
+    }
+    template<unsigned int prp2, typename vector2>
+    vector_dist_expression(const vector_dist_expression<prp2,vector2> & v_exp)
+            :base(v_exp)
+    {
+    }
+    template<typename exp1, typename exp2, unsigned int op>
+    vector_dist_expression(const vector_dist_expression_op<exp1,exp2,op> & v_exp)
+    : base(v_exp)
+    {
+    }
+    template<unsigned int prp2, typename vector2> vector & operator=(const vector_dist_expression<prp2,vector2> & v_exp)
+    {
+        return base::operator=(v_exp);
+    }
+    template<typename exp1, typename exp2, unsigned int op>
+    vector & operator=(const vector_dist_expression_op<exp1,exp2,op> & v_exp)
+    {
+        return base::operator=(v_exp);
+    }
+/*! \brief Sub class that encapsulate a GPU vector properties operand to be used for expressions construction
+ *  Temporal Expressions
+ * \tparam prp property involved
+ * \tparam vector involved
+ *
+ */
+template<typename T>
+class vector_dist_expression<0,openfpm::vector_gpu<aggregate<T>>> : public vector_dist_expression_impl<openfpm::vector_gpu<aggregate<T>>>
+    typedef openfpm::vector_gpu<aggregate<T>> vector;
+    typedef vector_dist_expression_impl<vector> base;
+    vector_dist_expression()
+    {
+    }
+    template<unsigned int prp2, typename vector2>
+    vector_dist_expression(const vector_dist_expression<prp2,vector2> & v_exp)
+            :base(v_exp)
+    {
+    }
+    template<typename exp1, typename exp2, unsigned int op>
+    vector_dist_expression(const vector_dist_expression_op<exp1,exp2,op> & v_exp)
+            : base(v_exp)
+    {
+    }
+    template<unsigned int prp2, typename vector2> vector & operator=(const vector_dist_expression<prp2,vector2> & v_exp)
+    {
+        return base::operator=(v_exp);
+    }
+    template<typename exp1, typename exp2, unsigned int op>
+    vector & operator=(const vector_dist_expression_op<exp1,exp2,op> & v_exp)
+    {
+        return base::operator=(v_exp);
+    }
+template<typename T> using texp_v = vector_dist_expression<0,openfpm::vector<aggregate<T>>>;
+template<typename T> using texp_v_gpu = vector_dist_expression<0,openfpm::vector_gpu<aggregate<T>>>;
-template<typename T> using texp_v = vector_dist_expression<0,openfpm::vector<aggregate<T>> >;
 template<typename vector, unsigned int impl>
 struct switcher_get_v
@@ -1416,7 +1551,97 @@ struct switcher_get_v<vector,comp_dev>
-/*! \brief it take an expression and create the negatove of this expression
+/*template<unsigned int, bool is_valid>
+struct get_vector_dist_expression_op
+	template<typename exp_type>
+	inline static auto get(exp_type & o1, const vect_dist_key_dx & key) -> decltype(o1.value(vect_dist_key_dx(0)))
+	{
+		return o1.value(key);
+	}
+	template<unsigned int prop, typename exp_type, typename vector_type>
+	inline static void assign(exp_type & o1, vector_type & v, const vect_dist_key_dx & key, const vect_dist_key_dx & key_orig)
+	{
+		pos_or_propL<vector_type,exp_type::prop>::value(v,key) = o1.value(key_orig);
+	}
+	template<unsigned int prop, typename vector_type>
+	inline static void assign_double(double d, vector_type & v, const vect_dist_key_dx & key)
+	{
+		pos_or_propL<vector_type,prop>::value(v,key) = d;
+	}
+struct get_vector_dist_expression_op<1,false>
+	template<typename exp_type>
+	static int get(exp_type & o1, const vect_dist_key_dx & key, const int (& comp)[1])
+	{
+		printf("ERROR: Slicer, the expression is incorrect, please check it\n");
+		return 0;
+	}
+	template<unsigned int prop, typename exp_type, typename vector_type>
+	inline static void assign(exp_type & o1, vector_type & v, const vect_dist_key_dx & key)
+	{
+		printf("ERROR: Slicer, the expression is incorrect, please check it\n");
+	}
+	template<unsigned int prop, typename vector_type>
+	inline static void assign_double(double d, vector_type & v, const vect_dist_key_dx & key)
+	{
+		printf("ERROR: Slicer, the expression is incorrect, please check it\n");
+	}
+struct get_vector_dist_expression_op<1,true>
+	template<typename exp_type>
+	static auto get(exp_type & o1, const vect_dist_key_dx & key, const int (& comp)[1]) -> decltype(o1.value(vect_dist_key_dx(0))[0])
+	{
+		return o1.value(key)[comp[0]];
+	}
+	template<unsigned int prop,typename exp_type, typename vector_type>
+	inline static void assign(exp_type & o1, vector_type & v, const vect_dist_key_dx & key, const vect_dist_key_dx & key_orig, const int (& comp)[1])
+	{
+		pos_or_propL<vector_type,prop>::value(v,key)[comp[0]] = o1.value(key_orig);
+	}
+	template<unsigned int prop, typename vector_type>
+	inline static void assign_double(double d, vector_type & v, const vect_dist_key_dx & key, const int (& comp)[1])
+	{
+		pos_or_propL<vector_type,prop>::value(v,key)[comp[0]] = d;
+	}
+struct get_vector_dist_expression_op<2,true>
+	template<typename exp_type>
+	static auto get(exp_type & o1, const vect_dist_key_dx & key, const int (& comp)[2]) -> decltype(o1.value(vect_dist_key_dx(0))[0][0])
+	{
+		return o1.value(key)[comp[0]][comp[1]];
+	}
+	template<unsigned int prop,typename exp_type, typename vector_type>
+	inline static void assign(exp_type & o1, vector_type & v, const vect_dist_key_dx & key, const vect_dist_key_dx & key_orig, const int (& comp)[2])
+	{
+		pos_or_propL<vector_type,prop>::value(v,key)[comp[0]][comp[1]] = o1.value(key_orig);
+	}
+	template<unsigned int prop, typename vector_type>
+	inline static void assign_double(double d, vector_type & v, const vect_dist_key_dx & key, const int (& comp)[2])
+	{
+		pos_or_propL<vector_type,prop>::value(v,key)[comp[0]][comp[1]] = d;
+	}
+/*! \brief it take an expression and take the component
@@ -1572,7 +1797,7 @@ public:
 	 * \return itself
-    template<typename T> vtype & operator=(const vector_dist_expression<0,openfpm::vector<aggregate<T>>> & v_exp)
+    template<typename T, typename memory> vtype & operator=(const vector_dist_expression<0,openfpm::vector<aggregate<T>,memory>> & v_exp)
@@ -1949,9 +2174,9 @@ operator+(const vector_dist_expression<prp1,v1> & va, double d)
  * \return an object that encapsulate the expression
-template<unsigned int prp1 , typename v1>
+template<typename T, unsigned int prp1, typename v1, typename sfinae = typename std::enable_if<std::is_same<T,float>::value>::type >
 inline vector_dist_expression_op<vector_dist_expression<prp1,v1>,vector_dist_expression<0,float>,VECT_SUM>
-operator+(const vector_dist_expression<prp1,v1> & va, float d)
+operator+(const vector_dist_expression<prp1,v1> & va, T d)
 	vector_dist_expression_op<vector_dist_expression<prp1,v1>,vector_dist_expression<0,float>,VECT_SUM> exp_sum(va,vector_dist_expression<0,float>(d));
@@ -2017,9 +2242,9 @@ operator+(const vector_dist_expression_op<exp1,exp2,op1> & va, double d)
  * \return an object that encapsulate the expression
-template<typename exp1 , typename exp2, unsigned int op1>
+template<typename T, typename exp1 , typename exp2, unsigned int op1, typename sfinae = typename std::enable_if<std::is_same<T,float>::value>::type >
 inline vector_dist_expression_op<vector_dist_expression_op<exp1,exp2,op1>,vector_dist_expression<0,float>,VECT_SUM>
-operator+(const vector_dist_expression_op<exp1,exp2,op1> & va, float d)
+operator+(const vector_dist_expression_op<exp1,exp2,op1> & va, T d)
 	vector_dist_expression_op<vector_dist_expression_op<exp1,exp2,op1>,vector_dist_expression<0,float>,VECT_SUM> exp_sum(va,vector_dist_expression<0,float>(d));
@@ -2153,9 +2378,10 @@ operator-(const vector_dist_expression<prp1,v1> & va, double d)
  * \return an object that encapsulate the expression
-template<unsigned int prp1, typename v1>
+//template<unsigned int prp1, typename v1>
+template<typename T, unsigned int prp1,typename v1, typename sfinae = typename std::enable_if<std::is_same<T,float>::value>::type >
 inline vector_dist_expression_op<vector_dist_expression<prp1,v1>,vector_dist_expression<0,float>,VECT_SUB>
-operator-(const vector_dist_expression<prp1,v1> & va, float d)
+operator-(const vector_dist_expression<prp1,v1> & va, T d)
 	vector_dist_expression_op<vector_dist_expression<prp1,v1>,vector_dist_expression<0,float>,VECT_SUB> exp_sum(va,vector_dist_expression<0,float>(d));
@@ -2187,9 +2413,9 @@ operator-(double d, const vector_dist_expression<prp1,v1> & vb)
  * \return an object that encapsulate the expression
-template<unsigned int prp1, typename v1>
+template<typename T, unsigned int prp1,typename v1, typename sfinae = typename std::enable_if<std::is_same<T,float>::value>::type >
 inline vector_dist_expression_op<vector_dist_expression<0,float>,vector_dist_expression<prp1,v1>,VECT_SUB>
-operator-(float d, const vector_dist_expression<prp1,v1> & vb)
+operator-(T d, const vector_dist_expression<prp1,v1> & vb)
 	vector_dist_expression_op<vector_dist_expression<0,float>,vector_dist_expression<prp1,v1>,VECT_SUB> exp_sum(vector_dist_expression<0,float>(d),vb);
@@ -2221,9 +2447,9 @@ operator*(double d, const vector_dist_expression<p2,v2> & vb)
  * \return an object that encapsulate the expression
-template<unsigned int p2, typename v2>
+template<typename T, unsigned int p2,typename v2, typename sfinae = typename std::enable_if<std::is_same<T,float>::value>::type >
 inline vector_dist_expression_op<vector_dist_expression<0,float>,vector_dist_expression<p2,v2>,VECT_MUL>
-operator*(float d, const vector_dist_expression<p2,v2> & vb)
+operator*(T d, const vector_dist_expression<p2,v2> & vb)
 	vector_dist_expression_op<vector_dist_expression<0,float>,vector_dist_expression<p2,v2>,VECT_MUL> exp_sum(vector_dist_expression<0,float>(d),vb);
@@ -2255,9 +2481,9 @@ operator*(const vector_dist_expression<p2,v2> & va, double d)
  * \return an object that encapsulate the expression
-template<unsigned int p2, typename v2>
+template<typename T, unsigned int p2,typename v2, typename sfinae = typename std::enable_if<std::is_same<T,float>::value>::type >
 inline vector_dist_expression_op<vector_dist_expression<p2,v2>,vector_dist_expression<0,float>,VECT_MUL>
-operator*(const vector_dist_expression<p2,v2> & va, float d)
+operator*(const vector_dist_expression<p2,v2> & va, T d)
 	vector_dist_expression_op<vector_dist_expression<p2,v2>,vector_dist_expression<0,float>,VECT_MUL> exp_sum(va,vector_dist_expression<0,float>(d));
@@ -2357,9 +2583,9 @@ operator*(const vector_dist_expression_op<exp1,exp2,op1> & va, double d)
  * \return an object that encapsulate the expression
-template<typename exp1 , typename exp2, unsigned int op1>
+template<typename T, typename exp1 , typename exp2, unsigned int op1, typename sfinae = typename std::enable_if<std::is_same<T,float>::value>::type >
 inline vector_dist_expression_op<vector_dist_expression_op<exp1,exp2,op1>,vector_dist_expression<0,float>,VECT_MUL>
-operator*(const vector_dist_expression_op<exp1,exp2,op1> & va, float d)
+operator*(const vector_dist_expression_op<exp1,exp2,op1> & va, T d)
 	vector_dist_expression_op<vector_dist_expression_op<exp1,exp2,op1>,vector_dist_expression<0,float>,VECT_MUL> exp_sum(va,vector_dist_expression<0,float>(d));
@@ -2391,9 +2617,9 @@ operator*(double d, const vector_dist_expression_op<exp1,exp2,op1> & vb)
  * \return an object that encapsulate the expression
-template<typename exp1 , typename exp2, unsigned int op1>
+template<typename T, typename exp1 , typename exp2, unsigned int op1, typename sfinae = typename std::enable_if<std::is_same<T,float>::value>::type >
 inline vector_dist_expression_op<vector_dist_expression<0,float>,vector_dist_expression_op<exp1,exp2,op1>,VECT_MUL>
-operator*(float d, const vector_dist_expression_op<exp1,exp2,op1> & vb)
+operator*(T d, const vector_dist_expression_op<exp1,exp2,op1> & vb)
 	vector_dist_expression_op<vector_dist_expression<0,float>,vector_dist_expression_op<exp1,exp2,op1>,VECT_MUL> exp_sum(vector_dist_expression<0,float>(d),vb);
@@ -2425,9 +2651,9 @@ operator/(const vector_dist_expression_op<exp1,exp2,op1> & va, double d)
  * \return an object that encapsulate the expression
-template<typename exp1, typename exp2, unsigned int op1>
+template<typename T, typename exp1 , typename exp2, unsigned int op1, typename sfinae = typename std::enable_if<std::is_same<T,float>::value>::type >
 inline vector_dist_expression_op<vector_dist_expression_op<exp1,exp2,op1>,vector_dist_expression<0,float>,VECT_DIV>
-operator/(const vector_dist_expression_op<exp1,exp2,op1> & va, float d)
+operator/(const vector_dist_expression_op<exp1,exp2,op1> & va, T d)
 	vector_dist_expression_op<vector_dist_expression_op<exp1,exp2,op1>,vector_dist_expression<0,float>,VECT_DIV> exp_sum(va,vector_dist_expression<0,float>(d));
@@ -2459,9 +2685,9 @@ operator/(double d, const vector_dist_expression_op<exp1,exp2,op1> & va)
  * \return an object that encapsulate the expression
-template<typename exp1, typename exp2, unsigned int op1>
+template<typename T, typename exp1 , typename exp2, unsigned int op1, typename sfinae = typename std::enable_if<std::is_same<T,float>::value>::type >
 inline vector_dist_expression_op<vector_dist_expression_op<exp1,exp2,op1>,vector_dist_expression<0,float>,VECT_DIV>
-operator/(float d, const vector_dist_expression_op<exp1,exp2,op1> & va)
+operator/(T d, const vector_dist_expression_op<exp1,exp2,op1> & va)
 	vector_dist_expression_op<vector_dist_expression_op<exp1,exp2,op1>,vector_dist_expression<0,float>,VECT_DIV> exp_sum(vector_dist_expression<0,float>(d),va);
@@ -2493,9 +2719,9 @@ operator/(const vector_dist_expression<prp1,v1> & va, double d)
  * \return an object that encapsulate the expression
-template<unsigned int prp1, typename v1>
+template<typename T, unsigned int prp1,typename v1, typename sfinae = typename std::enable_if<std::is_same<T,float>::value>::type >
 inline vector_dist_expression_op<vector_dist_expression<prp1,v1>,vector_dist_expression<0,float>,VECT_DIV>
-operator/(const vector_dist_expression<prp1,v1> & va, float d)
+operator/(const vector_dist_expression<prp1,v1> & va, T d)
 	vector_dist_expression_op<vector_dist_expression<prp1,v1>,vector_dist_expression<0,float>,VECT_DIV> exp_sum(va,vector_dist_expression<0,float>(d));
@@ -2527,9 +2753,9 @@ operator/(double d, const vector_dist_expression<prp1,v1> & va)
  * \return an object that encapsulate the expression
-template<unsigned int prp1, typename v1>
+template<typename T, unsigned int prp1,typename v1, typename sfinae = typename std::enable_if<std::is_same<T,float>::value>::type >
 inline vector_dist_expression_op<vector_dist_expression<0,float>,vector_dist_expression<prp1,v1>,VECT_DIV>
-operator/(float d, const vector_dist_expression<prp1,v1> & va)
+operator/(T d, const vector_dist_expression<prp1,v1> & va)
 	vector_dist_expression_op<vector_dist_expression<0,float>,vector_dist_expression<prp1,v1>,VECT_DIV> exp_sum(vector_dist_expression<0,float>(d),va);
diff --git a/src/Solvers/petsc_solver.hpp b/src/Solvers/petsc_solver.hpp
index 74c60035..6bb051f3 100644
--- a/src/Solvers/petsc_solver.hpp
+++ b/src/Solvers/petsc_solver.hpp
@@ -653,12 +653,12 @@ class petsc_solver<double>
 		// We set the Matrix operators
-		PETSC_SAFE_CALL(KSPSetOperators(ksp,A_,A_));
+    PETSC_SAFE_CALL(KSPSetOperators(ksp,A_,A_));
-		// if we are on on best solve set-up a monitor function
+    // if we are on on best solve set-up a monitor function
-		PETSC_SAFE_CALL(KSPSetFromOptions(ksp));
+    PETSC_SAFE_CALL(KSPSetFromOptions(ksp));
+    //PETSC_SAFE_CALL(KSPSetUp(ksp));
 		// Solve the system
@@ -1329,6 +1329,122 @@ public:
 		return x;
+    /*! \brief Here we invert the matrix and solve the system
+     *
+     * \param A sparse matrix
+     * \param b vector
+     * \param x solution and initial guess
+     *
+     * \return true if succeed
+     *
+     */
+    Vector<double,PETSC_BASE> solve(SparseMatrix<double,int,PETSC_BASE> & A, Vector<double,PETSC_BASE> & x, const Vector<double,PETSC_BASE> & b)
+    {
+        Mat & A_ = A.getMat();
+        const Vec & b_ = b.getVec();
+        Vec & x_ = x.getVec();
+       /* // We set the size of x according to the Matrix A
+        PetscInt row;
+        PetscInt col;
+        PetscInt row_loc;
+        PetscInt col_loc;*/
+        PETSC_SAFE_CALL(KSPSetInitialGuessNonzero(ksp,PETSC_TRUE));
+        /*PETSC_SAFE_CALL(MatGetSize(A_,&row,&col));
+        PETSC_SAFE_CALL(MatGetLocalSize(A_,&row_loc,&col_loc));*/
+        pre_solve_impl(A_,b_,x_);
+        solve_simple(A_,b_,x_);
+        x.update();
+        return x;
+        /*pre_solve_impl(A_,b_,x_);
+        solve_simple(A_,b_,x_);
+        x.update();
+        return true;*/
+    }
+    /*! \brief Here we invert the matrix and solve the system with previous operator
+     *
+     * \param A sparse matrix
+     * \param b vector
+     * \param x solution and initial guess
+     *
+     * \return true if succeed
+     *
+     */
+    Vector<double,PETSC_BASE> solve_successive(const Vector<double,PETSC_BASE> & b,bool initial_guess = false)
+    {
+        const Vec & b_ = b.getVec();
+        // We set the size of x according to the Matrix A
+        PetscInt row;
+        PetscInt row_loc;
+		PETSC_SAFE_CALL(KSPSetInitialGuessNonzero(ksp,PETSC_FALSE));
+        PETSC_SAFE_CALL(VecGetSize(b_,&row));
+        PETSC_SAFE_CALL(VecGetLocalSize(b_,&row_loc));
+        Vector<double,PETSC_BASE> x(row,row_loc);
+		Vec & x_ = x.getVec();
+        solve_simple(b_,x_);
+        x.update();
+        return x;
+        /*pre_solve_impl(A_,b_,x_);
+        solve_simple(A_,b_,x_);
+        x.update();
+        return true;*/
+    }
+    /*! \brief Here we invert the matrix and solve the system with previous operator and initial guess
+     *
+     * \param A sparse matrix
+     * \param b vector
+     * \param x solution and initial guess
+     *
+     * \return true if succeed
+     *
+     */
+    Vector<double,PETSC_BASE> solve_successive(Vector<double,PETSC_BASE> & x, const Vector<double,PETSC_BASE> & b)
+    {
+        const Vec & b_ = b.getVec();
+        Vec & x_ = x.getVec();
+       /* // We set the size of x according to the Matrix A
+        PetscInt row;
+        PetscInt col;
+        PetscInt row_loc;
+        PetscInt col_loc;*/
+        PETSC_SAFE_CALL(KSPSetInitialGuessNonzero(ksp,PETSC_TRUE));
+        /*PETSC_SAFE_CALL(MatGetSize(A_,&row,&col));
+        PETSC_SAFE_CALL(MatGetLocalSize(A_,&row_loc,&col_loc));*/
+        solve_simple(b_,x_);
+        x.update();
+        return x;
+        /*pre_solve_impl(A_,b_,x_);
+        solve_simple(A_,b_,x_);
+        x.update();
+        return true;*/
+    }
     /*! \brief Here we invert the matrix and solve the system using a Nullspace for Neumann BC
      *  \warning umfpack is not a parallel solver, this function work only with one processor
@@ -1344,7 +1460,7 @@ public:
      * \return the solution
-    Vector<double,PETSC_BASE> with_constant_nullspace_solve(SparseMatrix<double,int,PETSC_BASE> & A, const Vector<double,PETSC_BASE> & b, bool initial_guess = false)
+    Vector<double,PETSC_BASE> with_nullspace_solve(SparseMatrix<double,int,PETSC_BASE> & A, const Vector<double,PETSC_BASE> & b, bool initial_guess = false,bool symmetric = false)
         Mat & A_ = A.getMat();
         const Vec & b_ = b.getVec();
@@ -1354,9 +1470,6 @@ public:
         PetscInt col;
         PetscInt row_loc;
         PetscInt col_loc;
-        MatNullSpace nullspace;
@@ -1365,14 +1478,48 @@ public:
         Vector<double,PETSC_BASE> x(row,row_loc);
         Vec & x_ = x.getVec();
-        //Removing Null Space from RHS
-        PETSC_SAFE_CALL(MatNullSpaceCreate(PETSC_COMM_WORLD,PETSC_TRUE,0,0,&nullspace));
-        PETSC_SAFE_CALL(MatNullSpaceRemove(nullspace,b_));
-        PETSC_SAFE_CALL(MatNullSpaceDestroy(&nullspace));
+        PETSC_SAFE_CALL(KSPSetFromOptions(ksp));
+        PETSC_SAFE_CALL(KSPSetOperators(ksp,A_,A_));
+        PETSC_SAFE_CALL(KSPSolve(ksp,b_,x_));
-        pre_solve_impl(A_,b_,x_);
-        solve_simple(A_,b_,x_);
+        Mat      F, work, V;
+        PetscInt N, rows;
+        /* Determine factorability */
+        PETSC_SAFE_CALL(MatGetLocalSize(A_, &rows, NULL));
+        /* Set MUMPS options, see MUMPS documentation for more information */
+        PETSC_SAFE_CALL(MatMumpsSetIcntl(F, 24, 1));
+        PETSC_SAFE_CALL(MatMumpsSetIcntl(F, 25, 1));
+        /* Perform factorization */
+        PETSC_SAFE_CALL(MatLUFactorSymbolic(F, A_, NULL, NULL, NULL));
+        PETSC_SAFE_CALL(MatLUFactorNumeric(F, A_, NULL));
+        /* This is the dimension of the null space */
+        PETSC_SAFE_CALL(MatMumpsGetInfog(F, 28, &N));
+        /* This will contain the null space in the columns */
+        PETSC_SAFE_CALL(MatDuplicate(V, MAT_DO_NOT_COPY_VALUES, &work));
+        PETSC_SAFE_CALL(MatMatSolve(F, work, V));
+        std::cout<<"Dimension:" << N;
+        Vec nvec[N];
+        for(int i=0;i<N;i++)
+        {
+            PETSC_SAFE_CALL(MatGetColumnVector(V,nvec[i],i));
+        }
+        MatNullSpace  nullspace;
+        PETSC_SAFE_CALL(MatNullSpaceCreate(PETSC_COMM_WORLD,PETSC_TRUE,N,nvec,&nullspace));
+        PETSC_SAFE_CALL(MatSetTransposeNullSpace(A_,nullspace));
+        PETSC_SAFE_CALL(MatSetNullSpace(A_,nullspace));
+        PETSC_SAFE_CALL(MatNullSpaceDestroy(&nullspace));
+        PETSC_SAFE_CALL(KSPSetOperators(ksp,A_,A_));
+        PETSC_SAFE_CALL(KSPSetFromOptions(ksp));
+        PETSC_SAFE_CALL(KSPSolve(ksp,b_,x_));
         return x;
@@ -1419,30 +1566,6 @@ public:
 		return getSolNormError(b.getVec(),x.getVec(),ksp);
-	/*! \brief Here we invert the matrix and solve the system
-	 *
-	 * \param A sparse matrix
-	 * \param b vector
-	 * \param x solution and initial guess
-	 *
-	 * \return true if succeed
-	 *
-	 */
-	bool solve(SparseMatrix<double,int,PETSC_BASE> & A, Vector<double,PETSC_BASE> & x, const Vector<double,PETSC_BASE> & b)
-	{
-		Mat & A_ = A.getMat();
-		const Vec & b_ = b.getVec();
-		Vec & x_ = x.getVec();
-		PETSC_SAFE_CALL(KSPSetInitialGuessNonzero(ksp,PETSC_TRUE));
-		pre_solve_impl(A_,b_,x_);
-		solve_simple(A_,b_,x_);
-		x.update();
-		return true;
-	}
 	/*! \brief Here we invert the matrix and solve the system
 	 * \param b vector
diff --git a/src/interpolation/interpolation_unit_tests.cpp b/src/interpolation/interpolation_unit_tests.cpp
index 10844c4d..27038520 100644
--- a/src/interpolation/interpolation_unit_tests.cpp
+++ b/src/interpolation/interpolation_unit_tests.cpp
@@ -17,6 +17,8 @@
 #include "interpolation.hpp"
 #include <boost/math/special_functions/pow.hpp>
 #include <Vector/vector_dist.hpp>
+#include <Operators/Vector/vector_dist_operators.hpp>
+#include <FiniteDifference/FD_op.hpp>
 #include <Grid/grid_dist_id.hpp>
 BOOST_AUTO_TEST_SUITE( interpolation_test )
@@ -801,6 +803,447 @@ BOOST_AUTO_TEST_CASE( int_kernel_test )
+BOOST_AUTO_TEST_CASE( int_kernel_test_double)
+		mp4_kernel<double> mp4;
+		double tot = 0.0;
+		// Check momenta 0
+		tot += mp4.value(-1.3,0);
+		tot += mp4.value(-0.3,1);
+		tot += mp4.value(0.7,2);
+		tot += mp4.value(1.7,3);
+		BOOST_REQUIRE_CLOSE(tot,1.0,0.001);
+		// Check momenta 1
+		tot = 0.0;
+		tot += -1.3*mp4.value(-1.3,0);
+		tot += -0.3*mp4.value(-0.3,1);
+		tot += 0.7*mp4.value(0.7,2);
+		tot += 1.7*mp4.value(1.7,3);
+		BOOST_REQUIRE_SMALL(tot,0.001);
+		// Check momenta 2
+		tot = 0.0;
+		tot += (1.3)*(1.3)*mp4.value(-1.3,0);
+		tot += (0.3)*(0.3)*mp4.value(-0.3,1);
+		tot += (0.7)*(0.7)*mp4.value(0.7,2);
+		tot += (1.7)*(1.7)*mp4.value(1.7,3);
+		BOOST_REQUIRE_SMALL(tot,0.001);
+		//////// Check zeta 1
+		tot = 0.0;
+		z_kernel<double,1> zk1;
+		tot += zk1.value(-0.3,0);
+		tot += zk1.value(0.7,1);
+		BOOST_REQUIRE_CLOSE(tot,1.0,0.001);
+		//////// zeta 2 is equivalent to mp4 we do not test
+		//////// zeta 3
+		z_kernel<double,3> zk3;
+		tot = 0.0;
+		// Check momenta 0
+		tot += zk3.value(-2.3,0);
+		tot += zk3.value(-1.3,1);
+		tot += zk3.value(-0.3,2);
+		tot += zk3.value(0.7,3);
+		tot += zk3.value(1.7,4);
+		tot += zk3.value(2.7,5);
+		BOOST_REQUIRE_CLOSE(tot,1.0,0.001);
+		// Check momenta 1
+		tot = 0.0;
+		tot += -2.3*zk3.value(-2.3,0);
+		tot += -1.3*zk3.value(-1.3,1);
+		tot += -0.3*zk3.value(-0.3,2);
+		tot += 0.7*zk3.value(0.7,3);
+		tot += 1.7*zk3.value(1.7,4);
+		tot += 2.7*zk3.value(2.7,5);
+		BOOST_REQUIRE_SMALL(tot,0.001);
+		// Check momenta 2
+		tot = 0.0;
+		tot += 2.3*2.3*zk3.value(-2.3,0);
+		tot += 1.3*1.3*zk3.value(-1.3,1);
+		tot += 0.3*0.3*zk3.value(-0.3,2);
+		tot += 0.7*0.7*zk3.value(0.7,3);
+		tot += 1.7*1.7*zk3.value(1.7,4);
+		tot += 2.7*2.7*zk3.value(2.7,5);
+		BOOST_REQUIRE_SMALL(tot,0.001);
+		// Check momenta 3
+		tot = 0.0;
+		tot += -2.3*-2.3*-2.3*zk3.value(-2.3,0);
+		tot += -1.3*-1.3*-1.3*zk3.value(-1.3,1);
+		tot += -0.3*-0.3*-0.3*zk3.value(-0.3,2);
+		tot += 0.7*0.7*0.7*zk3.value(0.7,3);
+		tot += 1.7*1.7*1.7*zk3.value(1.7,4);
+		tot += 2.7*2.7*2.7*zk3.value(2.7,5);
+		BOOST_REQUIRE_SMALL(tot,0.001);
+		// z4
+		z_kernel<double,4> zk4;
+		// Check momenta 0
+		tot = 0.0;
+		tot += zk4.value(-3.3,0);
+		tot += zk4.value(-2.3,1);
+		tot += zk4.value(-1.3,2);
+		tot += zk4.value(-0.3,3);
+		tot += zk4.value(0.7,4);
+		tot += zk4.value(1.7,5);
+		tot += zk4.value(2.7,6);
+		tot += zk4.value(3.7,7);
+		BOOST_REQUIRE_CLOSE(tot,1.0,0.001);
+		// Check momenta 1
+		tot = 0.0;
+		tot += -3.3*zk4.value(-3.3,0);
+		tot += -2.3*zk4.value(-2.3,1);
+		tot += -1.3*zk4.value(-1.3,2);
+		tot += -0.3*zk4.value(-0.3,3);
+		tot += 0.7*zk4.value(0.7,4);
+		tot += 1.7*zk4.value(1.7,5);
+		tot += 2.7*zk4.value(2.7,6);
+		tot += 3.7*zk4.value(3.7,7);
+		BOOST_REQUIRE_SMALL(tot,0.001);
+		// Check momenta 2
+		tot = 0.0;
+		tot += 3.3*3.3*zk4.value(-3.3,0);
+		tot += 2.3*2.3*zk4.value(-2.3,1);
+		tot += 1.3*1.3*zk4.value(-1.3,2);
+		tot += 0.3*0.3*zk4.value(-0.3,3);
+		tot += 0.7*0.7*zk4.value(0.7,4);
+		tot += 1.7*1.7*zk4.value(1.7,5);
+		tot += 2.7*2.7*zk4.value(2.7,6);
+		tot += 3.7*3.7*zk4.value(3.7,7);
+		BOOST_REQUIRE_SMALL(tot,0.001);
+		// Check momenta 3
+		tot = 0.0;
+		tot += -3.3*-3.3*-3.3*zk4.value(-3.3,0);
+		tot += -2.3*-2.3*-2.3*zk4.value(-2.3,1);
+		tot += -1.3*-1.3*-1.3*zk4.value(-1.3,2);
+		tot += -0.3*-0.3*-0.3*zk4.value(-0.3,3);
+		tot += 0.7*0.7*0.7*zk4.value(0.7,4);
+		tot += 1.7*1.7*1.7*zk4.value(1.7,5);
+		tot += 2.7*2.7*2.7*zk4.value(2.7,6);
+		tot += 3.7*3.7*3.7*zk4.value(3.7,7);
+		BOOST_REQUIRE_SMALL(tot,0.001);
+		// Check momenta 4
+		tot = 0.0;
+		tot += -3.3*-3.3*-3.3*-3.3*zk4.value(-3.3,0);
+		tot += -2.3*-2.3*-2.3*-2.3*zk4.value(-2.3,1);
+		tot += -1.3*-1.3*-1.3*-1.3*zk4.value(-1.3,2);
+		tot += -0.3*-0.3*-0.3*-0.3*zk4.value(-0.3,3);
+		tot += 0.7*0.7*0.7*0.7*zk4.value(0.7,4);
+		tot += 1.7*1.7*1.7*1.7*zk4.value(1.7,5);
+		tot += 2.7*2.7*2.7*2.7*zk4.value(2.7,6);
+		tot += 3.7*3.7*3.7*3.7*zk4.value(3.7,7);
+		BOOST_REQUIRE_SMALL(tot,0.001);
+        size_t res;
+        std::cout<<"Enter Res:";
+        std::cin>>res;
+        const size_t sz[2] = {res,res};
+        Box<2, double> box({0, 0}, {2 * M_PI, 2 * M_PI});
+        size_t bc[2] = {PERIODIC, PERIODIC};
+        double spacing[2];
+        spacing[0] = 2 *M_PI / (sz[0]);
+        spacing[1] = 2 *M_PI / (sz[1]);
+        Ghost<2,long int> gg(3);
+        double rCut = 3.0 * spacing[0];
+        Ghost<2, double> ghost(rCut);
+        vector_dist<2, double, aggregate<double, double>> particles(0, box,bc,ghost);
+        grid_dist_id<2, double, aggregate<double, double>> gd(particles.getDecomposition(),sz,gg);
+        double sigma2 = spacing[0] / (40.0);
+        std::normal_distribution<> gaussian{0, sigma2};
+        std::mt19937 rng{6666666};
+        auto it = particles.getGridIterator(sz);
+        while (it.isNext()) {
+            particles.add();
+            auto key = it.get();
+            double x=key.get(0) * spacing[0] + gaussian(rng);
+            double y=key.get(1) * spacing[1] + gaussian(rng);
+            particles.getLastPos()[0] = x;
+            particles.getLastPos()[1] = y;
+            // Here fill the function value
+            particles.template getLastProp<0>() = sin(particles.getLastPos()[0]) + sin(particles.getLastPos()[0]);
+            ++it;
+        }
+        particles.ghost_get<0>();
+        auto itG=gd.getDomainIterator();
+        while(itG.isNext())
+        {
+            auto key=itG.get();
+            gd.template getProp<1>(key) = sin(gd.getPos(key)[0]) + sin(gd.getPos(key)[0]);
+            ++itG;
+        }
+        particles.write("InitP");
+        gd.write("Grid");
+        auto Pu=getV<0>(particles);
+        auto Gu=FD::getV<0>(gd);
+        typedef vector_dist<2, double, aggregate<double, double>> particle_type;
+        typedef grid_dist_id<2, double, aggregate<double, double>> gd_type;
+        typedef z_kernel<double,4> kerneltype; //mp4_kernel<double>
+        typedef lambda4_4kernel<double> kerneltype2;
+        interpolate<particle_type,gd_type,kerneltype2> inte2m(particles,gd);
+        Gu=0;
+        gd.ghost_get<0>();
+        inte2m.template p2m<0,0>(particles,gd);
+        gd.template ghost_put<add_,0>();
+        gd.ghost_get<0>();
+        particles.write("InitPAfter");
+        gd.write("GridAfter");
+        auto it2 = gd.getDomainIterator();
+        double worst = 0.0;
+        while (it2.isNext()) {
+            auto p = it2.get();
+            if (fabs(gd.template getProp<1>(p) - gd.template  getProp<0>(p)) > worst) {
+                worst = fabs(gd. template  getProp<1>(p) - gd.template  getProp<0>(p));
+            }
+            ++it2;
+        }
+        std::cout<<worst<<std::endl;
+        //BOOST_REQUIRE(worst < 0.03);
+        size_t res;
+        std::cout<<"Enter Res:";
+        std::cin>>res;
+        const size_t sz[2] = {res,res};
+        Box<2, double> box({0, 0}, {2 * M_PI, 2 * M_PI});
+        size_t bc[2] = {PERIODIC, PERIODIC};
+        double spacing[2];
+        spacing[0] = 2 *M_PI / (sz[0]);
+        spacing[1] = 2 *M_PI / (sz[1]);
+        Ghost<2,long int> gg(3);
+        double rCut = 3.0 * spacing[0];
+        Ghost<2, double> ghost(rCut);
+        vector_dist<2, double, aggregate<double, double>> particles(0, box,bc,ghost);
+        grid_dist_id<2, double, aggregate<double, double>> gd(particles.getDecomposition(),sz,gg);
+        double sigma2 = spacing[0] * spacing[1];
+        std::normal_distribution<> gaussian{0, sigma2};
+        std::mt19937 rng{6666666};
+        auto it = particles.getGridIterator(sz);
+        while (it.isNext()) {
+            particles.add();
+            auto key = it.get();
+            double x=key.get(0) * spacing[0] + gaussian(rng);
+            double y=key.get(1) * spacing[1] + gaussian(rng);
+            particles.getLastPos()[0] = x;
+            particles.getLastPos()[1] = y;
+            // Here fill the function value
+            particles.template getLastProp<0>() = 0;
+            particles.template getLastProp<1>() = sin(particles.getLastPos()[0]) + sin(particles.getLastPos()[0]);
+            ++it;
+        }
+        particles.ghost_get<0>();
+        auto itG=gd.getDomainIterator();
+        while(itG.isNext())
+        {
+            auto key=itG.get();
+            gd.template getProp<1>(key) = sin(gd.getPos(key)[0]) + sin(gd.getPos(key)[0]);
+            ++itG;
+        }
+        particles.write("InitP");
+        gd.write("Grid");
+        auto Pu=getV<0>(particles);
+        auto Gu=FD::getV<0>(gd);
+        typedef vector_dist<2, double, aggregate<double, double>> particle_type;
+        typedef grid_dist_id<2, double, aggregate<double, double>> gd_type;
+        typedef z_kernel<double,4> kerneltype; //mp4_kernel<double>
+        typedef lambda4_4kernel<double> kerneltype2;
+        interpolate<particle_type,gd_type,kerneltype2> inte2m(particles,gd);
+        Gu=0;
+        gd.ghost_get<0>();
+        inte2m.template m2p<1,0>(gd,particles);
+        particles.ghost_get<0>();
+        particles.write("InitPAfter");
+        gd.write("GridAfter");
+        auto it2 = particles.getDomainIterator();
+        double worst = 0.0;
+        while (it2.isNext()) {
+            auto p = it2.get();
+            if (fabs(particles.template getProp<1>(p) - particles.template  getProp<0>(p)) > worst) {
+                worst = fabs(particles. template  getProp<1>(p) - particles.template  getProp<0>(p));
+            }
+            ++it2;
+        }
+        std::cout<<worst<<std::endl;
+        //BOOST_REQUIRE(worst < 0.03);
+        size_t res;
+        std::cin>>res;
+        const size_t sz[2] = {res,res};
+        Box<2, double> box({0, 0}, {2 * M_PI, 2 * M_PI});
+        size_t bc[2] = {PERIODIC, PERIODIC};
+        double spacing[2];
+        spacing[0] = 2 *M_PI / (sz[0]);
+        spacing[1] = 2 *M_PI / (sz[1]);
+        Ghost<2,long int> gg(3);
+        double rCut = 3.0 * spacing[0];
+        Ghost<2, double> ghost(rCut);
+        vector_dist<2, double, aggregate<double, VectorS<2, double>>> particles(0, box,bc,ghost),particlesMoved(0, box,bc,ghost);
+        grid_dist_id<2, double, aggregate<double, VectorS<2, double>>> gd(particles.getDecomposition(),sz,gg);
+        auto it = particles.getGridIterator(sz);
+        while (it.isNext()) {
+            particles.add();
+            auto key = it.get();
+            double x=key.get(0) * spacing[0];
+            double y=key.get(1) * spacing[1];
+            particles.getLastPos()[0] = x;
+            particles.getLastPos()[1] = y;
+            // Here fill the function value
+            particles.template getLastProp<1>() = 1.0;
+            particles.template getLastProp<1>() = 0;
+            particles.template getLastProp<0>() = 0;
+            if((x-3.14)*(x-3.14)+(y-3.14)*(y-3.14)<1)
+            {
+                particles.template getLastProp<0>() = 1;
+            }
+            ++it;
+        }
+        particles.ghost_get<0>();
+        particles.write("InitP");
+        gd.write("Grid");
+        auto Pu=getV<0>(particles);
+        auto Pmu=getV<0>(particlesMoved);
+        auto Gu=FD::getV<0>(gd);
+        typedef vector_dist<2, double, aggregate<double, VectorS<2, double>>> vd;
+        typedef grid_dist_id<2, double, aggregate<double, VectorS<2, double>>> gd_type;
+        interpolate<vd,gd_type,mp4_kernel<double>> inte2m(particlesMoved,gd);
+        interpolate<vd,gd_type,mp4_kernel<double>> inte2p(particles,gd);
+        double t=0,dt=0.5;
+        int ctr=0;
+        while(t<10)
+        {
+            particlesMoved.clear();
+            auto it=particles.getDomainIterator();
+            while(it.isNext())
+            {
+                auto p=it.get();
+                double xp=particles.getPos(p)[0],yp=particles.getPos(p)[1];
+                particlesMoved.add();
+                particlesMoved.getLastPos()[0] = xp+dt*particles.getProp<1>(p)[0];
+                particlesMoved.getLastPos()[1] = yp+dt*particles.getProp<1>(p)[1];
+                particlesMoved.getLastProp<0>() = particles.getProp<0>(p);
+                ++it;
+            }
+  ;
+            particlesMoved.ghost_get<0>();
+            Gu=0;
+            gd.ghost_get<0>();
+            inte2m.template p2m<0,0>(particlesMoved,gd);
+            gd.template ghost_put<add_,0>();
+            gd.ghost_get<0>();
+            Pu=0;
+            inte2p.template m2p<0,0>(gd,particles);
+            particles.write_frame("InitP",ctr);
+            gd.write_frame("Grid",ctr);
+            ctr++;
+            t+=dt;
+        }*/
+        auto it2 = domain.getDomainIterator();
+        double worst = 0.0;
+        while (it2.isNext()) {
+            auto p = it2.get();
+            if (fabs(domain.getProp<1>(p) - domain.getProp<2>(p)) > worst) {
+                worst = fabs(domain.getProp<1>(p) - domain.getProp<2>(p));
+            }
+            ++it2;
+        }
+ //       domain.deleteGhost();
+ //       BOOST_REQUIRE(worst < 0.03);
diff --git a/src/interpolation/lambda_kernel.hpp b/src/interpolation/lambda_kernel.hpp
index 1460d641..de447199 100644
--- a/src/interpolation/lambda_kernel.hpp
+++ b/src/interpolation/lambda_kernel.hpp
@@ -8,17 +8,17 @@
 #include <iostream>
 template<typename st>
-double horner(const std::array<double,10> &v, st x)
+double horner(const double *v, st x)
     st s = 0;
     for(int i=9; i>=0; i--)
         s = v[i] + (s * x);
     return s;
-constexpr std::array<double,10> c1={(1.0 * 12.0) , (0.0 * 12.0) , -(5.0 * 3.0) , (0.0 * 12.0) , (1.0 * 3.0) , -(100.0 * 4.0) , (455.0 * 3.0) , -(295.0 * 6.0) , (345.0 * 3.0) , -(115.0 * 2.0) };
-constexpr std::array<double,10> c2={-(199.0 * 24.0) , (5485.0 * 6.0) , -(32975.0 * 3.0) , (28425.0 * 6.0) , -(61953.0 * 3.0) , (33175.0 * 4.0) , -(20685.0 * 3.0) , (3055.0 * 6.0) , -(1035.0 * 3.0) , (115.0 * 2.0) };
-constexpr std::array<double,10> c3={(5913.0 * 24.0) , -(89235.0 * 6.0) , (297585.0 * 3.0) , -(143895.0 * 6.0) , (177871.0 * 3.0) , -(54641.0 * 4.0) , (19775.0 * 3.0) , -(1715.0 * 6.0) , (345.0 * 3.0) , -(23.0 * 2.0)};
+//These needs to be Checked
+double c1[10]={(1.0 * 12.0) , (0.0 * 12.0) , -(5.0 * 3.0) , (0.0 * 12.0) , (1.0 * 3.0) , -(100.0 * 4.0) , (455.0 * 3.0) , -(295.0 * 6.0) , (345.0 * 3.0) , -(115.0 * 2.0) };
+double c2[10]={-(199.0 * 24.0) , (5485.0 * 6.0) , -(32975.0 * 3.0) , (28425.0 * 6.0) , -(61953.0 * 3.0) , (33175.0 * 4.0) , -(20685.0 * 3.0) , (3055.0 * 6.0) , -(1035.0 * 3.0) , (115.0 * 2.0) };
+double c3[10]={(5913.0 * 24.0) , -(89235.0 * 6.0) , (297585.0 * 3.0) , -(143895.0 * 6.0) , (177871.0 * 3.0) , -(54641.0 * 4.0) , (19775.0 * 3.0) , -(1715.0 * 6.0) , (345.0 * 3.0) , -(23.0 * 2.0)};
 template<typename st>
@@ -44,4 +44,33 @@ public:
+template<typename st>
+double horner22(const double *v, st x)
+    st s = 0;
+    for(int i=5; i>=0; i--)
+        s = v[i] + (s * x);
+    return s;
+double c221[6]={1.0,0.0,-1.0,-4.5,7.5,-3.0};
+double c222[6]={-4.0,18.0,-29.0,21.5,-7.5,1.0};
+template<typename st>
+class lambda2_2kernel
+    static const int np = 6;
+    static inline st value(st x, size_t i)
+    {
+        if (i == 0)
+            return horner22(c221, -x);
+        else if (i == 1)
+            return horner22(c222, -x);
+        return 0.0;
+    }
diff --git a/src/level_set/closest_point/closest_point.hpp b/src/level_set/closest_point/closest_point.hpp
index 293ae493..f99f4e2a 100644
--- a/src/level_set/closest_point/closest_point.hpp
+++ b/src/level_set/closest_point/closest_point.hpp
@@ -17,6 +17,7 @@
 #ifndef __CLOSEST_POINT_HPP__
 #define __CLOSEST_POINT_HPP__
+#include "Grid/grid_dist_key.hpp"
 #include "algoim_hocp.hpp"
 // Width of extra padding around each grid patch needed to correctly construct kDTree in Algoim.
@@ -26,21 +27,20 @@ constexpr int algoim_padding = 4;
  * @file closest_point.hpp
  * @struct AlgoimWrapper
- * @tparam grid_type Type of the grid container
- * @tparam grid_key_type Type of the key for the grid container
- * @tparam dim Dimension of the space
  * @tparam wrapping_field Property id on the grid for the field to be wrapped
+ * @tparam grid_type Type of the grid container
+ * 
-template<typename grid_type, typename grid_key_type, unsigned int dim, size_t wrapping_field>
+template<size_t wrapping_field, typename grid_type, typename wrapping_field_type = typename boost::mpl::at<typename grid_type::value_type::type,boost::mpl::int_<wrapping_field>>::type>
 struct AlgoimWrapper
+    const static unsigned int dim = grid_type::dims;
     grid_type &gd;
     int patch_id;
     AlgoimWrapper(grid_type& ls_grid, const int pid) : gd(ls_grid), patch_id(pid) {}
     //! Call operator for the wrapper.
-    double operator() (const blitz::TinyVector<int,dim> idx) const
+    double operator() (const blitz::TinyVector<int, dim> idx) const
         long int local_key[dim];
@@ -50,28 +50,168 @@ struct AlgoimWrapper
             local_key[d] = idx(d) - algoim_padding;
         // Generate OpenFPM grid_key object from local grid indices
-        grid_key_type grid_key(patch_id, grid_key_dx<dim> (local_key) + ghost_offset);
+        grid_dist_key_dx<dim> grid_key(patch_id, grid_key_dx<dim> (local_key) + ghost_offset);
         return gd.template get<wrapping_field>(grid_key);
+  template<size_t extend_field_temp, int poly_order, typename coord_type, typename dx_type, typename pos_type, typename key_type>
+  void extend(coord_type coord, dx_type dx, pos_type pos, key_type key) {
+    using Poly = typename Algoim::StencilPoly<dim, poly_order>::T_Poly;
+    Poly field_poly = Poly(coord, *this, dx);
+    // Extension is first done to the temporary field. Otherwise interpolation will be affected.
+    gd.template get<extend_field_temp>(key) = field_poly(pos);
+  }
+template<size_t wrapping_field, typename grid_type, typename wrapping_field_type, size_t N1>
+struct AlgoimWrapper<wrapping_field,grid_type,wrapping_field_type[N1]>
+    const static unsigned int dim = grid_type::dims;
+    grid_type &gd;
+    int patch_id;
+  size_t comp_i;
+    AlgoimWrapper(grid_type& ls_grid, const int pid) : gd(ls_grid), patch_id(pid) {}
+    //! Call operator for the wrapper.
+    double operator() (const blitz::TinyVector<int, dim> idx) const
+    {
+        long int local_key[dim];
+        auto ghost_offset = gd.getLocalGridsInfo().get(patch_id).Dbox.getKP1();
+        for (int d = 0; d < dim; ++d)
+            local_key[d] = idx(d) - algoim_padding;
+        // Generate OpenFPM grid_key object from local grid indices
+        grid_dist_key_dx<dim> grid_key(patch_id, grid_key_dx<dim> (local_key) + ghost_offset);
+        return gd.template get<wrapping_field>(grid_key)[comp_i];
+    }
+  template<size_t extend_field_temp, int poly_order, typename coord_type, typename dx_type, typename pos_type, typename key_type>
+  void extend(coord_type coord, dx_type dx, pos_type pos, key_type key) {
+    using Poly = typename Algoim::StencilPoly<dim, poly_order>::T_Poly;
+    for (int i = 0; i < N1; ++i) {
+      comp_i = i;
+      Poly field_poly = Poly(coord, *this, dx);
+      // Extension is first done to the temporary field. Otherwise interpolation will be affected.
+      gd.template get<extend_field_temp>(key)[i] = field_poly(pos);
+    }
+  }
+template<size_t wrapping_field, typename grid_type, typename wrapping_field_type, size_t N1, size_t N2>
+struct AlgoimWrapper<wrapping_field,grid_type,wrapping_field_type[N1][N2]>
+    const static unsigned int dim = grid_type::dims;
+    grid_type &gd;
+    int patch_id;
+  size_t comp_i, comp_j;
+    AlgoimWrapper(grid_type& ls_grid, const int pid) : gd(ls_grid), patch_id(pid) {}
+    //! Call operator for the wrapper.
+    double operator() (const blitz::TinyVector<int, dim> idx) const
+    {
+        long int local_key[dim];
+        auto ghost_offset = gd.getLocalGridsInfo().get(patch_id).Dbox.getKP1();
+        for (int d = 0; d < dim; ++d)
+            local_key[d] = idx(d) - algoim_padding;
+        // Generate OpenFPM grid_key object from local grid indices
+        grid_dist_key_dx<dim> grid_key(patch_id, grid_key_dx<dim> (local_key) + ghost_offset);
+        return gd.template get<wrapping_field>(grid_key)[comp_i][comp_j];
+    }
+  template<size_t extend_field_temp, int poly_order, typename coord_type, typename dx_type, typename pos_type, typename key_type>
+  void extend(coord_type coord, dx_type dx, pos_type pos, key_type key) {
+    using Poly = typename Algoim::StencilPoly<grid_type::dims, poly_order>::T_Poly;
+    for (int i = 0; i < N1; ++i) {
+      for (int j = 0; j < N2; ++j) {
+	comp_i = i;
+	comp_j = j;
+	Poly field_poly = Poly(coord, *this, dx);
+	// Extension is first done to the temporary field. Otherwise interpolation will be affected.
+	gd.template get<extend_field_temp>(key)[i][j] = field_poly(pos);
+      }
+    }
+  }
+template<size_t wrapping_field, typename grid_type, typename wrapping_field_type, size_t N1, size_t N2, size_t N3>
+struct AlgoimWrapper<wrapping_field,grid_type,wrapping_field_type[N1][N2][N3]>
+    const static unsigned int dim = grid_type::dims;
+    grid_type &gd;
+    int patch_id;
+  size_t comp_i, comp_j, comp_k;
+    AlgoimWrapper(grid_type& ls_grid, const int pid) : gd(ls_grid), patch_id(pid) {}
+    //! Call operator for the wrapper.
+    double operator() (const blitz::TinyVector<int, dim> idx) const
+    {
+        long int local_key[dim];
+        auto ghost_offset = gd.getLocalGridsInfo().get(patch_id).Dbox.getKP1();
+        for (int d = 0; d < dim; ++d)
+            local_key[d] = idx(d) - algoim_padding;
+        // Generate OpenFPM grid_key object from local grid indices
+        grid_dist_key_dx<dim> grid_key(patch_id, grid_key_dx<dim> (local_key) + ghost_offset);
+        return gd.template get<wrapping_field>(grid_key)[comp_i][comp_j][comp_k];
+    }
+  template<size_t extend_field_temp, int poly_order, typename coord_type, typename dx_type, typename pos_type, typename key_type>
+  void extend(coord_type coord, dx_type dx, pos_type pos, key_type key) {
+    using Poly = typename Algoim::StencilPoly<grid_type::dims, poly_order>::T_Poly;
+    for (int i = 0; i < N1; ++i) {
+      for (int j = 0; j < N2; ++j) {
+	for (int k = 0; k < N3; ++k) {
+	  comp_i = i;
+	  comp_j = j;
+	  comp_k = k;
+	  Poly field_poly = Poly(coord, *this, dx);
+	  // Extension is first done to the temporary field. Otherwise interpolation will be affected.
+	  gd.template get<extend_field_temp>(key)[i][j][k] = field_poly(pos);
+	}
+      }
+    }
+  }
 /**@brief Computes the closest point coordinate for each grid point within nb_gamma from interface.
+ * @tparam phi_field Property id on grid for the level set SDF (input)
+ * @tparam cp_field Property id on grid for storing closest point coordinates (output)
+ * @tparam poly_order Type of stencil interpolation (Taylor poly orders between 2 to 5 and Tri/bicubic through -1 is supported)
  * @tparam grid_type Type of the grid container
- * @tparam grid_key_type Type of the key for the grid container
- * @tparam dim Dimension of the space
- * @tparam poly_order Order of the polynomial for stencil interpolation (orders between 2 to 5 is supported)
- * @tparam phi_field Property id on grid for the level set SDF
- * @tparam cp_field Property id on grid for storing closest point coordinates
  * @param gd The distributed grid containing at least level set SDF field and placeholder for closest point coordinates
  * @param nb_gamma The width of the narrow band within which closest point estimation is to be done
+ * 
-template<typename grid_type, typename grid_key_type, unsigned int poly_order, size_t phi_field, size_t cp_field>
+template<size_t phi_field, size_t cp_field, int poly_order, typename grid_type>
 void estimateClosestPoint(grid_type &gd, const double nb_gamma)
     const unsigned int dim = grid_type::dims;
+    // Update the phi field in ghosts
+    gd.template ghost_get<phi_field>(KEEP_PROPERTIES);
     // Stencil polynomial type
     using Poly = typename Algoim::StencilPoly<dim, poly_order>::T_Poly;
@@ -93,7 +233,7 @@ void estimateClosestPoint(grid_type &gd, const double nb_gamma)
             p_hi.set_d(d, patches.get(i).Dbox.getHigh(d) + patches.get(i).origin[d]);
-        AlgoimWrapper<grid_type, grid_key_type, dim, phi_field> phiwrap(gd, i);
+        AlgoimWrapper<phi_field, grid_type> phiwrap(gd, i);
         // Find all cells containing the interface and construct the high-order polynomials
         std::vector<Algoim::detail::CellPoly<dim,Poly>> cells;
@@ -111,8 +251,10 @@ void estimateClosestPoint(grid_type &gd, const double nb_gamma)
         Algoim::KDTree<double,dim> kdtree(points);
+        // In order to ensure that CP is estimated for all points in the narrowband, we add a buffer to the distance check.
+        double nb_gamma_plus_dx = nb_gamma + gd.spacing(0);
         // Pass everything to the closest point computation engine
-        Algoim::ComputeHighOrderCP<dim,Poly> hocp(nb_gamma < std::numeric_limits<double>::max() ? nb_gamma*nb_gamma : std::numeric_limits<double>::max(), // squared bandradius
+        Algoim::ComputeHighOrderCP<dim,Poly> hocp(nb_gamma_plus_dx < std::numeric_limits<double>::max() ? nb_gamma_plus_dx*nb_gamma_plus_dx : std::numeric_limits<double>::max(), // squared bandradius
                                         0.5*blitz::max(dx), // amount that each polynomial overlaps / size of the bounding ball in Newton's method
                                         Algoim::sqr(std::max(1.0e-14, std::pow(blitz::max(dx), Poly::order))), // tolerance to determine convergence
                                         cells, kdtree, points, pointcells, dx, 0.0);
@@ -121,7 +263,7 @@ void estimateClosestPoint(grid_type &gd, const double nb_gamma)
             auto key = it.get();
-            if(std::abs(gd.template get<phi_field>(key)) <= nb_gamma)
+            if(std::abs(gd.template get<phi_field>(key)) < nb_gamma)
                 auto key_g = gd.getGKey(key);
                 // NOTE: This is not the real grid coordinates, but internal coordinates for algoim
@@ -136,8 +278,13 @@ void estimateClosestPoint(grid_type &gd, const double nb_gamma)
+                    std::cout<<"WARN: Closest point computation fails at : ";
                     for(int d = 0; d < dim; ++d)
+                    {
+                        std::cout<<key_g.get(d)<<" ";
                         gd.template get<cp_field>(key)[d] = -100.0;
+                    }
+                    std::cout<<"\n";
@@ -148,22 +295,23 @@ void estimateClosestPoint(grid_type &gd, const double nb_gamma)
 /**@brief Extends a (scalar) field to within nb_gamma from interface. The grid should have level set SDF and closest point field.
- * @tparam grid_type Type of the grid container
- * @tparam grid_key_type Type of the key for the grid container
- * @tparam dim Dimension of the space
- * @tparam poly_order Order of the polynomial for stencil interpolation
  * @tparam phi_field Property id on grid for the level set SDF
  * @tparam cp_field Property id on grid for storing closest point coordinates
  * @tparam extend_field Property id on grid where the field to be extended resides
  * @tparam extend_field_temp Property id on grid for storing temporary intermediate values
+ * @tparam poly_order Type of stencil interpolation (Taylor poly orders between 2 to 5 and Tri/bicubic through -1 is supported)
+ * @tparam grid_type Type of the grid container
  * @param gd The distributed grid containing atleast level set SDF field and closest point coordinates
- * @param nb_gamma The width of the narrow band within which extension is required
+ * @param nb_gamma The width of the narrow band within which extension is required (half band)
-template<typename grid_type, typename grid_key_type, unsigned int poly_order, size_t phi_field, size_t cp_field, size_t extend_field, size_t extend_field_temp>
+template<size_t phi_field, size_t cp_field, size_t extend_field, size_t extend_field_temp, int poly_order, typename grid_type>
 void extendLSField(grid_type &gd, const double nb_gamma)
     const unsigned int dim = grid_type::dims;
+    // Update the phi and cp fields in ghost
+    gd.template ghost_get<phi_field, cp_field, extend_field>(KEEP_PROPERTIES);
     // Stencil polynomial object
     using Poly = typename Algoim::StencilPoly<dim, poly_order>::T_Poly;
     auto &patches = gd.getLocalGridsInfo();
@@ -181,7 +329,7 @@ void extendLSField(grid_type &gd, const double nb_gamma)
             p_lo.set_d(d, patches.get(i).Dbox.getLow(d) + patches.get(i).origin[d]);
             p_hi.set_d(d, patches.get(i).Dbox.getHigh(d) + patches.get(i).origin[d]);
         auto it = gd.getSubDomainIterator(p_lo, p_hi);
@@ -199,44 +347,45 @@ void extendLSField(grid_type &gd, const double nb_gamma)
                     pos(d) = cp_d - coord(d)*gd.spacing(d);
-                AlgoimWrapper<grid_type, grid_key_type, dim, extend_field> fieldwrap(gd,i);
-                Poly field_poly = Poly(coord, fieldwrap, dx);
-                // Extension is first done to the temporary field. Otherwise interpolation will be affected.
-                gd.template get<extend_field_temp>(key) = field_poly(pos);
+                AlgoimWrapper<extend_field, grid_type> fieldwrap(gd,i);
+		fieldwrap.template extend<extend_field_temp,poly_order>(coord,dx,pos,key);
+                // Poly field_poly = Poly(coord, fieldwrap, dx);
+                // // Extension is first done to the temporary field. Otherwise interpolation will be affected.
+                // gd.template get<extend_field_temp>(key) = field_poly(pos);
     // Copy the results to the actual variable
+    typedef typename boost::mpl::at<typename grid_type::value_type::type,boost::mpl::int_<extend_field>>::type type_to_copy;
     auto it = gd.getDomainIterator();
         auto key = it.get();
         if(std::abs(gd.template get<phi_field>(key)) < nb_gamma)
-            gd.template get<extend_field>(key) = gd.template get<extend_field_temp>(key);
+	  meta_copy<type_to_copy>::meta_copy_(gd.template get<extend_field_temp>(key),gd.template get<extend_field>(key));
 /**@brief Reinitializes the level set Phi field on a grid. The grid should have level set SDF and closest point field.
- * @tparam grid_type Type of the grid container
- * @tparam grid_key_type Type of the key for the grid container
- * @tparam dim Dimension of the space
- * @tparam poly_order Order of the polynomial for stencil interpolation
  * @tparam phi_field Property id on grid for the level set SDF
  * @tparam cp_field Property id on grid for storing closest point coordinates
- *
+ * @tparam grid_type Type of the grid container
  * @param gd The distributed grid containing atleast level set SDF field and closest point coordinates
  * @param nb_gamma The width of the narrow band for reinitialization
-template<typename grid_type, typename grid_key_type, unsigned int poly_order, size_t phi_field, size_t cp_field>
+template<size_t phi_field, size_t cp_field, typename grid_type>
 void reinitializeLS(grid_type &gd, const double nb_gamma)
     const unsigned int dim = grid_type::dims;
-    // Stencil polynomial object
-    using Poly = typename Algoim::StencilPoly<dim, poly_order>::T_Poly;
+    // Update the cp_field in ghost
+    gd.template ghost_get<cp_field>(KEEP_PROPERTIES);
     auto &patches = gd.getLocalGridsInfo();
     blitz::TinyVector<double,dim> dx;
     for(int d = 0; d < dim; ++d)
@@ -271,6 +420,9 @@ void reinitializeLS(grid_type &gd, const double nb_gamma)
                     // NOTE: This is not the real grid coordinates, but internal coordinates used for algoim
                     double patch_pos = (key_g.get(d) - p_lo.get(d) + algoim_padding) * gd.spacing(d);
                     double cp_d = gd.template get<cp_field>(key)[d];
+                    if(cp_d == -100.0)
+                        std::cout<<"WARNING: Requesting closest point on nodes where it was not computed."<<std::endl;
                     distance += ((patch_pos - cp_d)*(patch_pos - cp_d));
                 distance = sqrt(distance);
diff --git a/src/level_set/closest_point/closest_point_unit_tests.cpp b/src/level_set/closest_point/closest_point_unit_tests.cpp
index 30bcd7f6..9e2f10b2 100644
--- a/src/level_set/closest_point/closest_point_unit_tests.cpp
+++ b/src/level_set/closest_point/closest_point_unit_tests.cpp
@@ -5,6 +5,7 @@
 #include <boost/test/unit_test_log.hpp>
+#include <cmath>
 #include <boost/test/unit_test.hpp>
 #include <iostream>
@@ -28,21 +29,17 @@ typedef struct EllipseParameters{
 } EllipseParams;
 // Generate an ellipsoid initial levelset signed distance function
-template<typename grid_type, typename domain_type, size_t phi_field>
-void initializeLSEllipsoid(grid_type &gd, const domain_type &domain,  const EllipseParams &params)
+template<size_t phi_field, typename grid_type>
+void initializeLSEllipsoid(grid_type &gd, const EllipseParams &params)
     auto it = gd.getDomainIterator();
-    double dx = gd.getSpacing()[0];
-    double dy = gd.getSpacing()[1];
-    double dz = gd.getSpacing()[2];
         auto key = it.get();
-        auto key_g = gd.getGKey(key);
-        double posx = key_g.get(0)*dx + domain.getLow(0);
-        double posy = key_g.get(1)*dy + domain.getLow(1);
-        double posz = key_g.get(2)*dz + domain.getLow(2);
+        Point<grid_type::dims, double> coords = gd.getPos(key);
+        double posx = coords.get(0);
+        double posy = coords.get(1);
+        double posz = coords.get(2);
         // NOTE: Except for a sphere, this is not the SDF. It is just an implicit function whose zero contour is an ellipsoid.
         double phi_val = 1.0 - sqrt(((posx - params.origin[0])/params.radiusA)*((posx - params.origin[0])/params.radiusA) + ((posy - params.origin[1])/params.radiusB)*((posy - params.origin[1])/params.radiusB) + ((posz - params.origin[2])/params.radiusC)*((posz - params.origin[2])/params.radiusC));
@@ -51,6 +48,36 @@ void initializeLSEllipsoid(grid_type &gd, const domain_type &domain,  const Elli
+// Initialize a scalar field or grid points near the interface
+template<const unsigned int phi, const unsigned int field, typename grid_type>
+void initializeScalarField3D(grid_type &gd, double init_width)
+    auto it = gd.getDomainIterator();
+    // Trying with a L_1 and L_2 spherical harmonics as initial condition for scalar_field
+    double prefactor_l1 = std::sqrt(2.0/(4.0*M_PI));
+    //double prefactor_l2 = std::sqrt(5.0/(16.0*M_PI));
+    while(it.isNext())
+    {
+        auto key = it.get();
+        if(gd.template get<phi>(key) < init_width)
+        {
+            auto coords = gd.getPos(key);
+            double posx = coords.get(0);
+            double posy = coords.get(1);
+            double posz = coords.get(2);
+            double theta = std::atan2(std::sqrt(posx*posx + posy*posy), posz);
+            gd.template get<field>(key) = prefactor_l1 * std::cos(theta);
+            //gd.template get<field>(key) = prefactor_l2 * (3.0 * std::cos(theta) * std::cos(theta) - 1.0);
+        }
+        ++it;
+    }
 BOOST_AUTO_TEST_SUITE( closest_point_test )
@@ -100,12 +127,10 @@ BOOST_AUTO_TEST_CASE( closest_point_unit_sphere )
     nb_gamma = narrow_band_half_width * gdist.spacing(0);
     // Initializes the grid property 'phi' whose zero contour represents the ellipsoid
-    initializeLSEllipsoid<GridDist, Box<SIM_DIM,double>, phi>(gdist, domain, params);
-    gdist.template ghost_get<phi>();
+    initializeLSEllipsoid<phi>(gdist, params);
     // Updates the property 'cp' of the grid to the closest point coords (only done in the narrowband).
-    estimateClosestPoint<GridDist, GridKey, POLY_ORDER, phi, cp>(gdist, nb_gamma);
-    gdist.template ghost_get<cp>();
+    estimateClosestPoint<phi, cp, POLY_ORDER>(gdist, nb_gamma);
     // Estimate error in closest point estimation
     auto &patches = gdist.getLocalGridsInfo();
@@ -127,7 +152,6 @@ BOOST_AUTO_TEST_CASE( closest_point_unit_sphere )
             if(std::abs(gdist.template get<phi>(key)) < nb_gamma)
-                auto key_g = gdist.getGKey(key);
                 // Computed closest point coordinates.
                 // Note: This is patch coordinates not the real one.
                 double cpx = gdist.template get<cp>(key)[x];
@@ -143,9 +167,10 @@ BOOST_AUTO_TEST_CASE( closest_point_unit_sphere )
                 double estim_pz = domain.getLow(z) + (p_zlo - algoim_padding)*gdist.spacing(z) + cpz;
                 // Global coordinate of the selected grid point.
-                double posx = key_g.get(0)*gdist.spacing(0) + domain.getLow(0);
-                double posy = key_g.get(1)*gdist.spacing(1) + domain.getLow(1);
-                double posz = key_g.get(2)*gdist.spacing(2) + domain.getLow(2);
+                Point<GridDist::dims, double> coords = gdist.getPos(key);
+                double posx = coords.get(0);
+                double posy = coords.get(1);
+                double posz = coords.get(2);
                 double norm = sqrt(posx*posx + posy*posy + posz*posz);
                 // Analytically known closest point coordinate for unit sphere.
@@ -213,15 +238,12 @@ BOOST_AUTO_TEST_CASE( reinitialization_unit_sphere )
     nb_gamma = narrow_band_half_width * gdist.spacing(0);
-    initializeLSEllipsoid<GridDist, Box<SIM_DIM,double>, phi>(gdist, domain, params);
-    gdist.template ghost_get<phi>();
+    initializeLSEllipsoid<phi>(gdist, params);
-    estimateClosestPoint<GridDist, GridKey, POLY_ORDER, phi, cp>(gdist, nb_gamma);
-    gdist.template ghost_get<cp>();
+    estimateClosestPoint<phi, cp, POLY_ORDER>(gdist, nb_gamma);
     // Reinitialize the level set function stored in property 'phi' based on closest points in 'cp'
-    reinitializeLS<GridDist, GridKey, POLY_ORDER, phi, cp>(gdist, nb_gamma);
-    gdist.template ghost_get<phi>();
+    reinitializeLS<phi, cp>(gdist, nb_gamma);
     // Estimate error in closest point estimation
     auto &patches = gdist.getLocalGridsInfo();
@@ -242,11 +264,11 @@ BOOST_AUTO_TEST_CASE( reinitialization_unit_sphere )
             if(std::abs(gdist.template get<phi>(key)) < nb_gamma)
-                auto key_g = gdist.getGKey(key);
                 // Global grid coordinate
-                double posx = key_g.get(0)*gdist.spacing(0) + domain.getLow(0);
-                double posy = key_g.get(1)*gdist.spacing(1) + domain.getLow(1);
-                double posz = key_g.get(2)*gdist.spacing(2) + domain.getLow(2);
+                Point<GridDist::dims, double> coords = gdist.getPos(key);
+                double posx = coords.get(0);
+                double posy = coords.get(1);
+                double posz = coords.get(2);
                 // Analytically computed signed distance
                 // NOTE: SDF convention here is positive inside and negative outside the sphere
@@ -269,4 +291,111 @@ BOOST_AUTO_TEST_CASE( reinitialization_unit_sphere )
+BOOST_AUTO_TEST_CASE( extension_unit_sphere )
+    constexpr int SIM_DIM = 3;
+    constexpr int POLY_ORDER = 5;
+    constexpr int SIM_GRID_SIZE = 128;
+    // Fields - phi, cp, scalar_field, scalar_field_temp
+    using GridDist = grid_dist_id<SIM_DIM,double,aggregate<double,double[SIM_DIM],double,double>>;
+    using GridKey = grid_dist_key_dx<SIM_DIM>;
+    // Grid size on each dimension
+    const long int sz[SIM_DIM] = {SIM_GRID_SIZE, SIM_GRID_SIZE, SIM_GRID_SIZE};
+    const size_t szu[SIM_DIM] = {(size_t) sz[0], (size_t) sz[1], (size_t) sz[2]};
+    // 3D physical domain
+    Box<SIM_DIM,double> domain({-1.5,-1.5,-1.5},{1.5,1.5,1.5});
+    constexpr int x = 0;
+    constexpr int y = 1;
+    constexpr int z = 2;
+    // Alias for properties on the grid
+    constexpr int phi = 0;
+    constexpr int cp = 1;
+    constexpr int scalar_field = 2;
+    constexpr int scalar_field_temp = 3;
+    double nb_gamma = 0.0;
+    periodicity<SIM_DIM> grid_bc = {NON_PERIODIC, NON_PERIODIC, NON_PERIODIC};
+    // Ghost in grid units
+    Ghost <SIM_DIM, long int> grid_ghost(2*narrow_band_half_width);
+    GridDist gdist(szu, domain, grid_ghost, grid_bc);
+    EllipseParams params;
+    params.origin[x] = 0.0;
+    params.origin[y] = 0.0;
+    params.origin[z] = 0.0;
+    params.radiusA = 1.0;
+    params.radiusB = 1.0;
+    params.radiusC = 1.0;
+    nb_gamma = narrow_band_half_width * gdist.spacing(0);
+    initializeLSEllipsoid<phi>(gdist, params);
+    estimateClosestPoint<phi, cp, POLY_ORDER>(gdist, nb_gamma);
+    // Reinitialize the level set function stored in property 'phi' based on closest points in 'cp'
+    reinitializeLS<phi, cp>(gdist, nb_gamma);
+    // Initialize a scalar field close to interface
+    initializeScalarField3D<phi,scalar_field>(gdist, 4*gdist.spacing(0));
+    // Extension to the full narrow band
+    extendLSField<phi, cp, scalar_field, scalar_field_temp, -1>(gdist, nb_gamma);
+    double prefactor_l1 = std::sqrt(2.0/(4.0*M_PI));
+    // Estimate error in closest point estimation
+    auto &patches = gdist.getLocalGridsInfo();
+    double max_error = -1.0;
+    for(int i = 0; i < patches.size();i++)
+    {
+        auto p_xlo = patches.get(i).Dbox.getLow(0) + patches.get(i).origin[0];
+        auto p_xhi = patches.get(i).Dbox.getHigh(0) + patches.get(i).origin[0];
+        auto p_ylo = patches.get(i).Dbox.getLow(1) + patches.get(i).origin[1];
+        auto p_yhi = patches.get(i).Dbox.getHigh(1) + patches.get(i).origin[1];
+        auto p_zlo = patches.get(i).Dbox.getLow(2) + patches.get(i).origin[2];
+        auto p_zhi = patches.get(i).Dbox.getHigh(2) + patches.get(i).origin[2];
+        auto it = gdist.getSubDomainIterator({p_xlo, p_ylo, p_zlo}, {p_xhi, p_yhi, p_zhi});
+        while(it.isNext())
+        {
+            auto key = it.get();
+            if(std::abs(gdist.template get<phi>(key)) < nb_gamma)
+            {
+                // Global grid coordinate
+                auto coords = gdist.getPos(key);
+                double posx = coords.get(0);
+                double posy = coords.get(1);
+                double posz = coords.get(2);
+                double theta = std::atan2(std::sqrt(posx*posx + posy*posy), posz);
+                // Analytically computed signed distance
+                // NOTE: SDF convention here is positive inside and negative outside the sphere
+                double exact_val = prefactor_l1 * std::cos(theta);
+                max_error = std::max({std::abs(exact_val - gdist.template get<scalar_field>(key)), max_error});
+            }
+            ++it;
+        }
+    }
+    std::cout<<"Extension error : "<<max_error<<std::endl;
+    double tolerance = 1e-5;
+    bool check;
+    if (std::abs(max_error) < tolerance)
+        check = true;
+    else
+        check = false;
+    BOOST_TEST( check );
\ No newline at end of file
diff --git a/src/util/SphericalHarmonics.hpp b/src/util/SphericalHarmonics.hpp
index b900dbb3..eed0ae43 100644
--- a/src/util/SphericalHarmonics.hpp
+++ b/src/util/SphericalHarmonics.hpp
@@ -2,7 +2,8 @@
 // Created by Abhinav Singh on 03.11.20.
 //#include "util/util_debug.hpp"
 #include <boost/math/special_functions/spherical_harmonic.hpp>
@@ -204,19 +205,19 @@ namespace openfpm {
         return openfpm::math::DYdPhi(n, m, theta, phi, boost::math::policies::policy<>());
-    double sph_A1(int l,int  m,double v1, double vr) {
+    inline double sph_A1(int l,int  m,double v1, double vr) {
         return 0.5 * (1 + l) * (l * v1 - vr);
-    double sph_A2(int l,int  m,double v1, double vr) {
+    inline double sph_A2(int l,int  m,double v1, double vr) {
         return 0.5 * ((1 + l) * (-l) * v1 + (l + 3) * vr);
-    double sph_B(int l, int m,double v2) {
+    inline double sph_B(int l, int m,double v2) {
         return v2;
-    double sph_A3(int l,int m,double v1, double vr) {
+    inline double sph_A3(int l,int m,double v1, double vr) {
         if (m == 1){
             return 0.5 *l* ((1 + l)*v1 - vr)-1.5*sph_A2(l,m,v1,vr);
@@ -225,7 +226,7 @@ namespace openfpm {
-    double sph_A4(int l,int m,double v1, double vr) {
+    inline double sph_A4(int l,int m,double v1, double vr) {
         if (m == 1){
             return 0.5* (-l*(1 + l)*v1 + (2-l)*vr)+0.5*sph_A2(l,m,v1,vr);
@@ -246,7 +247,7 @@ namespace openfpm {
      *  \return std::vector containing the spherical harmonic amplitudes (ur,u1,u2,p) for the solution at r for mode l,m.
-    std::vector<double> sph_anasol_u(double nu,int l,int m,double vr,double v1,double v2,double r) {
+    inline std::vector<double> sph_anasol_u(double nu,int l,int m,double vr,double v1,double v2,double r) {
          double ur,u1,u2,p;
@@ -362,4 +363,6 @@ namespace openfpm {
\ No newline at end of file