diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 06d55055afd565161b6677a0651a00d9df266da2..01440817d846274da44339cdbd3a5116cc77843b 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -18,6 +18,7 @@ endif()
 if (NOT CUDA_ON_BACKEND STREQUAL "None")
 	set(CUDA_SOURCES Operators/Vector/vector_dist_operators_unit_tests.cu
 					 DCPSE/DCPSE_op/tests/DCPSE_op_Solver_test.cu
+					 DCPSE/DCPSE_op/tests/DCPSE_op_subset_test.cu
 					 Operators/Vector/vector_dist_operators_apply_kernel_unit_tests.cu)
 endif()
 
@@ -286,7 +287,6 @@ install(FILES DCPSE/Dcpse.hpp
 		DCPSE/DCPSE_op/DCPSE_Solver.cuh
 		DCPSE/DcpseDiagonalScalingMatrix.hpp
 		DCPSE/DcpseRhs.hpp
-		DCPSE/DcpseRhs.cuh
 		DCPSE/Monomial.hpp
 		DCPSE/Monomial.cuh
 		DCPSE/MonomialBasis.hpp
@@ -294,7 +294,6 @@ install(FILES DCPSE/Dcpse.hpp
 		DCPSE/SupportBuilder.cuh
 		DCPSE/SupportBuilder.hpp
 		DCPSE/Vandermonde.hpp
-		DCPSE/Vandermonde.cuh
 		DCPSE/VandermondeRowBuilder.hpp
 		DCPSE/DcpseInterpolation.hpp
 		DESTINATION openfpm_numerics/include/DCPSE
diff --git a/src/DCPSE/DCPSE_op/EqnsStruct.hpp b/src/DCPSE/DCPSE_op/EqnsStruct.hpp
index 8540537b536c3eb6dc1dc309407781054af9dc77..aa55df23158bd74dc2a98ceefefcd3c19b99c6bf 100644
--- a/src/DCPSE/DCPSE_op/EqnsStruct.hpp
+++ b/src/DCPSE/DCPSE_op/EqnsStruct.hpp
@@ -36,18 +36,524 @@ struct equations2d1 {
     typedef petsc_solver<double> solver_type;
 };
 
-//! Specify the general characteristic of system to solve
+struct equations2d2 {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 2;
+    //! number of fields in the system
+    static const unsigned int nvar = 2;
+
+    //! boundary at X and Y
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC};
+
+    //! type of space float, double, ...
+    typedef double stype;
+
+    //! type of base particles
+    typedef vector_dist<dims, double, aggregate<double>> b_part;
+
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+
+    //! type of Vector for the linear solver
+    typedef Vector<double, PETSC_BASE> Vector_type;
+
+    typedef petsc_solver<double> solver_type;
+};
+
+struct equations2d1p {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 2;
+    //! number of fields in the system
+    static const unsigned int nvar = 1;
+
+    //! boundary at X and Y
+    static constexpr bool boundary[]={PERIODIC, PERIODIC};
+
+    //! type of space float, double, ...
+    typedef double stype;
+
+    //! type of base particles
+    typedef vector_dist<dims, double, aggregate<double>> b_part;
+
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+
+    //! type of Vector for the linear solver
+    typedef Vector<double, PETSC_BASE> Vector_type;
+
+    typedef petsc_solver<double> solver_type;
+};
+
+struct equations2d2p {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 2;
+    //! number of fields in the system
+    static const unsigned int nvar = 2;
+
+    //! boundary at X and Y
+    static constexpr bool boundary[]={PERIODIC, PERIODIC};
+
+    //! type of space float, double, ...
+    typedef double stype;
+
+    //! type of base particles
+    typedef vector_dist<dims, double, aggregate<double>> b_part;
+
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+
+    //! type of Vector for the linear solver
+    typedef Vector<double, PETSC_BASE> Vector_type;
+
+    typedef petsc_solver<double> solver_type;
+};
+
+
+struct equations2d3p {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 2;
+    //! number of fields in the system
+    static const unsigned int nvar = 3;
+
+    //! boundary at X and Y
+    static constexpr bool boundary[]={PERIODIC, PERIODIC};
+
+    //! type of space float, double, ...
+    typedef double stype;
+
+    //! type of base particles
+    typedef vector_dist<dims, double, aggregate<double>> b_part;
+
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+
+    //! type of Vector for the linear solver
+    typedef Vector<double, PETSC_BASE> Vector_type;
+
+    typedef petsc_solver<double> solver_type;
+};
+
+struct equations2d3 {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 2;
+    //! number of fields in the system
+    static const unsigned int nvar = 3;
+
+    //! boundary at X and Y
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC};
+
+    //! type of space float, double, ...
+    typedef double stype;
+
+    //! type of base particles
+    typedef vector_dist<dims, double, aggregate<double>> b_part;
+
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+
+    //! type of Vector for the linear solver
+    typedef Vector<double, PETSC_BASE> Vector_type;
+
+    typedef petsc_solver<double> solver_type;
+};
+
+struct equations2d4 {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 2;
+    //! number of fields in the system
+    static const unsigned int nvar = 4;
+
+    //! boundary at X and Y
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC};
+
+    //! type of space float, double, ...
+    typedef double stype;
+
+    //! type of base particles
+    typedef vector_dist<dims, double, aggregate<double>> b_part;
+
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+
+    //! type of Vector for the linear solver
+    typedef Vector<double, PETSC_BASE> Vector_type;
+
+    typedef petsc_solver<double> solver_type;
+};
+
+struct equations3d3 {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 3;
+    //! number of fields in the system
+    static const unsigned int nvar = 3;
+
+    //! boundary at X and Y
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC,NON_PERIODIC};
+
+    //! type of space float, double, ..
+    typedef double stype;
+
+    //! type of base particles
+    typedef vector_dist<dims, double, aggregate<double>> b_part;
+
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+
+    //! type of Vector for the linear solver
+    typedef Vector<double, PETSC_BASE> Vector_type;
+
+    typedef petsc_solver<double> solver_type;
+};
+
+struct equations3d1 {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 3;
+    //! number of fields in the system
+    static const unsigned int nvar = 1;
+
+    //! boundary at X and Y
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC,NON_PERIODIC};
+
+    //! type of space float, double, ...
+    typedef double stype;
+
+    //! type of base particles
+    typedef vector_dist<dims, double, aggregate<double>> b_part;
+
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+
+    //! type of Vector for the linear solver
+    typedef Vector<double, PETSC_BASE> Vector_type;
+
+    typedef petsc_solver<double> solver_type;
+};
+
+struct equations3d3Pz {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 3;
+    //! number of fields in the system
+    static const unsigned int nvar = 3;
+
+    //! boundary at X and Y
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC,PERIODIC};
+
+    //! type of space float, double, ..
+    typedef double stype;
+
+    //! type of base particles
+    typedef vector_dist<dims, double, aggregate<double>> b_part;
+
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+
+    //! type of Vector for the linear solver
+    typedef Vector<double, PETSC_BASE> Vector_type;
+
+    typedef petsc_solver<double> solver_type;
+};
+
+struct equations3d3Pyz {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 3;
+    //! number of fields in the system
+    static const unsigned int nvar = 3;
+
+    //! boundary at X and Y
+    static constexpr bool boundary[]={NON_PERIODIC, PERIODIC,PERIODIC};
+
+    //! type of space float, double, ..
+    typedef double stype;
+
+    //! type of base particles
+    typedef vector_dist<dims, double, aggregate<double>> b_part;
+
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+
+    //! type of Vector for the linear solver
+    typedef Vector<double, PETSC_BASE> Vector_type;
+
+    typedef petsc_solver<double> solver_type;
+};
+
+struct equations3d3Pxz {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 3;
+    //! number of fields in the system
+    static const unsigned int nvar = 3;
+
+    //! boundary at X and Y
+    static constexpr bool boundary[]={PERIODIC, NON_PERIODIC,PERIODIC};
+
+    //! type of space float, double, ..
+    typedef double stype;
+
+    //! type of base particles
+    typedef vector_dist<dims, double, aggregate<double>> b_part;
+
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+
+    //! type of Vector for the linear solver
+    typedef Vector<double, PETSC_BASE> Vector_type;
+
+    typedef petsc_solver<double> solver_type;
+};
+
+struct equations3d1Pz {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 3;
+    //! number of fields in the system
+    static const unsigned int nvar = 1;
+
+    //! boundary at X and Y
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC,PERIODIC};
+
+    //! type of space float, double, ...
+    typedef double stype;
+
+    //! type of base particles
+    typedef vector_dist<dims, double, aggregate<double>> b_part;
+
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+
+    //! type of Vector for the linear solver
+    typedef Vector<double, PETSC_BASE> Vector_type;
+
+    typedef petsc_solver<double> solver_type;
+};
+
+
+#ifdef __NVCC__
 struct equations2d1_gpu {
 
     //! dimensionaly of the equation ( 3D problem ...)
     static const unsigned int dims=2;
     //! number of fields in the system
-    static const unsigned int nvar=1;
+    static const unsigned int nvar=1;
+
+    //! boundary at X and Y
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC};
+
+    //! type of space float, double, ...
+    typedef double stype;
+
+    //! type of base particles
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+
+    //! type of Vector for the linear solver
+    typedef Vector<double, PETSC_BASE> Vector_type;
+
+    typedef petsc_solver<double> solver_type;
+};
+
+struct equations2d2_gpu {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 2;
+    //! number of fields in the system
+    static const unsigned int nvar = 2;
+
+    //! boundary at X and Y
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC};
+
+    //! type of space float, double, ...
+    typedef double stype;
+
+    //! type of base particles
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+
+    //! type of Vector for the linear solver
+    typedef Vector<double, PETSC_BASE> Vector_type;
+
+    typedef petsc_solver<double> solver_type;
+};
+
+struct equations2d1p_gpu {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 2;
+    //! number of fields in the system
+    static const unsigned int nvar = 1;
+
+    //! boundary at X and Y
+    static constexpr bool boundary[]={PERIODIC, PERIODIC};
+
+    //! type of space float, double, ...
+    typedef double stype;
+
+    //! type of base particles
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+
+    //! type of Vector for the linear solver
+    typedef Vector<double, PETSC_BASE> Vector_type;
+
+    typedef petsc_solver<double> solver_type;
+};
+
+struct equations2d2p_gpu {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 2;
+    //! number of fields in the system
+    static const unsigned int nvar = 2;
+
+    //! boundary at X and Y
+    static constexpr bool boundary[]={PERIODIC, PERIODIC};
+
+    //! type of space float, double, ...
+    typedef double stype;
+
+    //! type of base particles
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+
+    //! type of Vector for the linear solver
+    typedef Vector<double, PETSC_BASE> Vector_type;
+
+    typedef petsc_solver<double> solver_type;
+};
+
+struct equations2d3p_gpu {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 2;
+    //! number of fields in the system
+    static const unsigned int nvar = 3;
+
+    //! boundary at X and Y
+    static constexpr bool boundary[]={PERIODIC, PERIODIC};
+
+    //! type of space float, double, ...
+    typedef double stype;
+
+    //! type of base particles
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+
+    //! type of Vector for the linear solver
+    typedef Vector<double, PETSC_BASE> Vector_type;
+
+    typedef petsc_solver<double> solver_type;
+};
+
+struct equations2d3_gpu {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 2;
+    //! number of fields in the system
+    static const unsigned int nvar = 3;
+
+    //! boundary at X and Y
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC};
+
+    //! type of space float, double, ...
+    typedef double stype;
+
+    //! type of base particles
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+
+    //! type of Vector for the linear solver
+    typedef Vector<double, PETSC_BASE> Vector_type;
+
+    typedef petsc_solver<double> solver_type;
+};
+
+struct equations2d4_gpu {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 2;
+    //! number of fields in the system
+    static const unsigned int nvar = 4;
+
+    //! boundary at X and Y
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC};
+
+    //! type of space float, double, ...
+    typedef double stype;
+
+    //! type of base particles
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+
+    //! type of Vector for the linear solver
+    typedef Vector<double, PETSC_BASE> Vector_type;
+
+    typedef petsc_solver<double> solver_type;
+};
+
+struct equations3d3_gpu {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 3;
+    //! number of fields in the system
+    static const unsigned int nvar = 3;
+
+    //! boundary at X and Y
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC,NON_PERIODIC};
+
+    //! type of space float, double, ..
+    typedef double stype;
+
+    //! type of base particles
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+
+    //! type of Vector for the linear solver
+    typedef Vector<double, PETSC_BASE> Vector_type;
+
+    typedef petsc_solver<double> solver_type;
+};
+
+struct equations3d1_gpu {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 3;
+    //! number of fields in the system
+    static const unsigned int nvar = 1;
+
+    //! boundary at X and Y
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC,NON_PERIODIC};
+
+    //! type of space float, double, ...
+    typedef double stype;
+
+    //! type of base particles
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+
+    //! type of Vector for the linear solver
+    typedef Vector<double, PETSC_BASE> Vector_type;
+
+    typedef petsc_solver<double> solver_type;
+};
+
+struct equations3d3Pz_gpu {
+    //! dimensionaly of the equation ( 3D problem ...)
+    static const unsigned int dims = 3;
+    //! number of fields in the system
+    static const unsigned int nvar = 3;
 
     //! boundary at X and Y
-    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC};
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC,PERIODIC};
 
-    //! type of space float, double, ...
+    //! type of space float, double, ..
     typedef double stype;
 
     //! type of base particles
@@ -62,20 +568,20 @@ struct equations2d1_gpu {
     typedef petsc_solver<double> solver_type;
 };
 
-struct equations2d2 {
+struct equations3d3Pyz_gpu {
     //! dimensionaly of the equation ( 3D problem ...)
-    static const unsigned int dims = 2;
+    static const unsigned int dims = 3;
     //! number of fields in the system
-    static const unsigned int nvar = 2;
+    static const unsigned int nvar = 3;
 
     //! boundary at X and Y
-    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC};
+    static constexpr bool boundary[]={NON_PERIODIC, PERIODIC,PERIODIC};
 
-    //! type of space float, double, ...
+    //! type of space float, double, ..
     typedef double stype;
 
     //! type of base particles
-    typedef vector_dist<dims, double, aggregate<double>> b_part;
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
 
     //! type of SparseMatrix for the linear solver
     typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
@@ -86,20 +592,20 @@ struct equations2d2 {
     typedef petsc_solver<double> solver_type;
 };
 
-struct equations2d2_gpu {
+struct equations3d3Pxz_gpu {
     //! dimensionaly of the equation ( 3D problem ...)
-    static const unsigned int dims = 2;
+    static const unsigned int dims = 3;
     //! number of fields in the system
-    static const unsigned int nvar = 2;
+    static const unsigned int nvar = 3;
 
     //! boundary at X and Y
-    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC};
+    static constexpr bool boundary[]={PERIODIC, NON_PERIODIC,PERIODIC};
 
-    //! type of space float, double, ...
+    //! type of space float, double, ..
     typedef double stype;
 
     //! type of base particles
-    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
 
     //! type of SparseMatrix for the linear solver
     typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
@@ -110,14 +616,14 @@ struct equations2d2_gpu {
     typedef petsc_solver<double> solver_type;
 };
 
-struct equations2d1p {
+struct equations3d1Pz_gpu {
     //! dimensionaly of the equation ( 3D problem ...)
-    static const unsigned int dims = 2;
+    static const unsigned int dims = 3;
     //! number of fields in the system
     static const unsigned int nvar = 1;
 
     //! boundary at X and Y
-    static constexpr bool boundary[]={PERIODIC, PERIODIC};
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC,PERIODIC};
 
     //! type of space float, double, ...
     typedef double stype;
@@ -133,39 +639,45 @@ struct equations2d1p {
 
     typedef petsc_solver<double> solver_type;
 };
+#endif //__NVCC__
+
+#endif //HAVE_PETSC
+
+
+//! Specify the general characteristic of system to solve
+struct equations2d1E {
 
-struct equations2d1p_gpu {
     //! dimensionaly of the equation ( 3D problem ...)
-    static const unsigned int dims = 2;
+    static const unsigned int dims=2;
     //! number of fields in the system
-    static const unsigned int nvar = 1;
+    static const unsigned int nvar=1;
 
     //! boundary at X and Y
-    static constexpr bool boundary[]={PERIODIC, PERIODIC};
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC};
 
     //! type of space float, double, ...
     typedef double stype;
 
     //! type of base particles
-    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+    typedef vector_dist<dims, double, aggregate<double>> b_part;
 
     //! type of SparseMatrix for the linear solver
-    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
 
     //! type of Vector for the linear solver
-    typedef Vector<double, PETSC_BASE> Vector_type;
+    typedef Vector<double> Vector_type;
 
-    typedef petsc_solver<double> solver_type;
+    typedef umfpack_solver<double> solver_type;
 };
 
-struct equations2d2p {
+struct equations2d2E {
     //! dimensionaly of the equation ( 3D problem ...)
     static const unsigned int dims = 2;
     //! number of fields in the system
     static const unsigned int nvar = 2;
 
     //! boundary at X and Y
-    static constexpr bool boundary[]={PERIODIC, PERIODIC};
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC};
 
     //! type of space float, double, ...
     typedef double stype;
@@ -174,23 +686,22 @@ struct equations2d2p {
     typedef vector_dist<dims, double, aggregate<double>> b_part;
 
     //! type of SparseMatrix for the linear solver
-    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
 
     //! type of Vector for the linear solver
-    typedef Vector<double, PETSC_BASE> Vector_type;
+    typedef Vector<double> Vector_type;
 
-    typedef petsc_solver<double> solver_type;
+    typedef umfpack_solver<double> solver_type;
 };
 
-
-struct equations2d3p {
+struct equations2d3E {
     //! dimensionaly of the equation ( 3D problem ...)
     static const unsigned int dims = 2;
     //! number of fields in the system
     static const unsigned int nvar = 3;
 
     //! boundary at X and Y
-    static constexpr bool boundary[]={PERIODIC, PERIODIC};
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC};
 
     //! type of space float, double, ...
     typedef double stype;
@@ -199,19 +710,19 @@ struct equations2d3p {
     typedef vector_dist<dims, double, aggregate<double>> b_part;
 
     //! type of SparseMatrix for the linear solver
-    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
 
     //! type of Vector for the linear solver
-    typedef Vector<double, PETSC_BASE> Vector_type;
+    typedef Vector<double> Vector_type;
 
-    typedef petsc_solver<double> solver_type;
+    typedef umfpack_solver<double> solver_type;
 };
 
-struct equations2d3 {
+struct equations2d4E {
     //! dimensionaly of the equation ( 3D problem ...)
     static const unsigned int dims = 2;
     //! number of fields in the system
-    static const unsigned int nvar = 3;
+    static const unsigned int nvar = 4;
 
     //! boundary at X and Y
     static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC};
@@ -223,22 +734,23 @@ struct equations2d3 {
     typedef vector_dist<dims, double, aggregate<double>> b_part;
 
     //! type of SparseMatrix for the linear solver
-    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
 
     //! type of Vector for the linear solver
-    typedef Vector<double, PETSC_BASE> Vector_type;
+    typedef Vector<double> Vector_type;
 
-    typedef petsc_solver<double> solver_type;
+    typedef umfpack_solver<double> solver_type;
 };
 
-struct equations2d4 {
+
+struct equations2d1pE {
     //! dimensionaly of the equation ( 3D problem ...)
     static const unsigned int dims = 2;
     //! number of fields in the system
-    static const unsigned int nvar = 4;
+    static const unsigned int nvar = 1;
 
     //! boundary at X and Y
-    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC};
+    static constexpr bool boundary[]={PERIODIC, PERIODIC};
 
     //! type of space float, double, ...
     typedef double stype;
@@ -247,47 +759,46 @@ struct equations2d4 {
     typedef vector_dist<dims, double, aggregate<double>> b_part;
 
     //! type of SparseMatrix for the linear solver
-    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
 
     //! type of Vector for the linear solver
-    typedef Vector<double, PETSC_BASE> Vector_type;
+    typedef Vector<double> Vector_type;
 
-    typedef petsc_solver<double> solver_type;
+    typedef umfpack_solver<double> solver_type;
 };
 
-
-struct equations3d3 {
+struct equations2d2pE {
     //! dimensionaly of the equation ( 3D problem ...)
-    static const unsigned int dims = 3;
+    static const unsigned int dims = 2;
     //! number of fields in the system
-    static const unsigned int nvar = 3;
+    static const unsigned int nvar = 2;
 
     //! boundary at X and Y
-    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC,NON_PERIODIC};
+    static constexpr bool boundary[]={PERIODIC, PERIODIC};
 
-    //! type of space float, double, ..
+    //! type of space float, double, ...
     typedef double stype;
 
     //! type of base particles
     typedef vector_dist<dims, double, aggregate<double>> b_part;
 
     //! type of SparseMatrix for the linear solver
-    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
 
     //! type of Vector for the linear solver
-    typedef Vector<double, PETSC_BASE> Vector_type;
+    typedef Vector<double> Vector_type;
 
-    typedef petsc_solver<double> solver_type;
+    typedef umfpack_solver<double> solver_type;
 };
 
-struct equations3d1 {
+struct equations2d3pE {
     //! dimensionaly of the equation ( 3D problem ...)
-    static const unsigned int dims = 3;
+    static const unsigned int dims = 2;
     //! number of fields in the system
-    static const unsigned int nvar = 1;
+    static const unsigned int nvar = 3;
 
     //! boundary at X and Y
-    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC,NON_PERIODIC};
+    static constexpr bool boundary[]={PERIODIC, PERIODIC};
 
     //! type of space float, double, ...
     typedef double stype;
@@ -296,22 +807,22 @@ struct equations3d1 {
     typedef vector_dist<dims, double, aggregate<double>> b_part;
 
     //! type of SparseMatrix for the linear solver
-    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
 
     //! type of Vector for the linear solver
-    typedef Vector<double, PETSC_BASE> Vector_type;
+    typedef Vector<double> Vector_type;
 
-    typedef petsc_solver<double> solver_type;
+    typedef umfpack_solver<double> solver_type;
 };
 
-struct equations3d3Pz {
+struct equations3d3E {
     //! dimensionaly of the equation ( 3D problem ...)
     static const unsigned int dims = 3;
     //! number of fields in the system
     static const unsigned int nvar = 3;
 
     //! boundary at X and Y
-    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC,PERIODIC};
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC,NON_PERIODIC};
 
     //! type of space float, double, ..
     typedef double stype;
@@ -320,38 +831,39 @@ struct equations3d3Pz {
     typedef vector_dist<dims, double, aggregate<double>> b_part;
 
     //! type of SparseMatrix for the linear solver
-    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
 
     //! type of Vector for the linear solver
-    typedef Vector<double, PETSC_BASE> Vector_type;
+    typedef Vector<double> Vector_type;
 
-    typedef petsc_solver<double> solver_type;
+    typedef umfpack_solver<double> solver_type;
 };
 
-struct equations3d3Pyz {
+struct equations3d1E {
     //! dimensionaly of the equation ( 3D problem ...)
     static const unsigned int dims = 3;
     //! number of fields in the system
-    static const unsigned int nvar = 3;
+    static const unsigned int nvar = 1;
 
     //! boundary at X and Y
-    static constexpr bool boundary[]={NON_PERIODIC, PERIODIC,PERIODIC};
+    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC,NON_PERIODIC};
 
-    //! type of space float, double, ..
+    //! type of space float, double, ...
     typedef double stype;
 
     //! type of base particles
     typedef vector_dist<dims, double, aggregate<double>> b_part;
 
     //! type of SparseMatrix for the linear solver
-    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
 
     //! type of Vector for the linear solver
-    typedef Vector<double, PETSC_BASE> Vector_type;
+    typedef Vector<double> Vector_type;
 
-    typedef petsc_solver<double> solver_type;
+    typedef umfpack_solver<double> solver_type;
 };
 
+
 struct equations3d3EPxz {
     //! dimensionaly of the equation ( 3D problem ...)
     static const unsigned int dims = 3;
@@ -400,14 +912,14 @@ struct equations3d3EPz {
     typedef umfpack_solver<double> solver_type;
 };
 
-struct equations3d3Pxz {
+struct equations3d1Pxy {
     //! dimensionaly of the equation ( 3D problem ...)
     static const unsigned int dims = 3;
     //! number of fields in the system
-    static const unsigned int nvar = 3;
+    static const unsigned int nvar = 1;
 
     //! boundary at X and Y
-    static constexpr bool boundary[]={PERIODIC, NON_PERIODIC,PERIODIC};
+    static constexpr bool boundary[]={PERIODIC, PERIODIC,NON_PERIODIC};
 
     //! type of space float, double, ..
     typedef double stype;
@@ -424,14 +936,14 @@ struct equations3d3Pxz {
     typedef petsc_solver<double> solver_type;
 };
 
-struct equations3d1Pz {
+struct equations3d1EPxy {
     //! dimensionaly of the equation ( 3D problem ...)
     static const unsigned int dims = 3;
     //! number of fields in the system
     static const unsigned int nvar = 1;
 
     //! boundary at X and Y
-    static constexpr bool boundary[]={NON_PERIODIC, NON_PERIODIC,PERIODIC};
+    static constexpr bool boundary[]={PERIODIC, PERIODIC,NON_PERIODIC};
 
     //! type of space float, double, ...
     typedef double stype;
@@ -440,18 +952,17 @@ struct equations3d1Pz {
     typedef vector_dist<dims, double, aggregate<double>> b_part;
 
     //! type of SparseMatrix for the linear solver
-    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
 
     //! type of Vector for the linear solver
-    typedef Vector<double, PETSC_BASE> Vector_type;
+    typedef Vector<double> Vector_type;
 
-    typedef petsc_solver<double> solver_type;
+    typedef umfpack_solver<double> solver_type;
 };
 
-#endif
 
-//! Specify the general characteristic of system to solve
-struct equations2d1E {
+#ifdef __NVCC__
+struct equations2d1E_gpu {
 
     //! dimensionaly of the equation ( 3D problem ...)
     static const unsigned int dims=2;
@@ -465,7 +976,7 @@ struct equations2d1E {
     typedef double stype;
 
     //! type of base particles
-    typedef vector_dist<dims, double, aggregate<double>> b_part;
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
 
     //! type of SparseMatrix for the linear solver
     typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
@@ -476,7 +987,7 @@ struct equations2d1E {
     typedef umfpack_solver<double> solver_type;
 };
 
-struct equations2d2E {
+struct equations2d2E_gpu {
     //! dimensionaly of the equation ( 3D problem ...)
     static const unsigned int dims = 2;
     //! number of fields in the system
@@ -489,7 +1000,7 @@ struct equations2d2E {
     typedef double stype;
 
     //! type of base particles
-    typedef vector_dist<dims, double, aggregate<double>> b_part;
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
 
     //! type of SparseMatrix for the linear solver
     typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
@@ -500,7 +1011,7 @@ struct equations2d2E {
     typedef umfpack_solver<double> solver_type;
 };
 
-struct equations2d3E {
+struct equations2d3E_gpu {
     //! dimensionaly of the equation ( 3D problem ...)
     static const unsigned int dims = 2;
     //! number of fields in the system
@@ -513,7 +1024,7 @@ struct equations2d3E {
     typedef double stype;
 
     //! type of base particles
-    typedef vector_dist<dims, double, aggregate<double>> b_part;
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
 
     //! type of SparseMatrix for the linear solver
     typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
@@ -524,7 +1035,7 @@ struct equations2d3E {
     typedef umfpack_solver<double> solver_type;
 };
 
-struct equations2d4E {
+struct equations2d4E_gpu {
     //! dimensionaly of the equation ( 3D problem ...)
     static const unsigned int dims = 2;
     //! number of fields in the system
@@ -537,7 +1048,7 @@ struct equations2d4E {
     typedef double stype;
 
     //! type of base particles
-    typedef vector_dist<dims, double, aggregate<double>> b_part;
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
 
     //! type of SparseMatrix for the linear solver
     typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
@@ -549,7 +1060,7 @@ struct equations2d4E {
 };
 
 
-struct equations2d1pE {
+struct equations2d1pE_gpu {
     //! dimensionaly of the equation ( 3D problem ...)
     static const unsigned int dims = 2;
     //! number of fields in the system
@@ -562,7 +1073,7 @@ struct equations2d1pE {
     typedef double stype;
 
     //! type of base particles
-    typedef vector_dist<dims, double, aggregate<double>> b_part;
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
 
     //! type of SparseMatrix for the linear solver
     typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
@@ -573,7 +1084,7 @@ struct equations2d1pE {
     typedef umfpack_solver<double> solver_type;
 };
 
-struct equations2d2pE {
+struct equations2d2pE_gpu {
     //! dimensionaly of the equation ( 3D problem ...)
     static const unsigned int dims = 2;
     //! number of fields in the system
@@ -586,7 +1097,7 @@ struct equations2d2pE {
     typedef double stype;
 
     //! type of base particles
-    typedef vector_dist<dims, double, aggregate<double>> b_part;
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
 
     //! type of SparseMatrix for the linear solver
     typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
@@ -597,7 +1108,7 @@ struct equations2d2pE {
     typedef umfpack_solver<double> solver_type;
 };
 
-struct equations2d3pE {
+struct equations2d3pE_gpu {
     //! dimensionaly of the equation ( 3D problem ...)
     static const unsigned int dims = 2;
     //! number of fields in the system
@@ -610,7 +1121,7 @@ struct equations2d3pE {
     typedef double stype;
 
     //! type of base particles
-    typedef vector_dist<dims, double, aggregate<double>> b_part;
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
 
     //! type of SparseMatrix for the linear solver
     typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
@@ -621,7 +1132,7 @@ struct equations2d3pE {
     typedef umfpack_solver<double> solver_type;
 };
 
-struct equations3d3E {
+struct equations3d3E_gpu {
     //! dimensionaly of the equation ( 3D problem ...)
     static const unsigned int dims = 3;
     //! number of fields in the system
@@ -634,7 +1145,7 @@ struct equations3d3E {
     typedef double stype;
 
     //! type of base particles
-    typedef vector_dist<dims, double, aggregate<double>> b_part;
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
 
     //! type of SparseMatrix for the linear solver
     typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
@@ -645,7 +1156,7 @@ struct equations3d3E {
     typedef umfpack_solver<double> solver_type;
 };
 
-struct equations3d1E {
+struct equations3d1E_gpu {
     //! dimensionaly of the equation ( 3D problem ...)
     static const unsigned int dims = 3;
     //! number of fields in the system
@@ -658,7 +1169,7 @@ struct equations3d1E {
     typedef double stype;
 
     //! type of base particles
-    typedef vector_dist<dims, double, aggregate<double>> b_part;
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
 
     //! type of SparseMatrix for the linear solver
     typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
@@ -669,7 +1180,55 @@ struct equations3d1E {
     typedef umfpack_solver<double> solver_type;
 };
 
-struct equations3d1Pxy {
+struct equations3d3EPxz_gpu {
+    //! dimensionality of the equation (3D problem)
+    static const unsigned int dims = 3;
+    //! number of fields in the system
+    static const unsigned int nvar = 3;
+
+    //! boundary conditions in X, Y and Z
+    static constexpr bool boundary[]={PERIODIC, NON_PERIODIC,PERIODIC};
+
+    //! type of space float, double, ...
+    typedef double stype;
+
+    //! type of base particles
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+
+    //! type of Vector for the linear solver
+    typedef Vector<double> Vector_type;
+
+    typedef umfpack_solver<double> solver_type;
+};
+
+struct equations3d3EPz_gpu {
+    //! dimensionality of the equation (3D problem)
+    static const unsigned int dims = 3;
+    //! number of fields in the system
+    static const unsigned int nvar = 3;
+
+    //! boundary conditions in X, Y and Z
+    static constexpr bool boundary[]={PERIODIC, PERIODIC,PERIODIC};
+
+    //! type of space float, double, ...
+    typedef double stype;
+
+    //! type of base particles
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
+
+    //! type of SparseMatrix for the linear solver
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
+
+    //! type of Vector for the linear solver
+    typedef Vector<double> Vector_type;
+
+    typedef umfpack_solver<double> solver_type;
+};
+
+struct equations3d1Pxy_gpu {
+    //! dimensionality of the equation (3D problem)
     static const unsigned int dims = 3;
     //! number of fields in the system
@@ -682,18 +1241,18 @@ struct equations3d1Pxy {
     typedef double stype;
 
     //! type of base particles
-    typedef vector_dist<dims, double, aggregate<double>> b_part;
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
 
     //! type of SparseMatrix for the linear solver
-    typedef SparseMatrix<double, int, PETSC_BASE> SparseMatrix_type;
+    typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
 
     //! type of Vector for the linear solver
-    typedef Vector<double, PETSC_BASE> Vector_type;
+    typedef Vector<double> Vector_type;
 
-    typedef petsc_solver<double> solver_type;
+    typedef umfpack_solver<double> solver_type;
 };
 
-struct equations3d1EPxy {
+struct equations3d1EPxy_gpu {
     //! dimensionaly of the equation ( 3D problem ...)
     static const unsigned int dims = 3;
     //! number of fields in the system
@@ -706,7 +1265,7 @@ struct equations3d1EPxy {
     typedef double stype;
 
     //! type of base particles
-    typedef vector_dist<dims, double, aggregate<double>> b_part;
+    typedef vector_dist_gpu<dims, double, aggregate<double>> b_part;
 
     //! type of SparseMatrix for the linear solver
     typedef SparseMatrix<double, int, EIGEN_BASE> SparseMatrix_type;
@@ -716,4 +1275,6 @@ struct equations3d1EPxy {
 
     typedef umfpack_solver<double> solver_type;
 };
+#endif //__NVCC__
+
 #endif //OPENFPM_PDATA_EQNSSTRUCT_HPP
diff --git a/src/DCPSE/DCPSE_op/tests/DCPSE_op_subset_test.cu b/src/DCPSE/DCPSE_op/tests/DCPSE_op_subset_test.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7903f53e6fe798d3f446cc066c24ecc1aa68992b
--- /dev/null
+++ b/src/DCPSE/DCPSE_op/tests/DCPSE_op_subset_test.cu
@@ -0,0 +1,617 @@
+/*
+ * DCPSE_op_subset_test.cu
+ *
+ *  Created on: May 15, 2020
+ *      Author: Abhinav Singh
+ *
+ */
+#include "config.h"
+#ifdef HAVE_EIGEN
+#ifdef HAVE_PETSC
+
+
+#define BOOST_TEST_DYN_LINK
+
+#include "util/util_debug.hpp"
+#include <boost/test/unit_test.hpp>
+#include <iostream>
+#include "../DCPSE_op.hpp"
+#include "../DCPSE_Solver.hpp"
+#include "../DCPSE_Solver.cuh"
+#include "Operators/Vector/vector_dist_operators.hpp"
+#include "Vector/vector_dist_subset.hpp"
+#include "../EqnsStruct.hpp"
+
+//template<typename T>
+//struct Debug;
+
+
+
+
+BOOST_AUTO_TEST_SUITE(dcpse_op_subset_suite_tests_cu)
+
+    BOOST_AUTO_TEST_CASE(dcpse_op_subset_tests) {
+        size_t edgeSemiSize = 40;
+        const size_t sz[2] = {2 * edgeSemiSize, 2 * edgeSemiSize};
+        Box<2, double> box({0, 0}, {1.0,1.0});
+        size_t bc[2] = {NON_PERIODIC, NON_PERIODIC};
+        double spacing[2];
+        spacing[0] = 1.0 / (sz[0] - 1);
+        spacing[1] = 1.0 / (sz[1] - 1);
+        double rCut = 3.9 * spacing[0];
+        int ord = 2;
+        double sampling_factor = 4.0;
+        Ghost<2, double> ghost(rCut);
+        BOOST_TEST_MESSAGE("Init vector_dist...");
+        double sigma2 = spacing[0] * spacing[1] / (2 * 4);
+
+        vector_dist_ws_gpu<2, double, aggregate<double, double, double, VectorS<2, double>, VectorS<2, double>,VectorS<2, double>,double>> Particles(0, box,
+                                                                                                                 bc,
+                                                                                                                 ghost);
+
+        //Init_DCPSE(Particles)
+        BOOST_TEST_MESSAGE("Init Particles...");
+        std::mt19937 rng{6666666};
+
+        std::normal_distribution<> gaussian{0, sigma2};
+
+//        openfpm::vector<aggregate<int>> bulk;
+//        openfpm::vector<aggregate<int>> boundary;
+
+        auto it = Particles.getGridIterator(sz);
+        size_t pointId = 0;
+        size_t counter = 0;
+        double minNormOne = 999;
+        while (it.isNext())
+        {
+            Particles.add();
+            auto key = it.get();
+            mem_id k0 = key.get(0);
+            double x = k0 * spacing[0];
+            Particles.getLastPos()[0] = x;//+ gaussian(rng);
+            mem_id k1 = key.get(1);
+            double y = k1 * spacing[1];
+            Particles.getLastPos()[1] = y;//+gaussian(rng);
+            // Here fill the function value
+            Particles.template getLastProp<0>() = sin(Particles.getLastPos()[0]) + sin(Particles.getLastPos()[1]);
+
+            if (k0 != 0 && k1 != 0 && k0 != sz[0] -1 && k1 != sz[1] - 1)
+            {
+//              bulk.add();
+//              bulk.template get<0>(bulk.size()-1) = Particles.size_local() - 1;
+
+                Particles.getLastSubset(0);
+            }
+            else
+            {
+//                boundary.add();
+//                boundary.template get<0>(boundary.size()-1) = Particles.size_local() - 1;
+                Particles.getLastSubset(1);
+            }
+            
+
+            ++counter;
+            ++it;
+        }
+        BOOST_TEST_MESSAGE("Sync Particles across processors...");
+
+        Particles.map();
+        Particles.ghost_get<0>();
+
+        auto git = Particles.getGhostIterator();
+
+        while (git.isNext())
+        {
+            auto p = git.get();
+
+            Particles.template getProp<0>(p) = std::numeric_limits<double>::quiet_NaN();
+
+            ++git;
+        }
+
+        vector_dist_subset_gpu<2, double, aggregate<double, double, double, VectorS<2, double>, VectorS<2, double>,VectorS<2, double>,double>> Particles_bulk(Particles,0);
+        vector_dist_subset_gpu<2, double, aggregate<double, double, double, VectorS<2, double>, VectorS<2, double>,VectorS<2, double>,double>> Particles_boundary(Particles,1);
+        auto & boundary = Particles_boundary.getIds();
+
+        // move particles
+        auto P = getV<0>(Particles);
+        auto Out = getV<1>(Particles);
+        auto Pb = getV<2>(Particles);
+        auto Out_V = getV<3>(Particles);
+
+
+        auto P_bulk = getV<2>(Particles_bulk);
+        auto Out_bulk = getV<1>(Particles_bulk);
+        auto Out_V_bulk = getV<3>(Particles_bulk);
+        Out=10;
+        P_bulk = 5;
+
+        P_bulk=Pb+Out;
+//        Particles.write("Test_output_subset");
+
+        // Create the subset
+
+       /* Derivative_x Dx(Particles, 2, rCut);
+        Derivative_y Dy(Particles, 2, rCut);
+        Derivative_x Dx_bulk(Particles_bulk, 2, rCut);
+*/
+        Derivative_x_gpu Dx_bulk(Particles_bulk, 2, rCut,sampling_factor, support_options::RADIUS);
+        Derivative_y_gpu Dy_bulk(Particles_bulk, 2, rCut,sampling_factor, support_options::RADIUS);
+
+        Out_bulk = Dx_bulk(P);
+        Out_V_bulk[0] = P + Dx_bulk(P);
+        Out_V_bulk[1] = Out_V[0] +Dy_bulk(P);
+
+    // Check
+        bool is_nan = false;
+
+        auto & v_cl = create_vcluster();
+        if (v_cl.size() > 1)
+        {
+            auto it2 = Particles_bulk.getDomainIterator();
+            while (it2.isNext())
+            {
+            auto p = it2.get();
+
+    /*      BOOST_REQUIRE_EQUAL(Particles_bulk.getProp<2>(p),15.0);
+            BOOST_REQUIRE(fabs(Particles_bulk.getProp<1>(p) - cos(Particles_bulk.getPos(p)[0])) < 0.005 );
+            BOOST_REQUIRE(fabs(Particles_bulk.getProp<3>(p)[0] - Particles_bulk.getProp<0>(p) - cos(Particles_bulk.getPos(p)[0])) < 0.001 );
+            BOOST_REQUIRE(fabs(Particles_bulk.getProp<3>(p)[1] - Particles_bulk.getProp<3>(p)[0] - cos(Particles_bulk.getPos(p)[1])) < 0.001 );*/
+
+                is_nan |= std::isnan(Particles_bulk.template getProp<1>(p));
+
+    //        Particles_bulk.template getProp<0>(p) = fabs(Particles_bulk.getProp<1>(p) - cos(Particles_bulk.getPos(p)[0]));
+
+                ++it2;
+            }
+
+            BOOST_REQUIRE_EQUAL(is_nan,true);
+        }
+
+//        P_bulk = Dx_bulk(P_bulk);  <------------ Incorrect produce error message
+//        P = Dx_bulk(P);   <------- Incorrect produce overflow
+
+        Particles.ghost_get<0>();
+
+        for (int i = 0 ; i < boundary.size() ; i++)
+        {
+            Particles.template getProp<0>(boundary.template get<0>(i)) = std::numeric_limits<double>::quiet_NaN();
+        }
+
+        Particles.ghost_get<0>();
+
+        Out_bulk = Dx_bulk(P);
+        Out_V_bulk[0] = P + Dx_bulk(P);
+        Out_V_bulk[1] = Out_V[0] +Dy_bulk(P);
+
+        auto it2 = Particles_bulk.getDomainIterator();
+        while (it2.isNext())
+        {
+            auto p = it2.get();
+
+            BOOST_REQUIRE_EQUAL(Particles_bulk.getProp<2>(p),15.0);
+            BOOST_REQUIRE(fabs(Particles_bulk.getProp<1>(p) - cos(Particles_bulk.getPos(p)[0])) < 0.005 );
+            BOOST_REQUIRE(fabs(Particles_bulk.getProp<3>(p)[0] - Particles_bulk.getProp<0>(p) - cos(Particles_bulk.getPos(p)[0])) < 0.001 );
+            BOOST_REQUIRE(fabs(Particles_bulk.getProp<3>(p)[1] - Particles_bulk.getProp<3>(p)[0] - cos(Particles_bulk.getPos(p)[1])) < 0.001 );
+
+            ++it2;
+        }
+
+
+
+    }
+
+    BOOST_AUTO_TEST_CASE(dcpse_op_subset_PC_lid) {
+//  int rank;
+//  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+        constexpr int x = 0;
+        constexpr int y = 1;
+        size_t edgeSemiSize = 20;
+        const size_t sz[2] = {2 * edgeSemiSize+1, 2 * edgeSemiSize+1};
+        Box<2, double> box({0, 0}, {1.0,1.0});
+        size_t bc[2] = {NON_PERIODIC, NON_PERIODIC};
+        double spacing[2];
+        spacing[0] = 1.0 / (sz[0] - 1);
+        spacing[1] = 1.0 / (sz[1] - 1);
+        double rCut = 3.9 * spacing[0];
+        int ord = 2;
+        double sampling_factor = 4.0;
+        Ghost<2, double> ghost(rCut);
+        BOOST_TEST_MESSAGE("Init vector_dist...");
+        double sigma2 = spacing[0] * spacing[1] / (2 * 4);
+        auto &v_cl = create_vcluster();
+
+        typedef  aggregate<double, VectorS<2, double>, VectorS<2, double>,VectorS<2, double>,double,VectorS<2, double>,VectorS<2, double>,double> particle_type;
+
+        vector_dist_ws_gpu<2, double, particle_type> Particles(0, box,bc,ghost);
+
+        //Init_DCPSE(Particles)
+        BOOST_TEST_MESSAGE("Init Particles...");
+
+//        openfpm::vector<aggregate<int>> bulk;
+//        openfpm::vector<aggregate<int>> boundary;
+
+        auto it = Particles.getGridIterator(sz);
+        while (it.isNext())
+        {
+            Particles.add();
+            auto key = it.get();
+            mem_id k0 = key.get(0);
+            double xp0 = k0 * spacing[0];
+            Particles.getLastPos()[0] = xp0;
+            mem_id k1 = key.get(1);
+            double yp0 = k1 * spacing[1];
+            Particles.getLastPos()[1] = yp0;
+            ++it;
+        }
+        BOOST_TEST_MESSAGE("Sync Particles across processors...");
+        Particles.map();
+        Particles.ghost_get<0>();
+
+        auto it2 = Particles.getDomainIterator();
+        while (it2.isNext()) {
+            auto p = it2.get();
+            Point<2, double> xp = Particles.getPos(p);
+            if (xp[0] != 0 && xp[1] != 0 && xp[0] != 1.0 && xp[1] != 1.0) {
+//                bulk.add();
+//                bulk.last().get<0>() = p.getKey();
+                Particles.setSubset(p,0);
+                Particles.getProp<3>(p)[x] = 3.0;
+                Particles.getProp<3>(p)[y] = 3.0;
+            } else {
+//                boundary.add();
+//                boundary.last().get<0>() = p.getKey();
+                Particles.setSubset(p,1);
+                Particles.getProp<3>(p)[x] = xp[0]*xp[0]+xp[1]*xp[1];
+                Particles.getProp<3>(p)[y] = xp[0]*xp[0]-2*xp[0]*xp[1];
+            }
+            Particles.getProp<6>(p)[x] = xp[0]*xp[0]+xp[1]*xp[1];
+            Particles.getProp<6>(p)[y] = xp[0]*xp[0]-2*xp[0]*xp[1];
+            Particles.getProp<7>(p) = xp[0]+xp[1]-1.0;
+
+            ++it2;
+        }
+
+        vector_dist_subset_gpu<2, double, particle_type> Particles_bulk(Particles,0);
+        vector_dist_subset_gpu<2, double, particle_type> Particles_boundary(Particles,1);
+        auto & bulk = Particles_bulk.getIds();
+        auto & boundary = Particles_boundary.getIds();
+
+        auto P = getV<0>(Particles);
+        auto V = getV<1>(Particles);
+        auto RHS = getV<2>(Particles);
+        auto dV = getV<3>(Particles);
+        auto div = getV<4>(Particles);
+        auto V_star = getV<5>(Particles);
+
+
+        auto P_bulk = getV<0>(Particles_bulk);
+        auto RHS_bulk =getV<2>(Particles_bulk);
+
+        P_bulk = 0;
+
+        Derivative_x_gpu Dx(Particles, 2, rCut,sampling_factor, support_options::RADIUS);
+        Derivative_xx_gpu Dxx(Particles, 2, rCut,sampling_factor, support_options::RADIUS);
+        Derivative_yy_gpu Dyy(Particles, 2, rCut,sampling_factor, support_options::RADIUS);
+        Derivative_y_gpu Dy(Particles, 2, rCut,sampling_factor, support_options::RADIUS);
+        Derivative_x_gpu Bulk_Dx(Particles_bulk, 2, rCut,sampling_factor, support_options::RADIUS);
+        Derivative_y_gpu Bulk_Dy(Particles_bulk, 2, rCut,sampling_factor, support_options::RADIUS);
+
+        int n = 0, nmax = 5, ctr = 0, errctr=1, Vreset = 0;
+        double V_err=1;
+        if (Vreset == 1) {
+            P_bulk = 0;
+            P = 0;
+            Vreset = 0;
+        }
+        P=0;
+        eq_id vx,vy;
+        vx.setId(0);
+        vy.setId(1);
+        double sum, sum1, sum_k,V_err_eps=1e-3,V_err_old;
+        auto Stokes1=Dxx(V[x])+Dyy(V[x]);
+        auto Stokes2=Dxx(V[y])+Dyy(V[y]);
+
+        petsc_solver<double> solverPetsc;
+        //solverPetsc.setSolver(KSPGMRES);
+        //solverPetsc.setRestart(250);
+        //solverPetsc.setPreconditioner(PCJACOBI);
+        V_star=0;
+        RHS[x] = dV[x];
+        RHS[y] = dV[y];
+        while (V_err >= V_err_eps && n <= nmax) {
+            Particles.ghost_get<0>(SKIP_LABELLING);
+            RHS_bulk[x] = dV[x] + Bulk_Dx(P);
+            RHS_bulk[y] = dV[y] + Bulk_Dy(P);
+            DCPSE_scheme_gpu<equations2d2_gpu, decltype(Particles)> Solver(Particles);
+            Solver.impose(Stokes1, bulk, RHS[0], vx);
+            Solver.impose(Stokes2, bulk, RHS[1], vy);
+            Solver.impose(V[x], boundary, RHS[0], vx);
+            Solver.impose(V[y], boundary, RHS[1], vy);
+
+            /*auto A=Solver.getA(options_solver::STANDARD);
+            //A.getMatrixTriplets().save("Tripletes");
+            A.write("Mat_lid");*/
+
+            Solver.solve_with_solver(solverPetsc, V[x], V[y]);
+            Particles.ghost_get<1>(SKIP_LABELLING);
+            div = -(Dx(V[x]) + Dy(V[y]));
+            P_bulk = P + div;
+            sum = 0;
+            sum1 = 0;
+
+            for (int j = 0; j < bulk.size(); j++) {
+                auto p = bulk.get<0>(j);
+                sum += (Particles.getProp<5>(p)[0] - Particles.getProp<1>(p)[0]) *
+                       (Particles.getProp<5>(p)[0] - Particles.getProp<1>(p)[0]) +
+                       (Particles.getProp<5>(p)[1] - Particles.getProp<1>(p)[1]) *
+                       (Particles.getProp<5>(p)[1] - Particles.getProp<1>(p)[1]);
+                sum1 += Particles.getProp<1>(p)[0] * Particles.getProp<1>(p)[0] +
+                        Particles.getProp<1>(p)[1] * Particles.getProp<1>(p)[1];
+            }
+
+            sum = sqrt(sum);
+            sum1 = sqrt(sum1);
+            V_star = V;
+            v_cl.sum(sum);
+            v_cl.sum(sum1);
+            v_cl.execute();
+            V_err_old = V_err;
+            V_err = sum / sum1;
+            if (V_err > V_err_old || abs(V_err_old - V_err) < 1e-8) {
+                errctr++;
+                //alpha_P -= 0.1;
+            } else {
+                errctr = 0;
+            }
+            if (n > 3) {
+                if (errctr > 3) {
+                    std::cout << "CONVERGENCE LOOP BROKEN DUE TO INCREASE/VERY SLOW DECREASE IN ERROR" << std::endl;
+                    Vreset = 1;
+                    break;
+                } else {
+                    Vreset = 0;
+                }
+            }
+            n++;
+            if (v_cl.rank() == 0) {
+                std::cout << "Rel l2 cgs err in V = " << V_err << " at " << n << std::endl;
+            }
+        }
+        double worst1 = 0.0;
+        double worst2 = 0.0;
+
+        for(int j=0;j<bulk.size();j++)
+        {   auto p=bulk.get<0>(j);
+            if (fabs(Particles.getProp<6>(p)[0] - Particles.getProp<1>(p)[0]) >= worst1) {
+                worst1 = fabs(Particles.getProp<6>(p)[0] - Particles.getProp<1>(p)[0]);
+            }
+        }
+        for(int j=0;j<bulk.size();j++)
+        {   auto p=bulk.get<0>(j);
+            if (fabs(Particles.getProp<6>(p)[1] - Particles.getProp<1>(p)[1]) >= worst2) {
+                worst2 = fabs(Particles.getProp<6>(p)[1] - Particles.getProp<1>(p)[1]);
+            }
+        }
+        //Particles.deleteGhost();
+        //Particles.write("PC_subset_lid");
+        std::cout << "Maximum Analytic Error in Vx: " << worst1 << std::endl;
+        std::cout << "Maximum Analytic Error in Vy: " << worst2 << std::endl;
+        BOOST_REQUIRE(worst1 < 0.03);
+        BOOST_REQUIRE(worst2 < 0.03);
+
+    }
+
+
+    BOOST_AUTO_TEST_CASE(dcpse_op_subset_PC_lid2) {
+//  int rank;
+//  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+        constexpr int x = 0;
+        constexpr int y = 1;
+        size_t edgeSemiSize = 20;
+        const size_t sz[2] = {2 * edgeSemiSize+1, 2 * edgeSemiSize+1};
+        Box<2, double> box({0, 0}, {1.0,1.0});
+        size_t bc[2] = {NON_PERIODIC, NON_PERIODIC};
+        double spacing[2];
+        spacing[0] = 1.0 / (sz[0] - 1);
+        spacing[1] = 1.0 / (sz[1] - 1);
+        double rCut = 3.9 * spacing[0];
+        int ord = 2;
+        double sampling_factor = 4.0;
+        Ghost<2, double> ghost(rCut);
+        BOOST_TEST_MESSAGE("Init vector_dist...");
+        auto &v_cl = create_vcluster();
+
+        vector_dist<2, double, aggregate<double, VectorS<2, double>, VectorS<2, double>,VectorS<2, double>,double,VectorS<2, double>,VectorS<2, double>,double>> Particles(0, box,
+                                                                                                                                                 bc,
+                                                                                                                                                 ghost);
+        vector_dist<2, double, aggregate<double, VectorS<2, double>, VectorS<2, double>,VectorS<2, double>,double,VectorS<2, double>,VectorS<2, double>,double>> Particles_subset(Particles.getDecomposition(), 0);
+
+
+        //Init_DCPSE(Particles)
+        BOOST_TEST_MESSAGE("Init Particles...");
+
+        openfpm::vector<aggregate<int>> bulk;
+        openfpm::vector<aggregate<int>> boundary;
+
+        auto it = Particles.getGridIterator(sz);
+        size_t pointId = 0;
+        double minNormOne = 999;
+        while (it.isNext())
+        {
+            Particles.add();
+            auto key = it.get();
+            mem_id k0 = key.get(0);
+            double xp0 = k0 * spacing[0];
+            Particles.getLastPos()[0] = xp0;
+            mem_id k1 = key.get(1);
+            double yp0 = k1 * spacing[1];
+            Particles.getLastPos()[1] = yp0;
+            ++it;
+        }
+        BOOST_TEST_MESSAGE("Sync Particles across processors...");
+        Particles.map();
+        Particles.ghost_get<0>();
+        auto it2 = Particles.getDomainIterator();
+        while (it2.isNext()) {
+            auto p = it2.get();
+            Point<2, double> xp = Particles.getPos(p);
+            if (xp[0] != 0 && xp[1] != 0 && xp[0] != 1.0 && xp[1] != 1.0) {
+                bulk.add();
+                bulk.last().get<0>() = p.getKey();
+                Particles.getProp<3>(p)[x] = 3.0;
+                Particles.getProp<3>(p)[y] = 3.0;
+            } else {
+                boundary.add();
+                boundary.last().get<0>() = p.getKey();
+                Particles.getProp<3>(p)[x] = xp[0]*xp[0]+xp[1]*xp[1];
+                Particles.getProp<3>(p)[y] = xp[0]*xp[0]-2*xp[0]*xp[1];
+            }
+            Particles.getProp<6>(p)[x] = xp[0]*xp[0]+xp[1]*xp[1];
+            Particles.getProp<6>(p)[y] = xp[0]*xp[0]-2*xp[0]*xp[1];
+            Particles.getProp<7>(p) = xp[0]+xp[1]-1.0;
+
+            ++it2;
+        }
+
+        for (int i = 0; i < bulk.size(); i++) {
+            Particles_subset.add();
+            Particles_subset.getLastPos()[0] = Particles.getPos(bulk.template get<0>(i))[0];
+            Particles_subset.getLastPos()[1] = Particles.getPos(bulk.template get<0>(i))[1];
+        }
+        Particles_subset.map();
+        Particles_subset.ghost_get<0>();
+
+
+
+        auto P = getV<0>(Particles);
+        auto V = getV<1>(Particles);
+        auto RHS = getV<2>(Particles);
+        auto dV = getV<3>(Particles);
+        auto div = getV<4>(Particles);
+        auto V_star = getV<5>(Particles);
+
+        auto P_bulk = getV<0>(Particles_subset);
+        auto Grad_bulk= getV<2>(Particles_subset);
+
+        P_bulk = 0;
+
+        Derivative_x Dx(Particles, 2, rCut,sampling_factor, support_options::RADIUS);
+        Derivative_x Bulk_Dx(Particles_subset, 2, rCut,sampling_factor, support_options::RADIUS);
+        Derivative_xx Dxx(Particles, 2, rCut,sampling_factor, support_options::RADIUS);
+        Derivative_yy Dyy(Particles, 2, rCut,sampling_factor, support_options::RADIUS);
+        Derivative_y Dy(Particles, 2, rCut,sampling_factor, support_options::RADIUS),Bulk_Dy(Particles_subset, 2, rCut,sampling_factor, support_options::RADIUS);;
+
+        int n = 0, nmax = 5, ctr = 0, errctr=0, Vreset = 0;
+        double V_err=1;
+        if (Vreset == 1) {
+            P_bulk = 0;
+            P = 0;
+            Vreset = 0;
+        }
+        P=0;
+        eq_id vx,vy;
+        vx.setId(0);
+        vy.setId(1);
+        double sum, sum1, sum_k,V_err_eps=1e-3,V_err_old;
+        auto Stokes1=Dxx(V[x])+Dyy(V[x]);
+        auto Stokes2=Dxx(V[y])+Dyy(V[y]);
+
+        petsc_solver<double> solverPetsc;
+        //solverPetsc.setSolver(KSPGMRES);
+        //solverPetsc.setRestart(250);
+        //solverPetsc.setPreconditioner(PCJACOBI);
+        V_star=0;
+        while (V_err >= V_err_eps && n <= nmax) {
+            RHS[x] = dV[x];
+            RHS[y] = dV[y];
+            Particles_subset.ghost_get<0>(SKIP_LABELLING);
+
+            Grad_bulk[x] = Bulk_Dx(P_bulk);
+            Grad_bulk[y] = Bulk_Dy(P_bulk);
+            for (int i = 0; i < bulk.size(); i++) {
+                Particles.template getProp<2>(bulk.template get<0>(i))[x] += Particles_subset.getProp<2>(i)[x];
+                Particles.template getProp<2>(bulk.template get<0>(i))[y] += Particles_subset.getProp<2>(i)[y];
+            }
+
+            DCPSE_scheme<equations2d2_gpu, decltype(Particles)> Solver(Particles);
+            Solver.impose(Stokes1, bulk, RHS[0], vx);
+            Solver.impose(Stokes2, bulk, RHS[1], vy);
+            Solver.impose(V[x], boundary, RHS[0], vx);
+            Solver.impose(V[y], boundary, RHS[1], vy);
+            Solver.solve_with_solver(solverPetsc, V[x], V[y]);
+            Particles.ghost_get<1>(SKIP_LABELLING);
+            div = -(Dx(V[x]) + Dy(V[y]));
+            P = P + div;
+            for (int i = 0; i < bulk.size(); i++) {
+                Particles_subset.getProp<0>(i) = Particles.template getProp<0>(bulk.template get<0>(i));
+            }
+            sum = 0;
+            sum1 = 0;
+
+            for (int j = 0; j < bulk.size(); j++) {
+                auto p = bulk.get<0>(j);
+                sum += (Particles.getProp<5>(p)[0] - Particles.getProp<1>(p)[0]) *
+                       (Particles.getProp<5>(p)[0] - Particles.getProp<1>(p)[0]) +
+                       (Particles.getProp<5>(p)[1] - Particles.getProp<1>(p)[1]) *
+                       (Particles.getProp<5>(p)[1] - Particles.getProp<1>(p)[1]);
+                sum1 += Particles.getProp<1>(p)[0] * Particles.getProp<1>(p)[0] +
+                        Particles.getProp<1>(p)[1] * Particles.getProp<1>(p)[1];
+            }
+
+            sum = sqrt(sum);
+            sum1 = sqrt(sum1);
+            V_star=V;
+            v_cl.sum(sum);
+            v_cl.sum(sum1);
+            v_cl.execute();
+            V_err_old = V_err;
+            V_err = sum / sum1;
+            if (V_err > V_err_old || abs(V_err_old - V_err) < 1e-8) {
+                errctr++;
+                //alpha_P -= 0.1;
+            } else {
+                errctr = 0;
+            }
+            if (n > 3) {
+                if (errctr > 3) {
+                    std::cout << "CONVERGENCE LOOP BROKEN DUE TO INCREASE/VERY SLOW DECREASE IN ERROR" << std::endl;
+                    Vreset = 1;
+                    break;
+                } else {
+                    Vreset = 0;
+                }
+            }
+            n++;
+            if (v_cl.rank() == 0) {
+            std::cout << "Rel l2 cgs err in V = " << V_err << " at " << n << std::endl;
+
+            }
+        }
+        double worst1 = 0.0;
+        double worst2 = 0.0;
+
+        for(int j=0;j<bulk.size();j++)
+        {   auto p=bulk.get<0>(j);
+            if (fabs(Particles.getProp<6>(p)[0] - Particles.getProp<1>(p)[0]) >= worst1) {
+                worst1 = fabs(Particles.getProp<6>(p)[0] - Particles.getProp<1>(p)[0]);
+            }
+        }
+        for(int j=0;j<bulk.size();j++)
+        {   auto p=bulk.get<0>(j);
+            if (fabs(Particles.getProp<6>(p)[1] - Particles.getProp<1>(p)[1]) >= worst2) {
+                worst2 = fabs(Particles.getProp<6>(p)[1] - Particles.getProp<1>(p)[1]);
+            }
+        }
+
+        std::cout << "Maximum Analytic Error in slice x: " << worst1 << std::endl;
+        std::cout << "Maximum Analytic Error in slice y: " << worst2 << std::endl;
+        BOOST_REQUIRE(worst1 < 0.03);
+        BOOST_REQUIRE(worst2 < 0.03);
+
+        //Particles.write("PC_subset_lid2");
+    }
+
+BOOST_AUTO_TEST_SUITE_END()
+#endif
+#endif
\ No newline at end of file
diff --git a/src/DCPSE/Dcpse.cuh b/src/DCPSE/Dcpse.cuh
index 01ef4828ecdeb085657d1c3761439d4c20657376..327265041093a92c577c2b48b3c3e957aba47bda 100644
--- a/src/DCPSE/Dcpse.cuh
+++ b/src/DCPSE/Dcpse.cuh
@@ -13,10 +13,8 @@
 #include "SupportBuilder.cuh"
 #include "Support.hpp"
 #include "Vandermonde.hpp"
-#include "Vandermonde.cuh"
 #include "DcpseDiagonalScalingMatrix.hpp"
 #include "DcpseRhs.hpp"
-#include "DcpseRhs.cuh"
 
 #include <chrono>
 
@@ -31,10 +29,7 @@ __global__ void calcKernels_gpu(particles_type, monomialBasis_type, supportKey_t
 
 template<unsigned int dim, typename T, typename particles_type, typename monomialBasis_type, typename supportKey_type, typename localEps_type, typename matrix_type>
 __global__ void assembleLocalMatrices_gpu( particles_type, Point<dim, unsigned int>, unsigned int, monomialBasis_type, supportKey_type, supportKey_type, supportKey_type,
-    T**, T**, localEps_type, localEps_type, matrix_type, matrix_type, matrix_type, size_t, size_t);
-
-template <unsigned int dim, typename T, typename monomialBasis_type>
-__device__ T computeKernel_gpu(Point<dim, T>&, const T*, const monomialBasis_type&);
+    T**, T**, localEps_type, localEps_type, matrix_type, size_t, size_t);
 
 
 template<unsigned int dim, typename vector_type, class T = typename vector_type::stype>
@@ -69,6 +64,8 @@ private:
     openfpm::vector_custd<T> localEpsInvPow; // Each MPI rank has just access to the local ones
     openfpm::vector_custd<T> calcKernels;
 
+    openfpm::vector<size_t> subsetKeyPid;
+
     vector_type & particles;
     double rCut;
     unsigned int convergenceOrder;
@@ -122,6 +119,7 @@ public:
             differentialSignature(differentialSignature),
             differentialOrder(Monomial<dim>(differentialSignature).order()),
             monomialBasis(differentialSignature.asArray(), convergenceOrder),
+            subsetKeyPid(other.subsetKeyPid),
             supportRefs(other.supportRefs),
             supportKeys1D(other.supportKeys1D),
             kerOffsets(other.kerOffsets),
@@ -200,7 +198,7 @@ public:
             momenta.template get<1>(i) = -3000000000.0;
         }
 
-        size_t N = particles.size_local_orig();
+        size_t N = particles.size_local();
         for (size_t j = 0; j < N; ++j)
         {
             double eps = localEps.get(j);
@@ -220,7 +218,7 @@ public:
             for (int i = 0; i < supportKeysSize; i++)
             {
                 size_t xqK = supportKeys[i];
-                Point<dim, T> xq = particles.getPosOrig(xqK);
+                Point<dim, T> xq = particles.getPos(xqK);
                 Point<dim, T> normalizedArg = (xp - xq) / eps;
 
                 auto ker = calcKernels.get(kerOff+i);
@@ -274,7 +272,7 @@ public:
             sign = -1;
         }
 
-        size_t N = particles.size_local_orig();
+        size_t N = particles.size_local();
         for (size_t j = 0; j < N; ++j)
         {
             double epsInvPow = localEpsInvPow.get(j);
@@ -373,8 +371,9 @@ public:
             sign = -1;
         }
 
-        double eps = localEps.get(key.getKey());
-        double epsInvPow = localEpsInvPow.get(key.getKey());
+        size_t localKey = subsetKeyPid.get(key.getKey());
+        double eps = localEps.get(localKey);
+        double epsInvPow = localEpsInvPow.get(localKey);
 
         auto &particles = o1.getVector();
 
@@ -386,13 +385,13 @@ public:
 #endif
 
         expr_type Dfxp = 0;
-        size_t xpK = supportRefs.get(key.getKey());
+        size_t xpK = supportRefs.get(localKey);
         Point<dim, T> xp = particles.getPos(xpK);
         expr_type fxp = sign * o1.value(key);
         size_t kerOff = kerOffsets.get(xpK);
 
-        size_t  supportKeysSize = kerOffsets.get(key.getKey()+1)-kerOffsets.get(key.getKey());
-        size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(key.getKey())];
+        size_t  supportKeysSize = kerOffsets.get(localKey+1)-kerOffsets.get(localKey);
+        size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(localKey)];
 
         for (int i = 0; i < supportKeysSize; i++)
         {
@@ -401,8 +400,8 @@ public:
             Dfxp = Dfxp + (fxq + fxp) * calcKernels.get(kerOff+i);
         }
         Dfxp = Dfxp * epsInvPow;
-        //
-        //T trueDfxp = particles.template getProp<2>(xpK);
+
+        // T trueDfxp = particles.template getProp<2>(xpK);
         // Store Dfxp in the right position
         return Dfxp;
     }
@@ -429,8 +428,9 @@ public:
             sign = -1;
         }
 
-        double eps = localEps.get(key.getKey());
-        double epsInvPow = localEpsInvPow(key.getKey());
+        size_t localKey = subsetKeyPid.get(key.getKey());
+        double eps = localEps.get(localKey);
+        double epsInvPow = localEpsInvPow(localKey);
 
         auto &particles = o1.getVector();
 
@@ -442,13 +442,13 @@ public:
 #endif
 
         expr_type Dfxp = 0;
-        size_t xpK = supportRefs.get(key.getKey());
+        size_t xpK = supportRefs.get(localKey);
 
         Point<dim, T> xp = particles.getPos(xpK);
         expr_type fxp = sign * o1.value(key)[i];
         size_t kerOff = kerOffsets.get(xpK);
-        size_t  supportKeysSize = kerOffsets.get(key.getKey()+1)-kerOffsets.get(key.getKey());
-        size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(key.getKey())];
+        size_t  supportKeysSize = kerOffsets.get(localKey+1)-kerOffsets.get(localKey);
+        size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(localKey)];
 
         for (int j = 0; j < supportKeysSize; j++)
         {
@@ -475,6 +475,7 @@ public:
         localEps.clear();
         localEpsInvPow.clear();
         calcKernels.clear();
+        subsetKeyPid.clear();
 
         initializeStaticSize(particles, convergenceOrder, rCut, supportSizeFactor);
     }
@@ -488,38 +489,38 @@ private:
 #ifdef SE_CLASS1
         this->update_ctr=particles.getMapCtr();
 #endif
-        SupportBuilder<vector_type> supportBuilder(particles, differentialSignature, rCut);
-        unsigned int requiredSupportSize = monomialBasis.size();
-
-        if (!isSharedSupport)
-            supportRefs.resize(particles.size_local_orig());
-        localEps.resize(particles.size_local_orig());
-        localEpsInvPow.resize(particles.size_local_orig());
-        kerOffsets.resize(particles.size_local_orig()+1);
-
-        // need to resize supportKeys1D to yet unknown supportKeysTotalN
-        // add() takes too long
-        openfpm::vector<openfpm::vector<size_t>> tempSupportKeys(supportRefs.size());
-        const T condVTOL = 1e2;
 
-        auto it = particles.getDomainIterator();
-        while (it.isNext()) {
-            size_t sz;
-            auto key_o = particles.getOriginKey(it.get());
+        if (!isSharedSupport) {
+            subsetKeyPid.resize(particles.size_local_orig());
+            supportRefs.resize(particles.size_local());
+        }
+        localEps.resize(particles.size_local());
+        localEpsInvPow.resize(particles.size_local());
+        kerOffsets.resize(particles.size_local()+1);
 
+        const T condVTOL = 1e2;
 
-            if (!isSharedSupport){
+        if (!isSharedSupport) {
+            SupportBuilder<vector_type> supportBuilder(particles, differentialSignature, rCut);
+            unsigned int requiredSupportSize = monomialBasis.size();
+            // need to resize supportKeys1D to yet unknown supportKeysTotalN
+            // add() takes too long
+            openfpm::vector<openfpm::vector<size_t>> tempSupportKeys(supportRefs.size());
+
+            auto it = particles.getDomainIterator();
+            while (it.isNext()) {
                 auto key_o = particles.getOriginKey(it.get());
+                subsetKeyPid.get(key_o.getKey()) = it.get().getKey();
 
                 Support support = supportBuilder.getSupport(it, requiredSupportSize, opt);
-                supportRefs.get(key_o.getKey()) = support.getReferencePointKey();
+                supportRefs.get(key_o.getKey()) = key_o.getKey();
                 tempSupportKeys.get(key_o.getKey()) = support.getKeys();
                 kerOffsets.get(key_o.getKey()) = supportKeysTotalN;
 
-                if (maxSupportSize < support.size()) maxSupportSize = support.size();
+                if (maxSupportSize < support.size())
+                    maxSupportSize = support.size();
                 supportKeysTotalN += support.size();
 
-
                 EMatrix<T, Eigen::Dynamic, Eigen::Dynamic> V(support.size(), monomialBasis.size());
                 // Vandermonde matrix computation
                 Vandermonde<dim, T, EMatrix<T, Eigen::Dynamic, Eigen::Dynamic>>
@@ -534,11 +535,10 @@ private:
                     std::cout << "INFO: Increasing, requiredSupportSize = " << requiredSupportSize << std::endl; // debug
                     continue;
                 } else requiredSupportSize = monomialBasis.size();
+
+                ++it;
             }
-            ++it;
-        }
 
-        if (!isSharedSupport){
             kerOffsets.get(supportRefs.size()) = supportKeysTotalN;
             supportKeys1D.resize(supportKeysTotalN);
 
@@ -559,38 +559,38 @@ private:
 #ifdef SE_CLASS1
         this->update_ctr=particles.getMapCtr();
 #endif
-        SupportBuilder<vector_type> supportBuilder(particles, differentialSignature, rCut);
-        unsigned int requiredSupportSize = monomialBasis.size();
-
-        if (!isSharedSupport)
-            supportRefs.resize(particles.size_local_orig());
-        localEps.resize(particles.size_local_orig());
-        localEpsInvPow.resize(particles.size_local_orig());
-        kerOffsets.resize(particles.size_local_orig()+1);
-
-        // need to resize supportKeys1D to yet unknown supportKeysTotalN
-        // add() takes too long
-        openfpm::vector<openfpm::vector<size_t>> tempSupportKeys(supportRefs.size());
-        const T condVTOL = 1e2;
 
-        auto it = particles.getDomainIterator();
-        while (it.isNext()) {
-            size_t sz;
-            auto key_o = particles.getOriginKey(it.get());
+        if (!isSharedSupport) {
+            subsetKeyPid.resize(particles.size_local_orig());
+            supportRefs.resize(particles.size_local());
+        }
+        localEps.resize(particles.size_local());
+        localEpsInvPow.resize(particles.size_local());
+        kerOffsets.resize(particles.size_local()+1);
 
+        const T condVTOL = 1e2;
 
-            if (!isSharedSupport){
+        if (!isSharedSupport) {
+            SupportBuilder<vector_type> supportBuilder(particles, differentialSignature, rCut);
+            unsigned int requiredSupportSize = monomialBasis.size();
+            // need to resize supportKeys1D to yet unknown supportKeysTotalN
+            // add() takes too long
+            openfpm::vector<openfpm::vector<size_t>> tempSupportKeys(supportRefs.size());
+
+            auto it = particles.getDomainIterator();
+            while (it.isNext()) {
                 auto key_o = particles.getOriginKey(it.get());
+                subsetKeyPid.get(key_o.getKey()) = it.get().getKey();
 
                 Support support = supportBuilder.getSupport(it, requiredSupportSize, opt);
-                supportRefs.get(key_o.getKey()) = support.getReferencePointKey();
+                supportRefs.get(key_o.getKey()) = key_o.getKey();
                 tempSupportKeys.get(key_o.getKey()) = support.getKeys();
                 kerOffsets.get(key_o.getKey()) = supportKeysTotalN;
 
-                if (maxSupportSize < support.size()) maxSupportSize = support.size();
+                if (maxSupportSize < support.size())
+                    maxSupportSize = support.size();
                 supportKeysTotalN += support.size();
 
-
                 EMatrix<T, Eigen::Dynamic, Eigen::Dynamic> V(support.size(), monomialBasis.size());
                 // Vandermonde matrix computation
                 Vandermonde<dim, T, EMatrix<T, Eigen::Dynamic, Eigen::Dynamic>>
@@ -605,11 +605,10 @@ private:
                     std::cout << "INFO: Increasing, requiredSupportSize = " << requiredSupportSize << std::endl; // debug
                     continue;
                 } else requiredSupportSize = monomialBasis.size();
+
+                ++it;
             }
-            ++it;
-        }
 
-        if (!isSharedSupport){
             kerOffsets.get(supportRefs.size()) = supportKeysTotalN;
             supportKeys1D.resize(supportKeysTotalN);
 
@@ -634,51 +633,49 @@ private:
         this->supportSizeFactor=supportSizeFactor;
         this->convergenceOrder=convergenceOrder;
 
-        if (!isSharedSupport)
-            supportRefs.resize(particles.size_local_orig());
-        localEps.resize(particles.size_local_orig());
-        localEpsInvPow.resize(particles.size_local_orig());
+        if (!isSharedSupport) {
+            subsetKeyPid.resize(particles.size_local_orig());
+            supportRefs.resize(particles.size_local());
+        }
+        localEps.resize(particles.size_local());
+        localEpsInvPow.resize(particles.size_local());
 
 std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
         auto it = particles.getDomainIterator();
 
         if (opt==support_options::RADIUS) {
-            while (it.isNext()) {
-                auto key_o = particles.getOriginKey(it.get());
-
-                if (!isSharedSupport)
+            if (!isSharedSupport) {
+                while (it.isNext()) {
+                    auto key_o = it.get(); subsetKeyPid.get(particles.getOriginKey(key_o).getKey()) = key_o.getKey();
                     supportRefs.get(key_o.getKey()) = key_o.getKey();
-                ++it;
-            }
-
-            SupportBuilderGPU<vector_type> supportBuilder(particles, differentialSignature, rCut);
-            supportBuilder.getSupport(supportRefs.size(), kerOffsets, supportKeys1D, maxSupportSize, supportKeysTotalN);
+                    ++it;
+                }
 
+                SupportBuilderGPU<vector_type> supportBuilder(particles, rCut);
+                supportBuilder.getSupport(supportRefs.size(), kerOffsets, supportKeys1D, maxSupportSize, supportKeysTotalN);
+            }
         } else {
-            size_t requiredSupportSize = monomialBasis.size() * supportSizeFactor;
-            SupportBuilder<vector_type> supportBuilder(particles, differentialSignature, rCut);
-
-            kerOffsets.resize(supportRefs.size()+1);
-            // need to resize supportKeys1D to yet unknown supportKeysTotalN
-            // add() takes too long
-            openfpm::vector<openfpm::vector<size_t>> tempSupportKeys(supportRefs.size());
+            if (!isSharedSupport){
+                openfpm::vector<openfpm::vector<size_t>> tempSupportKeys(supportRefs.size());
+                size_t requiredSupportSize = monomialBasis.size() * supportSizeFactor;
+                // need to resize supportKeys1D to yet unknown supportKeysTotalN
+                // add() takes too long
+                SupportBuilder<vector_type> supportBuilder(particles, differentialSignature, rCut);
+                kerOffsets.resize(supportRefs.size()+1);
 
-            while (it.isNext()) {
-                if (!isSharedSupport){
-                    auto key_o = particles.getOriginKey(it.get());
+                while (it.isNext()) {
+                    auto key_o = it.get(); subsetKeyPid.get(particles.getOriginKey(key_o).getKey()) = key_o.getKey();
 
                     Support support = supportBuilder.getSupport(it, requiredSupportSize, opt);
-                    supportRefs.get(key_o.getKey()) = support.getReferencePointKey();
+                    supportRefs.get(key_o.getKey()) = key_o.getKey();
                     tempSupportKeys.get(key_o.getKey()) = support.getKeys();
                     kerOffsets.get(key_o.getKey()) = supportKeysTotalN;
 
                     if (maxSupportSize < support.size()) maxSupportSize = support.size();
                     supportKeysTotalN += support.size();
+                    ++it;
                 }
-                ++it;
-            }
 
-            if (!isSharedSupport){
                 kerOffsets.get(supportRefs.size()) = supportKeysTotalN;
                 supportKeys1D.resize(supportKeysTotalN);
 
@@ -710,52 +707,49 @@ std::cout << "Support building took " << time_span2.count() * 1000. << " millise
         this->supportSizeFactor=supportSizeFactor;
         this->convergenceOrder=convergenceOrder;
 
-        if (!isSharedSupport)
-            supportRefs.resize(particles.size_local_orig());
-
-        localEps.resize(particles.size_local_orig());
-        localEpsInvPow.resize(particles.size_local_orig());
+        if (!isSharedSupport) {
+            subsetKeyPid.resize(particles.size_local_orig());
+            supportRefs.resize(particles.size_local());
+        }
+        localEps.resize(particles.size_local());
+        localEpsInvPow.resize(particles.size_local());
 
 std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
         auto it = particles.getDomainIterator();
 
         if (opt==support_options::RADIUS) {
-            while (it.isNext()) {
-                auto key_o = particles.getOriginKey(it.get());
-
-                if (!isSharedSupport)
+            if (!isSharedSupport) {
+                while (it.isNext()) {
+                    auto key_o = it.get(); subsetKeyPid.get(particles.getOriginKey(key_o).getKey()) = key_o.getKey();
                     supportRefs.get(key_o.getKey()) = key_o.getKey();
-                ++it;
-            }
-
-            SupportBuilderGPU<vector_type> supportBuilder(particles, differentialSignature, rCut);
-            supportBuilder.getSupport(supportRefs.size(), kerOffsets, supportKeys1D, maxSupportSize, supportKeysTotalN);
+                    ++it;
+                }
 
+                SupportBuilderGPU<vector_type> supportBuilder(particles, rCut);
+                supportBuilder.getSupport(supportRefs.size(), kerOffsets, supportKeys1D, maxSupportSize, supportKeysTotalN);
+            }
         } else {
-            size_t requiredSupportSize = monomialBasis.size() * supportSizeFactor;
-            SupportBuilder<vector_type> supportBuilder(particles, differentialSignature, rCut);
-
-            kerOffsets.resize(supportRefs.size()+1);
-            // need to resize supportKeys1D to yet unknown supportKeysTotalN
-            // add() takes too long
-            openfpm::vector<openfpm::vector<size_t>> tempSupportKeys(supportRefs.size());
+            if (!isSharedSupport){
+                openfpm::vector<openfpm::vector<size_t>> tempSupportKeys(supportRefs.size());
+                size_t requiredSupportSize = monomialBasis.size() * supportSizeFactor;
+                // need to resize supportKeys1D to yet unknown supportKeysTotalN
+                // add() takes too long
+                SupportBuilder<vector_type> supportBuilder(particles, differentialSignature, rCut);
+                kerOffsets.resize(supportRefs.size()+1);
 
-            while (it.isNext()) {
-                if (!isSharedSupport){
-                    auto key_o = particles.getOriginKey(it.get());
+                while (it.isNext()) {
+                    auto key_o = it.get(); subsetKeyPid.get(particles.getOriginKey(key_o).getKey()) = key_o.getKey();
 
                     Support support = supportBuilder.getSupport(it, requiredSupportSize, opt);
-                    supportRefs.get(key_o.getKey()) = support.getReferencePointKey();
+                    supportRefs.get(key_o.getKey()) = key_o.getKey();
                     tempSupportKeys.get(key_o.getKey()) = support.getKeys();
                     kerOffsets.get(key_o.getKey()) = supportKeysTotalN;
 
                     if (maxSupportSize < support.size()) maxSupportSize = support.size();
                     supportKeysTotalN += support.size();
+                    ++it;
                 }
-                ++it;
-            }
 
-            if (!isSharedSupport){
                 kerOffsets.get(supportRefs.size()) = supportKeysTotalN;
                 supportKeys1D.resize(supportKeysTotalN);
 
@@ -793,9 +787,7 @@ std::cout << "Support building took " << time_span2.count() * 1000. << " millise
         size_t numThreads = numSMs*numSMsMult*256;
         std::cout << "numThreads " << numThreads << " numMatrices " << numMatrices << std::endl;
 
-        openfpm::vector_custd<T> EMat(numThreads * maxSupportSize * dim);
-        openfpm::vector_custd<T> VMat(numThreads * maxSupportSize * monomialBasisSize);
-        // B has the same dimensions as V
+        // B is an intermediate matrix
         openfpm::vector_custd<T> BMat(numThreads * maxSupportSize * monomialBasisSize);
         // allocate device space for A, b
         openfpm::vector_custd<T> AMat(numMatrices*monomialBasisSize*monomialBasisSize);
@@ -826,7 +818,7 @@ std::cout << "Support building took " << time_span2.count() * 1000. << " millise
         auto bVecPointersKernel = bVecPointers.toKernel(); T** bVecPointersKernelPointer = (T**) bVecPointersKernel.getPointer();
 
         assembleLocalMatrices_gpu<<<numSMsMult*numSMs, 256>>>(particles.toKernel(), differentialSignature, differentialOrder, monomialBasisKernel, supportRefs.toKernel(), kerOffsets.toKernel(), supportKeys1D.toKernel(),
-            AMatPointersKernelPointer, bVecPointersKernelPointer, localEps.toKernel(), localEpsInvPow.toKernel(), EMat.toKernel(), VMat.toKernel(), BMat.toKernel(), numMatrices, maxSupportSize);
+            AMatPointersKernelPointer, bVecPointersKernelPointer, localEps.toKernel(), localEpsInvPow.toKernel(), BMat.toKernel(), numMatrices, maxSupportSize);
 
         localEps.template deviceToHost();
         localEpsInvPow.template deviceToHost();
@@ -877,9 +869,8 @@ std::cout << "Support building took " << time_span2.count() * 1000. << " millise
     }
 
     T computeKernel(Point<dim, T> x, EMatrix<T, Eigen::Dynamic, 1> & a) const {
-        T res = 0;
         unsigned int counter = 0;
-        T expFactor = exp(-norm2(x));
+        T res = 0, expFactor = exp(-norm2(x));
 
         size_t N = monomialBasis.getElements().size();
         for (size_t i = 0; i < N; ++i)
@@ -898,9 +889,8 @@ std::cout << "Support building took " << time_span2.count() * 1000. << " millise
     // template <unsigned int a_dim>
     // T computeKernel(Point<dim, T> x, const T (& a) [a_dim]) const {
     T computeKernel(Point<dim, T> x, const T* a) const {
-        T res = 0;
         unsigned int counter = 0;
-        T expFactor = exp(-norm2(x));
+        T res = 0, expFactor = exp(-norm2(x));
 
         size_t N = monomialBasis.getElements().size();
         for (size_t i = 0; i < N; ++i)
@@ -938,21 +928,13 @@ template<unsigned int dim, typename T, typename particles_type, typename monomia
 __global__ void assembleLocalMatrices_gpu(
         particles_type particles, Point<dim, unsigned int> differentialSignature, unsigned int differentialOrder, monomialBasis_type monomialBasis, 
         supportKey_type supportRefs, supportKey_type kerOffsets, supportKey_type supportKeys1D, T** h_A, T** h_b, localEps_type localEps, localEps_type localEpsInvPow,
-        matrix_type EMat, matrix_type VMat, matrix_type BMat, size_t numMatrices, size_t maxSupportSize)
+        matrix_type BMat, size_t numMatrices, size_t maxSupportSize)
     {
     auto p_key = GET_PARTICLE(particles);
     size_t monomialBasisSize = monomialBasis.size();
-
-    size_t EStartPos = maxSupportSize * dim * p_key;
-    size_t VStartPos = maxSupportSize * monomialBasisSize * p_key;
-    size_t BStartPos = maxSupportSize * monomialBasisSize * p_key;
-
-    T* V = &((T*)VMat.getPointer())[VStartPos];
-    T* E = &((T*)EMat.getPointer())[EStartPos];
-    T* B = &((T*)BMat.getPointer())[BStartPos];
-
-    DcpseDiagonalScalingMatrix<dim, MonomialBasis<dim, aggregate<Monomial_gpu<dim>>, openfpm::vector_custd_ker, memory_traits_inte>> diagonalScalingMatrix(monomialBasis);
-    DcpseRhs_gpu<dim, MonomialBasis<dim, aggregate<Monomial_gpu<dim>>, openfpm::vector_custd_ker, memory_traits_inte>> rhs(monomialBasis, differentialSignature);
+    size_t BStartPos = maxSupportSize * monomialBasisSize * p_key; T* B = &((T*)BMat.getPointer())[BStartPos];
+    const auto& basisElements = monomialBasis.getElements();
+    int rhsSign = (Monomial_gpu<dim>(differentialSignature).order() % 2 == 0) ? 1 : -1;
 
     for (; 
         p_key < numMatrices; 
@@ -964,22 +946,33 @@ __global__ void assembleLocalMatrices_gpu(
         size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(p_key)];
         size_t  xpK = supportRefs.get(p_key);
 
-        // Vandermonde matrix computation
-        // Pointer to E is passed to reuse memory for offset construction inside Vandermonde. 
-        Vandermonde_gpu<dim, T, MonomialBasis<dim, aggregate<Monomial_gpu<dim>>, openfpm::vector_custd_ker, memory_traits_inte>>
-                vandermonde(E, xpK, supportKeysSize, supportKeys, monomialBasis, particles);
-        vandermonde.getMatrix(V);
+    assert(supportKeysSize >= monomialBasis.size());
 
-        T eps = vandermonde.getEps(); localEps.get(p_key) = eps;
-        localEpsInvPow.get(p_key) = 1.0 / pow(eps,differentialOrder);
+        T FACTOR = 2, avgNeighbourSpacing = 0;
+        for (int i = 0 ; i < supportKeysSize; i++) {
+            Point<dim,T> off = xa; off -= particles.getPosOrig(supportKeys[i]);
+            for (size_t j = 0; j < dim; ++j)
+                avgNeighbourSpacing += fabs(off.value(j));
+        }
+
+        avgNeighbourSpacing /= supportKeysSize;
+        T eps = FACTOR * avgNeighbourSpacing;
 
-        diagonalScalingMatrix.buildMatrix(E, xpK, supportKeysSize, supportKeys, eps, particles);
+    assert(eps != 0);
+
+        localEps.get(p_key) = eps;
+        localEpsInvPow.get(p_key) = 1.0 / pow(eps,differentialOrder);
 
         // EMatrix<T, Eigen::Dynamic, Eigen::Dynamic> B = E * V;
         for (int i = 0; i < supportKeysSize; ++i)
-            for (int j = 0; j < monomialBasisSize; ++j)
-                // E is a diagonal matrix
-                B[i*monomialBasisSize+j] = E[i] * V[i*monomialBasisSize+j];
+            for (int j = 0; j < monomialBasisSize; ++j) {
+                Point<dim,T> off = xa; off -= particles.getPosOrig(supportKeys[i]);
+                const Monomial_gpu<dim>& m = basisElements.get(j);
+
+                T V_ij = m.evaluate(off) / pow(eps, m.order());
+                T E_ii = exp(- norm2(off) / (2.0 * eps * eps));
+                B[i*monomialBasisSize+j] = E_ii * V_ij;
+            }
 
         T sum = 0.0;
         // EMatrix<T, Eigen::Dynamic, Eigen::Dynamic> A = B.transpose() * B;
@@ -992,7 +985,10 @@ __global__ void assembleLocalMatrices_gpu(
             }
 
         // Compute RHS vector b
-        rhs.template getVector<T>(h_b[p_key]);
+        for (size_t i = 0; i < monomialBasisSize; ++i) {
+            const Monomial_gpu<dim>& dm = basisElements.get(i).getDerivative(differentialSignature);
+            h_b[p_key][i] = rhsSign * dm.evaluate(Point<dim, T>(0));
+        }
     }
 }
 
@@ -1004,43 +1000,32 @@ __global__ void calcKernels_gpu(particles_type particles, monomialBasis_type mon
     Point<dim, T> xa = particles.getPos(p_key);
 
     size_t  monomialBasisSize = monomialBasis.size();
+    const auto& basisElements = monomialBasis.getElements();
     size_t  supportKeysSize = kerOffsets.get(p_key+1)-kerOffsets.get(p_key);
     size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(p_key)];
+
     T* calcKernelsLocal = &((T*)calcKernels.getPointer())[kerOffsets.get(p_key)];
     T eps = localEps.get(p_key);
 
     for (size_t j = 0; j < supportKeysSize; ++j)
     {
         size_t xqK = supportKeys[j];
-        Point<dim, T> xq = particles.getPos(xqK);
-        Point<dim, T> normalizedArg = (xa - xq) / eps;
+        Point<dim, T> xq = particles.getPosOrig(xqK);
+        Point<dim, T> offNorm = (xa - xq) / eps;
+        T expFactor = exp(-norm2(offNorm));
 
-        calcKernelsLocal[j] = computeKernel_gpu(normalizedArg, h_b[p_key], monomialBasis);
-    }
-}
-
-template <unsigned int dim, typename T, typename monomialBasis_type>
-__device__ T computeKernel_gpu(Point<dim, T>& x, const T* a, const monomialBasis_type& monomialBasis) {
-    T res = 0;
-    unsigned int counter = 0;
-    T expFactor = exp(-norm2(x));
-
-    const auto& basisElements = monomialBasis.getElements();
-
-    size_t N = basisElements.size();
-    for (size_t i = 0; i < N; ++i)
-    {
-        const Monomial_gpu<dim> &m = basisElements.get(i);
+        T res = 0;
+        for (size_t i = 0; i < monomialBasisSize; ++i) {
+            const Monomial_gpu<dim> &m = basisElements.get(i);
+            T mbValue = m.evaluate(offNorm);
+            T coeff = h_b[p_key][i];
 
-        T coeff = a[counter];
-        T mbValue = m.evaluate(x);
-        res += coeff * mbValue * expFactor;
-        ++counter;
+            res += coeff * mbValue * expFactor;
+        }
+        calcKernelsLocal[j] = res;
     }
-    return res;
 }
 
-
 #endif
 #endif //OPENFPM_PDATA_DCPSE_CUH
 
diff --git a/src/DCPSE/DcpseDiagonalScalingMatrix.hpp b/src/DCPSE/DcpseDiagonalScalingMatrix.hpp
index afa4a13a43861065d5c911266d91dfc24738ef6d..eefe15b091cfb8f869168410bfe933cb38afdf5b 100644
--- a/src/DCPSE/DcpseDiagonalScalingMatrix.hpp
+++ b/src/DCPSE/DcpseDiagonalScalingMatrix.hpp
@@ -17,7 +17,7 @@ private:
     const monomialBasis_type& monomialBasis;
 
 public:
-    __host__ __device__ DcpseDiagonalScalingMatrix(const monomialBasis_type &monomialBasis) : monomialBasis(monomialBasis) {}
+    DcpseDiagonalScalingMatrix(const monomialBasis_type &monomialBasis) : monomialBasis(monomialBasis) {}
 
     template <typename T, typename MatrixType, typename vector_type, typename vector_type2>
     void buildMatrix(MatrixType &M, Support support, T eps, vector_type & particlesFrom , vector_type2 & particlesTo)
diff --git a/src/DCPSE/DcpseRhs.cuh b/src/DCPSE/DcpseRhs.cuh
deleted file mode 100644
index 68dbdb93098951246ca6ca4bc7d4de2c511cf749..0000000000000000000000000000000000000000
--- a/src/DCPSE/DcpseRhs.cuh
+++ /dev/null
@@ -1,49 +0,0 @@
-//
-// Created by Serhii
-//
-
-#ifndef OPENFPM_PDATA_DCPSERHS_CUH
-#define OPENFPM_PDATA_DCPSERHS_CUH
-
-#include "MonomialBasis.hpp"
-
-
-template<unsigned int dim, typename MonomialBasis_type>
-class DcpseRhs_gpu
-{
-private:
-    const Point<dim, unsigned int>& differentialSignature;
-    const MonomialBasis_type& monomialBasis;
-    int sign;
-public:
-    __host__ __device__ DcpseRhs_gpu(const MonomialBasis_type &monomialBasis, const Point<dim, unsigned int> &differentialSignature);
-
-    template<typename T>
-    __host__ __device__ void getVector(T* b);
-};
-
-
-template<unsigned int dim, typename MonomialBasis_type>
-__host__ __device__ DcpseRhs_gpu<dim, MonomialBasis_type>::DcpseRhs_gpu(const MonomialBasis_type &monomialBasis,
-                        const Point<dim, unsigned int> &differentialSignature)
-        : monomialBasis(monomialBasis), differentialSignature(differentialSignature) {
-    unsigned int order = (Monomial_gpu<dim>(differentialSignature)).order();
-    if (order % 2 == 0)
-        sign = 1;
-    else
-        sign = -1;
-}
-
-template<unsigned int dim, typename MonomialBasis_type>
-template<typename T>
-__host__ __device__ void DcpseRhs_gpu<dim, MonomialBasis_type>::getVector(T* b)
-{
-    const auto& basisElements = monomialBasis.getElements();
-
-    for (size_t i = 0; i < basisElements.size(); ++i) {
-        const Monomial_gpu<dim>& dm = basisElements.get(i).getDerivative(differentialSignature);
-        b[i] = sign * dm.evaluate(Point<dim, T>(0));
-    }
-}
-
-#endif //OPENFPM_PDATA_DCPSERHS_CUH
diff --git a/src/DCPSE/SupportBuilder.cuh b/src/DCPSE/SupportBuilder.cuh
index 19d007658d44b92af0c8c6cd1581c375b37c8ed0..363741b7bb7b0d2c49fd4b0305198263ffb44e1b 100644
--- a/src/DCPSE/SupportBuilder.cuh
+++ b/src/DCPSE/SupportBuilder.cuh
@@ -53,7 +53,7 @@ __global__ void gatherSupportSize_gpu(
             size_t el = cl.get(cellLinId, k);
 
             if (p_key == el) continue;
-            if (pos.distance(particles.getPos(el)) < rCut) ++N;
+            if (pos.distance(particles.getPosOrig(el)) < rCut) ++N;
         }
 
     } while (nextCell<dim>(offset, 2+1));
@@ -90,7 +90,7 @@ __global__ void assembleSupport_gpu(particles_type particles, CellList_type cl,
             size_t el = cl.get(cellLinId, k);
 
             if (p_key == el) continue;
-            if (pos.distance(particles.getPos(el)) < rCut) supportKeys[N++] = el;
+            if (pos.distance(particles.getPosOrig(el)) < rCut) supportKeys[N++] = el;
         }
 
     } while (nextCell<dim>(offset, 2+1));
@@ -103,22 +103,22 @@ class SupportBuilderGPU
 {
 private:
     vector_type &domain;
-    const Point<vector_type::dims, unsigned int> differentialSignature;
     typename vector_type::stype rCut;
 
 public:
-    SupportBuilderGPU(vector_type &domain, Point<vector_type::dims, unsigned int> differentialSignature, typename vector_type::stype rCut)
-        : domain(domain), differentialSignature(differentialSignature), rCut(rCut) {}
-
-    SupportBuilderGPU(vector_type &domain, unsigned int differentialSignature[vector_type::dims], typename vector_type::stype rCut) 
-        : SupportBuilderGPU(domain, Point<vector_type::dims, unsigned int>(differentialSignature), rCut) {}
+    SupportBuilderGPU(vector_type &domain, typename vector_type::stype rCut)
+        : domain(domain), rCut(rCut) {}
 
     void getSupport(size_t N, openfpm::vector_custd<size_t>& kerOffsets, openfpm::vector_custd<size_t>& supportKeys1D, 
         size_t& maxSupport, size_t& supportKeysTotalN)
     {
         domain.hostToDevicePos();
         auto it = domain.getDomainIteratorGPU(512);
-        auto NN = domain.getCellListGPU(rCut);
+        typedef CellList_gen<vector_type::dims, typename vector_type::stype, Process_keys_lin, Mem_fast<CudaMemory>, shift<vector_type::dims, typename vector_type::stype>> params;
+        // auto NN = domain.getCellListGPU(rCut);
+        auto NN = domain.template getCellList<params>(rCut);
+        NN.hostToDevice();
+
 
         // +1 to allow getting size from cumulative sum: "size[i+1] - size[i]"
         kerOffsets.resize(N+1);
diff --git a/src/DCPSE/Vandermonde.cuh b/src/DCPSE/Vandermonde.cuh
deleted file mode 100644
index fed9a8f602b1d90f13a3b4996839fdbbed35bd02..0000000000000000000000000000000000000000
--- a/src/DCPSE/Vandermonde.cuh
+++ /dev/null
@@ -1,119 +0,0 @@
-//
-// Created by Serhii on 15/05/21
-// 
-
-
-#ifndef OPENFPM_PDATA_VANDERMONDE_CUH
-#define OPENFPM_PDATA_VANDERMONDE_CUH
-
-#include "MonomialBasis.hpp"
-#include "VandermondeRowBuilder.hpp"
-#include "Support.hpp"
-
-template<unsigned int dim, typename T, typename MonomialBasis_type = MonomialBasis<dim>>
-class Vandermonde_gpu
-{
-private:
-    size_t N;
-    T* offsets;
-    const Point<dim, T> point;
-    const MonomialBasis_type& monomialBasis;
-    T eps;
-
-public:
-    template<typename vector_type>
-    __host__ __device__ Vandermonde_gpu( T* mem, size_t supportRefKey, size_t supportKeysSize,
-        const size_t* supportKeys, const MonomialBasis_type &monomialBasis, const vector_type & particles)
-            : offsets(mem), monomialBasis(monomialBasis), point(particles.getPos(supportRefKey))
-    {
-        initialize(supportRefKey, supportKeysSize, supportKeys, particles);
-    }
-
-    __host__ __device__ void getMatrix(T *M)
-    {
-        // Build the Vandermonde_gpu matrix, row-by-row
-        VandermondeRowBuilder<dim, T, MonomialBasis_type> vrb(monomialBasis);
-        unsigned int row = 0;
-
-        for (size_t i = 0; i < N; ++i)
-        {
-            const T(*offset) [dim] = reinterpret_cast<const T(*) [dim]>(&offsets[i*dim]);
-            vrb.buildRow_gpu(&M[i*monomialBasis.size()], row, *offset, eps);
-            ++row;
-        }
-    }
-
-    __host__ __device__ T getEps()
-    {
-        return eps;
-    }
-
-
-    __host__ __device__ ~Vandermonde_gpu()
-    {
-    }
-
-private:
-
-
-    __host__ __device__ void computeEps(T factor)
-    {
-        T avgNeighbourSpacing = 0;
-        for (size_t i = 0; i < N; ++i)
-        {
-            const T(*offset) [dim] = reinterpret_cast<const T(*) [dim]>(&offsets[i*dim]);
-            avgNeighbourSpacing += computeAbsSum(*offset);
-
-        }
-        avgNeighbourSpacing /= N;
-        eps = factor * avgNeighbourSpacing;
-        assert(eps != 0);
-    }
-
-    __host__ __device__ static T computeAbsSum(const Point<dim, T> &x)
-    {
-        T absSum = 0;
-        for (unsigned int i = 0; i < dim; ++i)
-        {
-            absSum += fabs(x.value(i));
-        }
-        return absSum;
-    }
-
-    __host__ __device__ static T computeAbsSum(const T (& x) [dim])
-    {
-        T absSum = 0;
-        for (unsigned int i = 0; i < dim; ++i)
-        {
-            absSum += fabs(x[i]);
-        }
-        return absSum;
-    }
-
-
-    template<typename vector_type>
-    __host__ __device__ void initialize(size_t supportRefKey, size_t supportKeysSize, const size_t* supportKeys, const vector_type & particles)
-    {
-        N = supportKeysSize;
-
-        for (int i = 0 ; i < N ; i++)
-        {
-            size_t otherKey = supportKeys[i];
-            Point<dim,T> p = particles.getPos(supportRefKey);
-            const auto& p2 = particles.getPos(otherKey);
-
-            p -= p2;
-
-            for (size_t j = 0; j < dim; j++) {
-                offsets[i*dim+j] = p[j];
-                auto diff = p[j];
-            }
-        }
-
-        computeEps(2);
-    }
-
-};
-
-
-#endif //OPENFPM_PDATA_VANDERMONDE_CUH
diff --git a/src/DCPSE/VandermondeRowBuilder.hpp b/src/DCPSE/VandermondeRowBuilder.hpp
index c4da47fcd22d6392393ae6f77d87806c244981ea..2de39fa06e9a9e8e68d54c438350865dd206af46 100644
--- a/src/DCPSE/VandermondeRowBuilder.hpp
+++ b/src/DCPSE/VandermondeRowBuilder.hpp
@@ -15,12 +15,10 @@ private:
     const MonomialBasis_type& monomialBasis;
 
 public:
-    __host__ __device__ VandermondeRowBuilder(const MonomialBasis_type &monomialBasis) : monomialBasis(monomialBasis) {}
+    VandermondeRowBuilder(const MonomialBasis_type &monomialBasis) : monomialBasis(monomialBasis) {}
 
     template <typename MatrixType>
     void buildRow(MatrixType &M, unsigned int row, Point<dim, T> x, T eps);
-
-    __host__ __device__ void buildRow_gpu(T *M, unsigned int row, const T (& x) [dim], T eps);
 };
 
 template<unsigned int dim, typename T,typename MonomialBasis_type>
@@ -37,18 +35,5 @@ void VandermondeRowBuilder<dim, T, MonomialBasis_type>::buildRow(MatrixType &M,
     }
 }
 
-template<unsigned int dim, typename T, typename MonomialBasis_type>
-__host__ __device__ void VandermondeRowBuilder<dim, T, MonomialBasis_type>::buildRow_gpu(T* M, unsigned int row, const T (& x) [dim], T eps)
-{
-    // M is a pointer to the row being filled
-    const auto& basisElements = monomialBasis.getElements();
-
-    for (size_t col = 0; col < basisElements.size(); ++col)
-    {
-        const Monomial_gpu<dim>& m = basisElements.get(col);
-        M[col] = m.evaluate(x) / pow(eps, m.order());
-    }
-}
-
 
 #endif //OPENFPM_PDATA_VANDERMONDEROW_HPP
diff --git a/src/Operators/Vector/vector_dist_operators.hpp b/src/Operators/Vector/vector_dist_operators.hpp
index 0eb79d6af216fa33036970269397106b55288916..7c3519222e74447142b2a49230d96de0fa3e0abc 100644
--- a/src/Operators/Vector/vector_dist_operators.hpp
+++ b/src/Operators/Vector/vector_dist_operators.hpp
@@ -9,6 +9,7 @@
 #define OPENFPM_NUMERICS_SRC_OPERATORS_VECTOR_VECTOR_DIST_OPERATORS_HPP_
 
 #include "Vector/vector_dist.hpp"
+#include "Vector/vector_dist_subset.hpp"
 #include "lib/pdata.hpp"
 #include "cuda/vector_dist_operators_cuda.cuh"
 
@@ -980,7 +981,7 @@ public:
 	 * \return the result of the expression
 	 *
 	 */
-	__device__ __host__ inline auto value(const unsigned int & k) const -> decltype(pos_or_propR<vector,prp>::value(v.v,k))
+	__host__ __device__ inline auto value(const unsigned int & k) const -> decltype(pos_or_propR<vector,prp>::value(v.v,k))
 	{
 		return pos_or_propR<vector,prp>::value(v.v,k);
 	}
@@ -992,7 +993,7 @@ public:
 	 * \return the result of the expression
 	 *
 	 */
-	__device__ __host__ inline auto value(const unsigned int & k) -> decltype(pos_or_propR<vector,prp>::value(v.v,k))
+	__host__ __device__  inline auto value(const unsigned int & k) -> decltype(pos_or_propR<vector,prp>::value(v.v,k))
 	{
 		return pos_or_propR<vector,prp>::value(v.v,k);
 	}