diff --git a/openfpm_data b/openfpm_data
index 6f81e335b5de00f03ad517e056925ea4ec52392f..9fa3b84aa906f694ccc410f6a8536ba77a6437be 160000
--- a/openfpm_data
+++ b/openfpm_data
@@ -1 +1 @@
-Subproject commit 6f81e335b5de00f03ad517e056925ea4ec52392f
+Subproject commit 9fa3b84aa906f694ccc410f6a8536ba77a6437be
diff --git a/openfpm_vcluster b/openfpm_vcluster
index a99918127f5835c31d2df4e9020efdeb46d07d66..faa1d114c2d13e562d200c92e98c1ed7be306eeb 160000
--- a/openfpm_vcluster
+++ b/openfpm_vcluster
@@ -1 +1 @@
-Subproject commit a99918127f5835c31d2df4e9020efdeb46d07d66
+Subproject commit faa1d114c2d13e562d200c92e98c1ed7be306eeb
diff --git a/src/Vector/vector_dist.hpp b/src/Vector/vector_dist.hpp
index fd5155bcb3a9d2e6f373758224992dd8eb8259e7..6135c43db9c9e1cbc251ad8df2fe1d37f9c036af 100644
--- a/src/Vector/vector_dist.hpp
+++ b/src/Vector/vector_dist.hpp
@@ -130,11 +130,17 @@ struct gcl<dim,St,CellList_gen<dim, St, Process_keys_hilb,Mem_type, shift<dim, S
  * \tparam prop properties the vector element store in OpenFPM data structure format
  * \tparam Decomposition Decomposition strategy to use CartDecomposition ...
  * \tparam Memory Memory pool where store the information HeapMemory ...
+ * \tparam layout_base Memory layout of the internal vectors (default memory_traits_lin)
  *
  */
 
-template<unsigned int dim, typename St, typename prop, typename Decomposition = CartDecomposition<dim,St>, typename Memory = HeapMemory>
-class vector_dist : public vector_dist_comm<dim,St,prop,Decomposition,Memory>
+template<unsigned int dim,
+         typename St,
+         typename prop,
+         typename Decomposition = CartDecomposition<dim,St>,
+         typename Memory = HeapMemory,
+         template<typename> class layout_base = memory_traits_lin>
+class vector_dist : public vector_dist_comm<dim,St,prop,Decomposition,Memory,layout_base>
 {
 public:
 
@@ -151,11 +157,11 @@ private:
 
 	//! Particle position vector, (It has 2 elements) the first has real particles assigned to a processor
 	//! the second element contain unassigned particles
-	openfpm::vector<Point<dim, St>> v_pos;
+	openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> v_pos;
 
 	//! Particle properties vector, (It has 2 elements) the first has real particles assigned to a processor
 	//! the second element contain unassigned particles
-	openfpm::vector<prop> v_prp;
+	openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> v_prp;
 
 	//! Virtual cluster
 	Vcluster & v_cl;
@@ -2115,4 +2121,6 @@ public:
 };
 
 
+template<unsigned int dim, typename St, typename prop, typename Decomposition = CartDecomposition<dim,St>> using vector_dist_gpu = vector_dist<dim,St,prop,Decomposition,CudaMemory,memory_traits_inte>; // alias: vector_dist with Memory = CudaMemory and layout_base = memory_traits_inte
+
 #endif /* VECTOR_HPP_ */
diff --git a/src/Vector/vector_dist_comm.hpp b/src/Vector/vector_dist_comm.hpp
index 758f51c070f79820ee183fc4cc42f760f0df9d34..b6acc16b43d927fdefe64581451f9dea404dc655 100644
--- a/src/Vector/vector_dist_comm.hpp
+++ b/src/Vector/vector_dist_comm.hpp
@@ -42,7 +42,12 @@ inline static size_t compute_options(size_t opt)
  *
  */
 
-template<unsigned int dim, typename St, typename prop, typename Decomposition = CartDecomposition<dim,St>, typename Memory = HeapMemory>
+template<unsigned int dim,
+         typename St,
+         typename prop,
+         typename Decomposition = CartDecomposition<dim,St>,
+         typename Memory = HeapMemory,
+         template<typename> class layout_base = memory_traits_lin>
 class vector_dist_comm
 {
 	//! Number of units for each sub-domain
@@ -578,7 +583,11 @@ class vector_dist_comm
 	 * \param m_prp sending buffer for properties
 	 *
 	 */
-	void fill_send_map_buf(openfpm::vector<Point<dim, St>> & v_pos, openfpm::vector<prop> & v_prp, openfpm::vector<size_t> & prc_sz_r, openfpm::vector<openfpm::vector<Point<dim,St>>> & m_pos, openfpm::vector<openfpm::vector<prop>> & m_prp)
+	void fill_send_map_buf(openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
+			               openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
+			               openfpm::vector<size_t> & prc_sz_r,
+			               openfpm::vector<openfpm::vector<Point<dim,St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base>> & m_pos,
+			               openfpm::vector<openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base>> & m_prp)
 	{
 		m_prp.resize(prc_sz_r.size());
 		m_pos.resize(prc_sz_r.size());
@@ -658,7 +667,9 @@ class vector_dist_comm
 	 * \param prc_sz For each processor the number of particles to send
 	 *
 	 */
-	template<typename obp> void labelParticleProcessor(openfpm::vector<Point<dim, St>> & v_pos, openfpm::vector<aggregate<size_t,size_t,size_t>> & lbl_p, openfpm::vector<size_t> & prc_sz)
+	template<typename obp> void labelParticleProcessor(openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
+			                                           openfpm::vector<aggregate<size_t,size_t,size_t>> & lbl_p,
+			                                           openfpm::vector<size_t> & prc_sz)
 	{
 		// reset lbl_p
 		lbl_p.clear();
@@ -969,10 +980,10 @@ public:
                 {
                 	size_t opt_ = compute_options(opt);
                 	op_ssend_gg_recv_merge opm(g_m);
-                    v_cl.SSendRecvP_op<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
+                    v_cl.SSendRecvP_op<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
                 }
                 else
-                	v_cl.SSendRecvP<send_vector,decltype(v_prp),prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte);
+                	v_cl.SSendRecvP<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte);
 
                 // fill g_opart_sz
                 g_opart_sz.resize(prc_g_opart.size());
@@ -1076,7 +1087,7 @@ public:
 		fill_send_map_buf_list<prp_object,prp...>(v_pos,v_prp,prc_sz_r, m_pos, m_prp);
 
 		v_cl.SSendRecv(m_pos,v_pos,prc_r,prc_recv_map,recv_sz_map);
-		v_cl.SSendRecvP<openfpm::vector<prp_object>,decltype(v_prp),prp...>(m_prp,v_prp,prc_r,prc_recv_map,recv_sz_map);
+		v_cl.SSendRecvP<openfpm::vector<prp_object>,decltype(v_prp),layout_base,prp...>(m_prp,v_prp,prc_r,prc_recv_map,recv_sz_map);
 
 		// mark the ghost part
 
@@ -1096,7 +1107,9 @@ public:
 	 * \param g_m ghost marker
 	 *
 	 */
-	template<typename obp = KillParticle> void map_(openfpm::vector<Point<dim, St>> & v_pos, openfpm::vector<prop> & v_prp, size_t & g_m)
+	template<typename obp = KillParticle>
+	void map_(openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
+			  openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp, size_t & g_m)
 	{
 		// Processor communication size
 		openfpm::vector<size_t> prc_sz(v_cl.getProcessingUnits());
@@ -1125,14 +1138,21 @@ public:
 		}
 
 		//! position vector
-		openfpm::vector<openfpm::vector<Point<dim, St>>> m_pos;
+		openfpm::vector<openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base>> m_pos;
 		//! properties vector
-		openfpm::vector<openfpm::vector<prop>> m_prp;
+		openfpm::vector<openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base>> m_prp;
 
 		fill_send_map_buf(v_pos,v_prp, prc_sz_r, m_pos, m_prp);
 
-		v_cl.SSendRecv(m_pos,v_pos,prc_r,prc_recv_map,recv_sz_map);
-		v_cl.SSendRecv(m_prp,v_prp,prc_r,prc_recv_map,recv_sz_map);
+		v_cl.SSendRecv<openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base>,
+					   openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base>,
+					   layout_base>
+					   (m_pos,v_pos,prc_r,prc_recv_map,recv_sz_map);
+
+		v_cl.SSendRecv<openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base>,
+					   openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base>,
+					   layout_base>
+					   (m_prp,v_prp,prc_r,prc_recv_map,recv_sz_map);
 
 		// mark the ghost part
 
@@ -1215,12 +1235,12 @@ public:
 			size_t opt_ = compute_options(opt);
 
 			op_ssend_recv_merge<op> opm(g_opart);
-			v_cl.SSendRecvP_op<op_ssend_recv_merge<op>,send_vector,decltype(v_prp),prp...>(g_send_prp,v_prp,prc_recv_get,opm,prc_g_opart,g_opart_sz,opt_);
+			v_cl.SSendRecvP_op<op_ssend_recv_merge<op>,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_recv_get,opm,prc_g_opart,g_opart_sz,opt_);
 		}
 		else
 		{
 			op_ssend_recv_merge<op> opm(g_opart);
-			v_cl.SSendRecvP_op<op_ssend_recv_merge<op>,send_vector,decltype(v_prp),prp...>(g_send_prp,v_prp,prc_recv_get,opm,prc_recv_put,recv_sz_put);
+			v_cl.SSendRecvP_op<op_ssend_recv_merge<op>,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_recv_get,opm,prc_recv_put,recv_sz_put);
 		}
 
 		// process also the local replicated particles
diff --git a/src/Vector/vector_dist_gpu_unit_tests.cpp b/src/Vector/vector_dist_gpu_unit_tests.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3853ec1cd9e574fb1ed448b381df0af8e6ba02d9
--- /dev/null
+++ b/src/Vector/vector_dist_gpu_unit_tests.cpp
@@ -0,0 +1,43 @@
+
+#define BOOST_TEST_DYN_LINK
+#include <boost/test/unit_test.hpp>
+#include "VCluster/VCluster.hpp"
+#include <Vector/vector_dist.hpp>
+
+BOOST_AUTO_TEST_SUITE( vector_dist_gpu_test )
+
+void print_test(std::string test, size_t sz)	// prints "<test> <sz>" followed by a newline to std::cout
+{
+	if (create_vcluster().getProcessUnitID() == 0)	// only the process whose unit ID is 0 prints
+		std::cout << test << " " << sz << "\n";
+}
+
+BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)	// fills the positions of a vector_dist_gpu with random values and calls map()
+{
+	Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0});	// 3D box built from (0,0,0) and (1,1,1)
+
+	// Ghost constructed with 0.01 (presumably the ghost-layer extension; no cut-off radius or spacing is defined in this test - TODO confirm)
+	Ghost<3,float> g(0.01);
+
+	// Boundary conditions: NON_PERIODIC in all three dimensions
+	size_t bc[3]={NON_PERIODIC,NON_PERIODIC,NON_PERIODIC};
+
+	vector_dist_gpu<3,float,aggregate<float,float[3]>> vd(1000,domain,bc,g);	// 1000 is presumably the particle count - TODO confirm
+
+	auto it = vd.getDomainIterator();
+
+	while (it.isNext())
+	{
+		auto p = it.get();
+
+		vd.getPos(p)[0] = (float)rand() / RAND_MAX;	// each coordinate gets a pseudo-random value in [0,1]
+		vd.getPos(p)[1] = (float)rand() / RAND_MAX;
+		vd.getPos(p)[2] = (float)rand() / RAND_MAX;
+
+		++it;
+	}
+
+	vd.map();	// NOTE(review): nothing is asserted after map(); the test only checks that this compiles and runs
+}
+
+BOOST_AUTO_TEST_SUITE_END()