diff --git a/src/NN/CellList/CellList_gpu_test.cu b/src/NN/CellList/CellList_gpu_test.cu
index 55ad9a0517b87ecfb372da57c8b7febdac72470e..2d19dd2a1f2f33488c66fb98eee03029648d8d56 100644
--- a/src/NN/CellList/CellList_gpu_test.cu
+++ b/src/NN/CellList/CellList_gpu_test.cu
@@ -83,17 +83,16 @@ void test_sub_index()
 
 	no_transform_only<dim,T> t(mt,pt);
 
+
 	CUDA_LAUNCH_DIM3((subindex<false,dim,T,cnt_type,ids_type,no_transform_only<dim,T>>),ite.wthr,ite.thr,div,
 																	spacing,
 																	off,
 																	t,
-																	pl.capacity(),
 																	pl.size(),
-																	part_ids.capacity(),
 																	0,
-																	static_cast<T *>(pl.template getDeviceBuffer<0>()),
-																	static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()),
-																	static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()));
+																	pl.toKernel(),
+																	cl_n.toKernel(),
+																	part_ids.toKernel());
 
 	cl_n.template deviceToHost<0>();
 	part_ids.template deviceToHost<0>();
@@ -203,17 +202,16 @@ void test_sub_index2()
 
 	shift_only<dim,T> t(mt,pt);
 
+
 	CUDA_LAUNCH_DIM3((subindex<false,dim,T,cnt_type,ids_type,shift_only<dim,T>>),ite.wthr,ite.thr,div,
 																	spacing,
 																	off,
 																	t,
-																	pl.capacity(),
 																	pl.size(),
-																	part_ids.capacity(),
 																	0,
-																	static_cast<T *>(pl.template getDeviceBuffer<0>()),
-																	static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()),
-																	static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()));
+																	pl.toKernel(),
+																	cl_n.toKernel(),
+																	part_ids.toKernel());
 
 	cl_n.template deviceToHost<0>();
 	part_ids.template deviceToHost<0>();
@@ -387,11 +385,10 @@ void test_fill_cell()
 																				   div_c,
 																				   off,
 																				   part_ids.size(),
-																				   part_ids.capacity(),
 																				   0,
-																				   static_cast<cnt_type *>(starts.template getDeviceBuffer<0>()),
-																				   static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()),
-																				   static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()) );
+																				   starts.toKernel(),
+																				   part_ids.toKernel(),
+																				   cells.toKernel() );
 
 	cells.template deviceToHost<0>();
 
@@ -666,7 +663,8 @@ void test_reorder_parts(size_t n_part)
 	// Here we test fill cell
 	CUDA_LAUNCH_DIM3((reorder_parts<decltype(parts_prp.toKernel()),
 			      decltype(pl.toKernel()),
-			      decltype(sort_to_not_sort.toKernel()),
+			      decltype(sort_to_not_sort.toKernel()),
+			      decltype(cells_out.toKernel()),
 			      cnt_type,
 			      shift_ph<0,cnt_type>>),ite.wthr,ite.thr,pl.size(),
 			                                                  parts_prp.toKernel(),
@@ -675,7 +673,7 @@ void test_reorder_parts(size_t n_part)
 			                                                  pl_out.toKernel(),
 			                                                  sort_to_not_sort.toKernel(),
 			                                                  non_sort_to_sort.toKernel(),
-			                                                  static_cast<cnt_type *>(cells_out.template getDeviceBuffer<0>()));
+			                                                  cells_out.toKernel());
 
 	bool check = true;
 	parts_prp_out.template deviceToHost<0>();
diff --git a/src/NN/CellList/cuda/CellList_gpu.hpp b/src/NN/CellList/cuda/CellList_gpu.hpp
index defee388cdefe38e20acf4e8a7397863c6ff2b70..5ee0e2b043520674e05d0e7a35456865dc707a29 100644
--- a/src/NN/CellList/cuda/CellList_gpu.hpp
+++ b/src/NN/CellList/cuda/CellList_gpu.hpp
@@ -261,13 +261,11 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 																		spacing_c,
 																		off,
 																		this->getTransform(),
-																		pl.capacity(),
 																		pl.size(),
-																		part_ids.capacity(),
 																		start,
-																		static_cast<T *>(pl.template getDeviceBuffer<0>()),
-																		static_cast<cnt_type *>(starts.template getDeviceBuffer<0>()),
-																		static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()));
+																		pl.toKernel(),
+																		starts.toKernel(),
+																		part_ids.toKernel());
 
 		// now we construct the cells
 
@@ -306,6 +304,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 		CUDA_LAUNCH((reorder_parts<decltype(pl_prp.toKernel()),
 				      decltype(pl.toKernel()),
 				      decltype(sorted_to_not_sorted.toKernel()),
+				      decltype(cells.toKernel()),
 				      cnt_type,shift_ph<0,cnt_type>>),ite,sorted_to_not_sorted.size(),
 				                                                           pl_prp.toKernel(),
 				                                                           pl_prp_out.toKernel(),
@@ -313,7 +312,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 				                                                           pl_out.toKernel(),
 				                                                           sorted_to_not_sorted.toKernel(),
 				                                                           non_sorted_to_sorted.toKernel(),
-				                                                           static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()));
+				                                                           cells.toKernel());
 
 		if (opt == cl_construct_opt::Full)
 		{
@@ -391,13 +390,11 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 																		spacing_c,
 																		off,
 																		this->getTransform(),
-																		pl.capacity(),
 																		stop,
-																		part_ids.capacity(),
 																		start,
-																		static_cast<T *>(pl.template getDeviceBuffer<0>()),
-																		static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()),
-																		static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()));
+																		pl.toKernel(),
+																		cl_n.toKernel(),
+																		part_ids.toKernel());
 
 		// now we scan
 		starts.resize(cl_n.size());
@@ -413,7 +410,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 
                 CUDA_LAUNCH((fill_cells<dim,cnt_type,ids_type,shift_ph<0,cnt_type>>),itgg,0,
                                                                                             part_ids.size(),
-                                                                                            static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()) );
+                                                                                            cells.toKernel() );
 
                 // sort
 
@@ -422,14 +419,13 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 #else
 
                 CUDA_LAUNCH((fill_cells<dim,cnt_type,ids_type,shift_ph<0,cnt_type>>),itgg,0,
-                                                                                                                                                                           div_c,
-                                                                                                                                                                           off,
-                                                                                                                                                                           part_ids.size(),
-                                                                                                                                                                           part_ids.capacity(),
-																					   start,
-                                                                                                                                                                           static_cast<cnt_type *>(starts.template getDeviceBuffer<0>()),
-                                                                                                                                                                           static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()),
-                                                                                                                                                                           static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()) );
+                                                                                    div_c,
+                                                                                    off,
+                                                                                    part_ids.size(),
+                                                                                    start,
+                                                                                    starts.toKernel(),
+                                                                                    part_ids.toKernel(),
+                                                                                    cells.toKernel() );
 
 #endif
 
@@ -445,6 +441,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 			CUDA_LAUNCH((reorder_parts<decltype(pl_prp.toKernel()),
 						  decltype(pl.toKernel()),
 						  decltype(sorted_to_not_sorted.toKernel()),
+						  decltype(cells.toKernel()),
 						  cnt_type,shift_ph<0,cnt_type>>),ite,sorted_to_not_sorted.size(),
 																			   pl_prp.toKernel(),
 																			   pl_prp_out.toKernel(),
@@ -452,7 +449,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 																			   pl_out.toKernel(),
 																			   sorted_to_not_sorted.toKernel(),
 																			   non_sorted_to_sorted.toKernel(),
-																			   static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()));
+																			   cells.toKernel());
 		}
 		else
 		{
@@ -460,6 +457,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 			CUDA_LAUNCH((reorder_parts_wprp<decltype(pl_prp.toKernel()),
 						  decltype(pl.toKernel()),
 						  decltype(sorted_to_not_sorted.toKernel()),
+						  decltype(cells.toKernel()),
 						  cnt_type,shift_ph<0,cnt_type>,prp...>),ite,sorted_to_not_sorted.size(),
 																			   pl_prp.toKernel(),
 																			   pl_prp_out.toKernel(),
@@ -467,7 +465,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 																			   pl_out.toKernel(),
 																			   sorted_to_not_sorted.toKernel(),
 																			   non_sorted_to_sorted.toKernel(),
-																			   static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()));
+																			   cells.toKernel());
 		}
 
 		if (opt == cl_construct_opt::Full)
diff --git a/src/NN/CellList/cuda/Cuda_cell_list_util_func.hpp b/src/NN/CellList/cuda/Cuda_cell_list_util_func.hpp
index 0d7c86a9fea377d0e6f538c36c46d7db5f3b29eb..e50335e7a2837e4fba52cbdb51fc8243a76341a8 100644
--- a/src/NN/CellList/cuda/Cuda_cell_list_util_func.hpp
+++ b/src/NN/CellList/cuda/Cuda_cell_list_util_func.hpp
@@ -258,18 +258,17 @@ __device__ __host__ cnt_type encode_phase_id(cnt_type ph_id,cnt_type pid)
 #ifdef __NVCC__
 
 template<bool is_sparse,unsigned int dim, typename pos_type,
-         typename cnt_type, typename ids_type, typename transform>
+         typename cnt_type, typename ids_type, typename transform,
+         typename vector_pos_type, typename vector_cnt_type, typename vector_pids_type>
 __global__ void subindex(openfpm::array<ids_type,dim,cnt_type> div,
 						 openfpm::array<pos_type,dim,cnt_type> spacing,
 						 openfpm::array<ids_type,dim,cnt_type> off,
 						 transform t,
-						 int n_cap,
 						 int n_part,
-						 int n_cap2,
 						 cnt_type start,
-						 pos_type * p_pos,
-						 cnt_type *counts,
-						 cnt_type * p_ids)
+						 vector_pos_type p_pos,
+						 vector_cnt_type counts,
+						 vector_pids_type p_ids)
 {
     cnt_type i, cid, ins;
     ids_type e[dim+1];
@@ -281,23 +280,23 @@ __global__ void subindex(openfpm::array<ids_type,dim,cnt_type> div,
     pos_type p[dim];
 
     for (size_t k = 0 ; k < dim ; k++)
-    {p[k] = p_pos[i+k*n_cap];}
+    {p[k] = p_pos.template get<0>(i)[k];}
 
     cid = cid_<dim,cnt_type,ids_type,transform>::get_cid(div,spacing,off,t,p,e);
 
     if (is_sparse == false)
     {
-    	e[dim] = atomicAdd(counts + cid, 1);
+    	e[dim] = atomicAdd(&counts.template get<0>(cid), 1);
 
-    	p_ids[ins] = cid;
-        {p_ids[ins+1*(n_cap2)] = e[dim];}
+    	p_ids.template get<0>(ins)[0] = cid;
+        {p_ids.template get<0>(ins)[1] = e[dim];}
     }
     else
     {
-        p_ids[ins] = cid;
-        {p_ids[ins+1*(n_cap2)] = e[dim];}
+        p_ids.template get<0>(ins)[0] = cid;
+        {p_ids.template get<0>(ins)[1] = e[dim];}
 
-        counts[ins] = cid;
+        counts.template get<0>(ins) = cid;
     }
 }
 
@@ -353,43 +352,42 @@ __global__ void subindex_without_count(openfpm::array<ids_type,dim,cnt_type> div
 
 #ifdef MAKE_CELLLIST_DETERMINISTIC
 
-template<unsigned int dim, typename cnt_type, typename ids_type, typename ph>
+template<unsigned int dim, typename cnt_type, typename ids_type, typename ph, typename vector_cells_type>
 __global__ void fill_cells(cnt_type phase_id ,
 						   cnt_type n,
-		                   cnt_type *cells)
+		                   vector_cells_type cells)
 {
     cnt_type i;
 
     i = threadIdx.x + blockIdx.x * blockDim.x;
     if (i >= n) return;
 
-    cells[i] = encode_phase_id<cnt_type,ph>(phase_id,i);
+    cells.template get<0>(i) = encode_phase_id<cnt_type,ph>(phase_id,i);
 }
 
 #else
 
-template<unsigned int dim, typename cnt_type, typename ids_type, typename ph>
+template<unsigned int dim, typename cnt_type, typename ids_type, typename ph, typename vector_starts_type, typename vector_pids_type, typename vector_cells_type>
 __global__ void fill_cells(cnt_type phase_id ,
 		                   openfpm::array<ids_type,dim,cnt_type> div_c,
 		                   openfpm::array<ids_type,dim,cnt_type> off,
 		                   cnt_type n,
-		                   cnt_type n_cap,
 		                   cnt_type start_p,
-		                   const cnt_type *starts,
-		                   const cnt_type * p_ids,
-		                   cnt_type *cells)
+		                   vector_starts_type starts,
+		                   vector_pids_type p_ids,
+		                   vector_cells_type cells)
 {
     cnt_type i, cid, id, start;
 
     i = threadIdx.x + blockIdx.x * blockDim.x;
     if (i >= n) return;
 
-    cid = p_ids[i];
+    cid = p_ids.template get<0>(i)[0];
 
-    start = starts[cid];
-    id = start + p_ids[1*n_cap+i];
+    start = starts.template get<0>(cid);
+    id = start + p_ids.template get<0>(i)[1];
 
-    cells[id] = encode_phase_id<cnt_type,ph>(phase_id,i + start_p);
+    cells.template get<0>(id) = encode_phase_id<cnt_type,ph>(phase_id,i + start_p);
 }
 
 #endif
@@ -424,7 +422,7 @@ __device__ inline void reorder_wprp(const vector_prp & input,
 	output.template set<prp ...>(dst_id,input,src_id);
 }
 
-template <typename vector_prp, typename vector_pos, typename vector_ns, typename cnt_type, typename sh>
+template <typename vector_prp, typename vector_pos, typename vector_ns, typename vector_cells_type, typename cnt_type, typename sh>
 __global__ void reorder_parts(int n,
 		                      const vector_prp input,
 		                      vector_prp output,
@@ -432,12 +430,12 @@ __global__ void reorder_parts(int n,
 		                      vector_pos output_pos,
 		                      vector_ns sorted_non_sorted,
 		                      vector_ns non_sorted_to_sorted,
-		                      const cnt_type * cells)
+		                      const vector_cells_type cells)
 {
     cnt_type i = threadIdx.x + blockIdx.x * blockDim.x;
     if (i >= n) return;
 
-    cnt_type code = cells[i];
+    cnt_type code = cells.template get<0>(i);
 
     reorder(input, output, code,i);
     reorder(input_pos,output_pos,code,i);
@@ -446,7 +444,7 @@ __global__ void reorder_parts(int n,
     non_sorted_to_sorted.template get<0>(code) = i;
 }
 
-template <typename vector_prp, typename vector_pos, typename vector_ns, typename cnt_type, typename sh, unsigned int ... prp>
+template <typename vector_prp, typename vector_pos, typename vector_ns, typename vector_cells_type, typename cnt_type, typename sh, unsigned int ... prp>
 __global__ void reorder_parts_wprp(int n,
 		                      const vector_prp input,
 		                      vector_prp output,
@@ -454,12 +452,12 @@ __global__ void reorder_parts_wprp(int n,
 		                      vector_pos output_pos,
 		                      vector_ns sorted_non_sorted,
 		                      vector_ns non_sorted_to_sorted,
-		                      const cnt_type * cells)
+		                      const vector_cells_type cells)
 {
     cnt_type i = threadIdx.x + blockIdx.x * blockDim.x;
     if (i >= n) return;
 
-    cnt_type code = cells[i];
+    cnt_type code = cells.template get<0>(i);
     reorder_wprp<vector_prp,cnt_type,prp...>(input, output, code,i);
     reorder(input_pos,output_pos,code,i);
 
diff --git a/src/Vector/vector_subset.hpp b/src/Vector/vector_subset.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d6d78b69467911a22f3384f8d1d1a097c714a1bd
--- /dev/null
+++ b/src/Vector/vector_subset.hpp
@@ -0,0 +1,83 @@
+#ifndef VECTOR_SUBSET_HPP
+#define VECTOR_SUBSET_HPP
+
+#include "Vector/map_vector.hpp"
+
+namespace openfpm
+{
+
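+/*! \brief Kernel-side view of a subset of a device vector
+ *
+ * Element i of the subset maps to element indexes.get<0>(i) of the full vector v_all.
+ */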
+template<typename prop,
+         template<typename> class layout_base = memory_traits_inte>
+    class vector_subset_ker
+    {
+        mutable openfpm::vector_gpu_ker<typename apply_transform<layout_base,prop>::type,layout_base> v_all;
+
+        mutable openfpm::vector_gpu_ker<aggregate<int>,layout_base> indexes;
+
+        public:
+
+        vector_subset_ker(const openfpm::vector_gpu_ker<typename apply_transform<layout_base,prop>::type,layout_base> & v_all,
+                          const openfpm::vector_gpu_ker<aggregate<int>,layout_base> & indexes)
+        :v_all(v_all),indexes(indexes)
+        {}
+
+        //! get property p of the id-th element of the subset (redirected through indexes)
+        template <unsigned int p>
+        __device__ __host__ inline auto get(size_t id) const -> decltype(v_all.template get<p>(0))
+        {
+            return v_all.template get<p>(indexes.template get<0>(id));
+        }
+    };
+
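+    /*! \brief Host-side view of a subset of an openfpm::vector
+     *
+     * Holds references to the full vector and to a vector of selected indexes;
+     * elements are not copied, every access is redirected through the indexes.
+     */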
+    template<typename T, typename Memory, template<typename> class layout_base, typename grow_p, unsigned int impl>
+    class vector_sub
+    {
+        vector<T,Memory,layout_base,grow_p,impl> & v_all;
+
+        vector<aggregate<int>,Memory,layout_base,grow_p,impl> & indexes;
+
+        public:
+
+            vector_sub(vector<T,Memory,layout_base,grow_p,impl> & v_all,
+                       vector<aggregate<int>,Memory,layout_base,grow_p,impl> & indexes)
+            :v_all(v_all),indexes(indexes)
+            {}
+
+
+            //! Produce the kernel-side view of this subset
+            vector_subset_ker<T,layout_base> toKernel()
+            {
+                vector_subset_ker<T,layout_base> v(v_all.toKernel(), indexes.toKernel());
+
+                return v;
+            }
+
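+            //! get property p of the id-th element of the subset (redirected through indexes)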
+            template <unsigned int p>
+            inline auto get(size_t id) const -> decltype(v_all.template get<p>(0))
+            {
+                return v_all.template get<p>(indexes.template get<0>(id));
+            }
+
+    };
+
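+    // Minimal usage sketch (illustrative only; the concrete template arguments are hypothetical):
+    //
+    //   openfpm::vector_gpu<aggregate<float>> v_full;   // full data set
+    //   openfpm::vector_gpu<aggregate<int>>   sel;      // indexes of the selected elements
+    //
+    //   vector_sub<aggregate<float>,CudaMemory,...> subset(v_full,sel);   // view, nothing is copied
+    //   subset.get<0>(k);     // reads v_full at position sel.template get<0>(k)
+    //   subset.toKernel();    // kernel-side view usable inside CUDA kernels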
+}
+
+#endif