From 2ff916ddeacf8dbdeb34a77b5faa5b3239a66317 Mon Sep 17 00:00:00 2001
From: Serhii Yaskovets <yaskovet@sbalzarini-mac-38.mpi-cbg.de>
Date: Thu, 27 Apr 2023 12:10:37 +0200
Subject: [PATCH] Revert "Revert "Merge remote-tracking branch
 'origin/FD_solver'""

This reverts commit c1c3e0c4e8586bf5061dbabbfa48dd0c2dc86960.
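
Reinstating the FD_solver merge brings back, among other things:

 - CUDA cell-list kernels (subindex, fill_cells, reorder_parts) that take
   openfpm vector views obtained via toKernel() instead of raw device
   pointers plus capacity/stride arguments;
 - __device__ __host__ qualifiers on grid_sm, grid_key_dx::set, Point::value
   and the openfpm::math helpers;
 - host/device getCellGrid()/ConvertToID() in CellDecomposer_gpu_ker;
 - iterator-range constructors, getProp(), getDomainIteratorGPU() and
   size_local() on the vector classes, plus the new
   src/Vector/vector_subset.hpp.

As a rough, illustrative sketch of the calling-convention change (template
arguments and launch parameters abbreviated), launches of the form

    CUDA_LAUNCH_DIM3((subindex<...>), ite.wthr, ite.thr,
        div, spacing, off, t,
        pl.capacity(), pl.size(), part_ids.capacity(), 0,
        static_cast<T *>(pl.template getDeviceBuffer<0>()),
        static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()),
        static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()));
    // kernel side: p[k] = p_pos[i + k*n_cap];

become

    CUDA_LAUNCH_DIM3((subindex<...>), ite.wthr, ite.thr,
        div, spacing, off, t,
        pl.size(), 0,
        pl.toKernel(), cl_n.toKernel(), part_ids.toKernel());
    // kernel side: p[k] = p_pos.template get<0>(i)[k];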
---
 src/Grid/grid_base_implementation.hpp         |  2 +-
 src/Grid/grid_key.hpp                         |  2 +-
 src/Grid/grid_sm.hpp                          | 27 ++++--
 src/NN/CellList/CellList_gpu_test.cu          | 30 +++---
 .../CellList/cuda/CellDecomposer_gpu_ker.cuh  | 95 +++++++++++++++++--
 src/NN/CellList/cuda/CellList_gpu.hpp         | 89 +++++++++--------
 src/NN/CellList/cuda/CellList_gpu_ker.cuh     | 29 +++---
 .../cuda/Cuda_cell_list_util_func.hpp         | 60 ++++++------
 src/Space/Shape/Point.hpp                     |  2 +-
 .../performance/performancePlots.cpp          |  6 +-
 src/Vector/cuda/map_vector_cuda_ker.cuh       | 80 +++++++++++++++-
 src/Vector/cuda/map_vector_std_cuda.hpp       | 15 ++-
 src/Vector/cuda/map_vector_std_cuda_ker.cuh   | 21 ++++
 src/Vector/map_vector.hpp                     | 43 +++++++--
 src/Vector/map_vector_std.hpp                 | 10 ++
 src/Vector/vector_subset.hpp                  | 67 +++++++++++++
 src/util/mathutil.hpp                         | 35 ++++---
 17 files changed, 473 insertions(+), 140 deletions(-)
 create mode 100644 src/Vector/vector_subset.hpp

diff --git a/src/Grid/grid_base_implementation.hpp b/src/Grid/grid_base_implementation.hpp
index 60392952..7a1a6693 100644
--- a/src/Grid/grid_base_implementation.hpp
+++ b/src/Grid/grid_base_implementation.hpp
@@ -752,7 +752,7 @@ public:
 		{
 			auto key = it.get();
 
-			if (this->get_o(key) != this->get_o(key))
+			if (this->get_o(key) != g.get_o(key))
 				return false;
 
 			++it;
diff --git a/src/Grid/grid_key.hpp b/src/Grid/grid_key.hpp
index 106f7ee5..90fab130 100644
--- a/src/Grid/grid_key.hpp
+++ b/src/Grid/grid_key.hpp
@@ -397,7 +397,7 @@ public:
 	 *
 	 */
 	template<typename a, typename ...T>
-	inline void set(a v, T...t)
+	__device__ __host__ inline void set(a v, T...t)
 	{
 #ifdef SE_CLASS1
 		if (sizeof...(t) != dim -1)
diff --git a/src/Grid/grid_sm.hpp b/src/Grid/grid_sm.hpp
index e4511d29..651608b2 100644
--- a/src/Grid/grid_sm.hpp
+++ b/src/Grid/grid_sm.hpp
@@ -186,7 +186,7 @@ class grid_sm
 	 *
 	 */
 
-	inline void Initialize(const size_t sz)
+	__device__ __host__ inline void Initialize(const size_t sz)
 	{
 		//! Initialize the basic structure for each dimension
 		sz_s[0] = sz;
@@ -217,7 +217,7 @@ class grid_sm
 	 *
 	 */
 
-	inline void Initialize(const size_t (& sz)[N])
+	__device__ __host__ inline void Initialize(const size_t (& sz)[N])
 	{
 		//! Initialize the basic structure for each dimension
 		sz_s[0] = sz[0];
@@ -353,7 +353,8 @@ public:
 	 *
 	 */
 
-	template<typename S> inline grid_sm(const grid_sm<N,S> & g)
+	template<typename S>
+	__device__ __host__ inline grid_sm(const grid_sm<N,S> & g)
 	{
 		size_tot = g.size_tot;
 
@@ -380,7 +381,7 @@ public:
 
 	// Static element to calculate total size
 
-	inline size_t totalSize(const size_t (& sz)[N])
+	__device__ __host__ inline size_t totalSize(const size_t (& sz)[N])
 	{
 		size_t tSz = 1;
 
@@ -415,7 +416,7 @@ public:
 	 *
 	 */
 
-	inline grid_sm(const size_t (& sz)[N])
+	__device__ __host__ inline grid_sm(const size_t (& sz)[N])
 	: size_tot(totalSize(sz))
 	{
 		Initialize(sz);
@@ -625,7 +626,7 @@ public:
 	}
 
 	//! Destructor
-	~grid_sm() {};
+	__device__ __host__ ~grid_sm() {};
 
 	/*! \brief Return the size of the grid
 	 *
@@ -713,7 +714,7 @@ public:
 	 *
 	 */
 
-	inline size_t size_s(unsigned int i) const
+	__device__ __host__ inline size_t size_s(unsigned int i) const
 	{
 		return sz_s[i];
 	}
@@ -737,11 +738,21 @@ public:
 	 * \return get the size of the grid as an array
 	 *
 	 */
-	inline const size_t (& getSize() const)[N]
+	__device__ __host__ inline const size_t (& getSize() const)[N]
 	{
 		return sz;
 	}
 
+	/*! \brief Returns the size of the grid in the passed array
+	 *
+	 * \return the size of the grid in the passed array
+	 *
+	 */
+	__device__ __host__ inline void getSize(size_t (&size) [N]) const
+	{
+		for (size_t i = 0; i < N; ++i) size[i] = sz[i];
+	}
+
 	/*! \brief Return a sub-grid iterator
 	 *
 	 * Return a sub-grid iterator, to iterate through the grid
diff --git a/src/NN/CellList/CellList_gpu_test.cu b/src/NN/CellList/CellList_gpu_test.cu
index 55ad9a05..2d19dd2a 100644
--- a/src/NN/CellList/CellList_gpu_test.cu
+++ b/src/NN/CellList/CellList_gpu_test.cu
@@ -83,17 +83,16 @@ void test_sub_index()
 
 	no_transform_only<dim,T> t(mt,pt);
 
+
 	CUDA_LAUNCH_DIM3((subindex<false,dim,T,cnt_type,ids_type,no_transform_only<dim,T>>),ite.wthr,ite.thr,div,
 																	spacing,
 																	off,
 																	t,
-																	pl.capacity(),
 																	pl.size(),
-																	part_ids.capacity(),
 																	0,
-																	static_cast<T *>(pl.template getDeviceBuffer<0>()),
-																	static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()),
-																	static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()));
+																	pl.toKernel(),
+																	cl_n.toKernel(),
+																	part_ids.toKernel());
 
 	cl_n.template deviceToHost<0>();
 	part_ids.template deviceToHost<0>();
@@ -203,17 +202,16 @@ void test_sub_index2()
 
 	shift_only<dim,T> t(mt,pt);
 
+
 	CUDA_LAUNCH_DIM3((subindex<false,dim,T,cnt_type,ids_type,shift_only<dim,T>>),ite.wthr,ite.thr,div,
 																	spacing,
 																	off,
 																	t,
-																	pl.capacity(),
 																	pl.size(),
-																	part_ids.capacity(),
 																	0,
-																	static_cast<T *>(pl.template getDeviceBuffer<0>()),
-																	static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()),
-																	static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()));
+																	pl.toKernel(),
+																	cl_n.toKernel(),
+																	part_ids.toKernel());
 
 	cl_n.template deviceToHost<0>();
 	part_ids.template deviceToHost<0>();
@@ -387,11 +385,10 @@ void test_fill_cell()
 																				   div_c,
 																				   off,
 																				   part_ids.size(),
-																				   part_ids.capacity(),
 																				   0,
-																				   static_cast<cnt_type *>(starts.template getDeviceBuffer<0>()),
-																				   static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()),
-																				   static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()) );
+																				   starts.toKernel(),
+																				   part_ids.toKernel(),
+																				   cells.toKernel() );
 
 	cells.template deviceToHost<0>();
 
@@ -666,7 +663,8 @@ void test_reorder_parts(size_t n_part)
 	// Here we test fill cell
 	CUDA_LAUNCH_DIM3((reorder_parts<decltype(parts_prp.toKernel()),
 			      decltype(pl.toKernel()),
-			      decltype(sort_to_not_sort.toKernel()),
+				  decltype(sort_to_not_sort.toKernel()),
+				  decltype(cells_out.toKernel()),
 			      cnt_type,
 			      shift_ph<0,cnt_type>>),ite.wthr,ite.thr,pl.size(),
 			                                                  parts_prp.toKernel(),
@@ -675,7 +673,7 @@ void test_reorder_parts(size_t n_part)
 			                                                  pl_out.toKernel(),
 			                                                  sort_to_not_sort.toKernel(),
 			                                                  non_sort_to_sort.toKernel(),
-			                                                  static_cast<cnt_type *>(cells_out.template getDeviceBuffer<0>()));
+			                                                  cells_out.toKernel());
 
 	bool check = true;
 	parts_prp_out.template deviceToHost<0>();
diff --git a/src/NN/CellList/cuda/CellDecomposer_gpu_ker.cuh b/src/NN/CellList/cuda/CellDecomposer_gpu_ker.cuh
index fc108d88..3981ce07 100644
--- a/src/NN/CellList/cuda/CellDecomposer_gpu_ker.cuh
+++ b/src/NN/CellList/cuda/CellDecomposer_gpu_ker.cuh
@@ -29,30 +29,71 @@ class CellDecomposer_gpu_ker
 	//! transformation
 	transform t;
 
+	//! Unit box of the Cell list
+	SpaceBox<dim,T> box_unit;
+
+	//! Grid structure of the Cell list
+	grid_sm<dim,void> gr_cell;
+
+	//! cell_shift
+	Point<dim,long int> cell_shift;
 public:
 
 	__device__ __host__ CellDecomposer_gpu_ker()
 	{}
 
-	__device__ __host__ CellDecomposer_gpu_ker(openfpm::array<T,dim,cnt_type> & spacing_c,
-	         	 	 	  openfpm::array<ids_type,dim,cnt_type> & div_c,
-	         	 	 	  openfpm::array<ids_type,dim,cnt_type> & off,
-	         	 	 	  const transform & t)
-	:spacing_c(spacing_c),div_c(div_c),off(off),t(t)
+	__device__ __host__ CellDecomposer_gpu_ker(
+		openfpm::array<T,dim,cnt_type> & spacing_c,
+		openfpm::array<ids_type,dim,cnt_type> & div_c,
+		openfpm::array<ids_type,dim,cnt_type> & off,
+		const transform & t)
+	: spacing_c(spacing_c),div_c(div_c),off(off),t(t)
 	{}
 
-	__host__ grid_sm<dim,void> getGrid()
+	__device__ __host__ CellDecomposer_gpu_ker(
+		openfpm::array<T,dim,cnt_type> & spacing_c,
+		openfpm::array<ids_type,dim,cnt_type> & div_c,
+		openfpm::array<ids_type,dim,cnt_type> & off,
+		const transform & t,
+		SpaceBox<dim,T> box_unit,
+		grid_sm<dim,void> gr_cell,
+		Point<dim,long int> cell_shift)
+	: spacing_c(spacing_c),div_c(div_c),off(off),t(t),
+	box_unit(box_unit), gr_cell(gr_cell), cell_shift(cell_shift)
+	{}
+
+	__device__ __host__ grid_sm<dim,void> getGrid()
 	{
 		size_t sz[dim];
 
 		for (size_t i = 0 ; i < dim ; i++)
 		{
-			sz[i] = div_c[i] + 2*off[i];
+			sz[i] = div_c[i];
 		}
 
 		return grid_sm<dim,void> (sz);
 	}
 
+	__device__ __host__ void getGridSize(size_t (& sz)[dim]) const
+	{
+		for (size_t i = 0 ; i < dim ; i++)
+		{
+			sz[i] = div_c[i] + 2*off[i];
+		}
+	}
+
+	template<typename ids_type2>
+	__device__ __host__ mem_id getGridLinId(const grid_key_dx<dim,ids_type2> & gk) const
+	{
+		mem_id lid = gk.get(0);
+		for (mem_id i = 1 ; i < dim ; i++)
+		{
+			lid += gk.get(i) * (div_c[i-1] + 2*off[i-1]);
+		}
+
+		return lid;
+	}
+
 	__device__ __host__ inline grid_key_dx<dim,ids_type> getCell(const Point<dim,T> & xp) const
 	{
 		return cid_<dim,cnt_type,ids_type,transform>::get_cid_key(spacing_c,off,t,xp);
@@ -82,6 +123,47 @@ public:
 	{
 		return t;
 	}
+
+	__device__ __host__ inline grid_key_dx<dim> getCellGrid(const T (& pos)[dim]) const
+	{
+		grid_key_dx<dim> key;
+		key.set_d(0,ConvertToID(pos,0));
+
+		for (size_t s = 1 ; s < dim ; s++)
+		{
+			key.set_d(s,ConvertToID(pos,s));
+		}
+
+		return key;
+	}
+
+	__device__ __host__ inline grid_key_dx<dim> getCellGrid(const Point<dim,T> & pos) const
+	{
+		grid_key_dx<dim> key;
+		key.set_d(0,ConvertToID(pos,0));
+
+		for (size_t s = 1 ; s < dim ; s++)
+		{
+			key.set_d(s,ConvertToID(pos,s));
+		}
+
+		return key;
+	}
+
+	__device__ __host__ inline size_t ConvertToID(const T (&x)[dim] ,size_t s) const
+	{
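+		// cell id along s: floor(transformed coordinate / cell edge) + padding offset, clamped to the last cell and shifted by cell_shift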
+		size_t id = openfpm::math::size_t_floor(t.transform(x,s) / box_unit.getHigh(s)) + off[s];
+		id = (id >= gr_cell.size(s))?(gr_cell.size(s)-1-cell_shift.get(s)):id-cell_shift.get(s);
+		return id;
+	}
+
+	__device__ __host__ inline size_t ConvertToID(const Point<dim,T> & x ,size_t s, size_t sc = 0) const
+	{
+		size_t id = openfpm::math::size_t_floor(t.transform(x,s) / box_unit.getHigh(s)) + off[s];
+		id = (id >= gr_cell.size(s))?(gr_cell.size(s)-1-cell_shift.get(s)):id-cell_shift.get(s);
+		return id;
+	}
 };
 
 #endif /* CELLDECOMPOSER_GPU_KER_HPP_ */
diff --git a/src/NN/CellList/cuda/CellList_gpu.hpp b/src/NN/CellList/cuda/CellList_gpu.hpp
index c7623583..5ee0e2b0 100644
--- a/src/NN/CellList/cuda/CellList_gpu.hpp
+++ b/src/NN/CellList/cuda/CellList_gpu.hpp
@@ -44,7 +44,10 @@ struct CellList_gpu_ker_selector
 																			 openfpm::array<ids_type,dim,cnt_type> & div_c,
 																			 openfpm::array<ids_type,dim,cnt_type> & off,
 																			 const transform & t,
-																			 unsigned int g_m)
+																			 unsigned int g_m,
+																			 const SpaceBox<dim,T>& box_unit,
+																			 const grid_sm<dim,void>& gr_cell,
+																			 const Point<dim,long int>& cell_shift)
 	{
 		return CellList_gpu_ker<dim,T,cnt_type,ids_type,transform,is_sparse>(starts.toKernel(),
 																			sorted_to_not_sorted.toKernel(),
@@ -54,7 +57,10 @@ struct CellList_gpu_ker_selector
 																			div_c,
 																			off,
 																			t,
-																			g_m);
+																			g_m,
+																			box_unit,
+																			gr_cell,
+																			cell_shift);
 	}
 };
 
@@ -73,10 +79,14 @@ struct CellList_gpu_ker_selector<dim,T,cnt_type,ids_type,Memory,transform,vector
 			 vector_cnt_type & dprt,
 			 openfpm::vector<aggregate<int>,Memory,memory_traits_inte> & nnc_rad,
 			 openfpm::array<T,dim,cnt_type> & spacing_c,
-	         openfpm::array<ids_type,dim,cnt_type> & div_c,
-	         openfpm::array<ids_type,dim,cnt_type> & off,
-	         const transform & t,
-	         unsigned int g_m)
+			 openfpm::array<ids_type,dim,cnt_type> & div_c,
+			 openfpm::array<ids_type,dim,cnt_type> & off,
+			 const transform & t,
+			 unsigned int g_m,
+			 const SpaceBox<dim,T>& box_unit,
+			 const grid_sm<dim,void>& gr_cell,
+			 const Point<dim,long int>& cell_shift)
+
 	{
 		return CellList_gpu_ker<dim,T,cnt_type,ids_type,transform,true>(cell_nn.toKernel(),
 																		cell_nn_list.toKernel(),
@@ -86,7 +96,11 @@ struct CellList_gpu_ker_selector<dim,T,cnt_type,ids_type,Memory,transform,vector
 																		spacing_c,
 																		div_c,
 																		off,
-																		t,g_m);
+																		t,
+																		g_m,
+																		box_unit,
+																		gr_cell,
+																		cell_shift);
 	}
 };
 
@@ -247,13 +261,11 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 																		spacing_c,
 																		off,
 																		this->getTransform(),
-																		pl.capacity(),
 																		pl.size(),
-																		part_ids.capacity(),
 																		start,
-																		static_cast<T *>(pl.template getDeviceBuffer<0>()),
-																		static_cast<cnt_type *>(starts.template getDeviceBuffer<0>()),
-																		static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()));
+																		pl.toKernel(),
+																		starts.toKernel(),
+																		part_ids.toKernel());
 
 		// now we construct the cells
 
@@ -292,6 +304,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 		CUDA_LAUNCH((reorder_parts<decltype(pl_prp.toKernel()),
 				      decltype(pl.toKernel()),
 				      decltype(sorted_to_not_sorted.toKernel()),
+					  decltype(cells.toKernel()),
 				      cnt_type,shift_ph<0,cnt_type>>),ite,sorted_to_not_sorted.size(),
 				                                                           pl_prp.toKernel(),
 				                                                           pl_prp_out.toKernel(),
@@ -299,7 +312,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 				                                                           pl_out.toKernel(),
 				                                                           sorted_to_not_sorted.toKernel(),
 				                                                           non_sorted_to_sorted.toKernel(),
-				                                                           static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()));
+				                                                           cells.toKernel());
 
 		if (opt == cl_construct_opt::Full)
 		{
@@ -377,13 +390,11 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 																		spacing_c,
 																		off,
 																		this->getTransform(),
-																		pl.capacity(),
 																		stop,
-																		part_ids.capacity(),
 																		start,
-																		static_cast<T *>(pl.template getDeviceBuffer<0>()),
-																		static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()),
-																		static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()));
+																		pl.toKernel(),
+																		cl_n.toKernel(),
+																		part_ids.toKernel());
 
 		// now we scan
 		starts.resize(cl_n.size());
@@ -399,7 +410,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 
                 CUDA_LAUNCH((fill_cells<dim,cnt_type,ids_type,shift_ph<0,cnt_type>>),itgg,0,
                                                                                             part_ids.size(),
-                                                                                            static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()) );
+                                                                                            cells.toKernel() );
 
                 // sort
 
@@ -408,14 +419,13 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 #else
 
                 CUDA_LAUNCH((fill_cells<dim,cnt_type,ids_type,shift_ph<0,cnt_type>>),itgg,0,
-                                                                                                                                                                           div_c,
-                                                                                                                                                                           off,
-                                                                                                                                                                           part_ids.size(),
-                                                                                                                                                                           part_ids.capacity(),
-																					   start,
-                                                                                                                                                                           static_cast<cnt_type *>(starts.template getDeviceBuffer<0>()),
-                                                                                                                                                                           static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()),
-                                                                                                                                                                           static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()) );
+                                                                                    div_c,
+                                                                                    off,
+																					part_ids.size(),
+																					start,
+                                                                                    starts.toKernel(),
+																					part_ids.toKernel(),
+																					cells.toKernel() );
 
 #endif
 
@@ -431,6 +441,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 			CUDA_LAUNCH((reorder_parts<decltype(pl_prp.toKernel()),
 						  decltype(pl.toKernel()),
 						  decltype(sorted_to_not_sorted.toKernel()),
+						  decltype(cells.toKernel()),
 						  cnt_type,shift_ph<0,cnt_type>>),ite,sorted_to_not_sorted.size(),
 																			   pl_prp.toKernel(),
 																			   pl_prp_out.toKernel(),
@@ -438,7 +449,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 																			   pl_out.toKernel(),
 																			   sorted_to_not_sorted.toKernel(),
 																			   non_sorted_to_sorted.toKernel(),
-																			   static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()));
+																			   cells.toKernel());
 		}
 		else
 		{
@@ -446,6 +457,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 			CUDA_LAUNCH((reorder_parts_wprp<decltype(pl_prp.toKernel()),
 						  decltype(pl.toKernel()),
 						  decltype(sorted_to_not_sorted.toKernel()),
+						  decltype(cells.toKernel()),
 						  cnt_type,shift_ph<0,cnt_type>,prp...>),ite,sorted_to_not_sorted.size(),
 																			   pl_prp.toKernel(),
 																			   pl_prp_out.toKernel(),
@@ -453,7 +465,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 																			   pl_out.toKernel(),
 																			   sorted_to_not_sorted.toKernel(),
 																			   non_sorted_to_sorted.toKernel(),
-																			   static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()));
+																			   cells.toKernel());
 		}
 
 		if (opt == cl_construct_opt::Full)
@@ -648,14 +660,17 @@ public:
 				cells_nn,
 				cells_nn_list,
 				cl_sparse,
-		    	sorted_to_not_sorted,
-		    	sorted_domain_particles_ids,
-		    	nnc_rad,
-		        spacing_c,
-		        div_c,
-		        off,
-		        this->getTransform(),
-		        g_m);
+				sorted_to_not_sorted,
+				sorted_domain_particles_ids,
+				nnc_rad,
+				spacing_c,
+				div_c,
+				off,
+				this->getTransform(),
+				g_m,
+				this->box_unit,
+				this->gr_cell,
+				this->cell_shift);
 	}
 
 	/*! \brief Clear the structure
diff --git a/src/NN/CellList/cuda/CellList_gpu_ker.cuh b/src/NN/CellList/cuda/CellList_gpu_ker.cuh
index 004716a7..becb50c5 100644
--- a/src/NN/CellList/cuda/CellList_gpu_ker.cuh
+++ b/src/NN/CellList/cuda/CellList_gpu_ker.cuh
@@ -449,11 +449,14 @@ public:
 					 openfpm::vector_gpu_ker<aggregate<cnt_type>,memory_traits_inte> dprt,
 					 openfpm::vector_gpu_ker<aggregate<int>,memory_traits_inte> rad_cells,
 					 openfpm::array<T,dim,cnt_type> & spacing_c,
-			         openfpm::array<ids_type,dim,cnt_type> & div_c,
-			         openfpm::array<ids_type,dim,cnt_type> & off,
-			         const transform & t,
-			         unsigned int g_m)
-	:CellDecomposer_gpu_ker<dim,T,cnt_type,ids_type,transform>(spacing_c,div_c,off,t),
+					 openfpm::array<ids_type,dim,cnt_type> & div_c,
+					 openfpm::array<ids_type,dim,cnt_type> & off,
+					 const transform & t,
+					 unsigned int g_m,
+					 SpaceBox<dim,T> box_unit,
+					 grid_sm<dim,void> gr_cell,
+					 Point<dim,long int> cell_shift)
+	:CellDecomposer_gpu_ker<dim,T,cnt_type,ids_type,transform>(spacing_c,div_c,off,t,box_unit,gr_cell,cell_shift),
 	 starts(starts),srt(srt),dprt(dprt),rad_cells(rad_cells),g_m(g_m)
 	{
 	}
@@ -616,12 +619,16 @@ public:
 					 openfpm::vector_gpu_ker<aggregate<cnt_type>,memory_traits_inte> srt,
 					 openfpm::vector_gpu_ker<aggregate<cnt_type>,memory_traits_inte> dprt,
 					 openfpm::array<T,dim,cnt_type> & spacing_c,
-			         openfpm::array<ids_type,dim,cnt_type> & div_c,
-			         openfpm::array<ids_type,dim,cnt_type> & off,
-			         const transform & t,
-			         unsigned int g_m)
-	:CellDecomposer_gpu_ker<dim,T,cnt_type,ids_type,transform>(spacing_c,div_c,off,t),cell_nn(cell_nn),cell_nn_list(cell_nn_list),srt(srt),dprt(dprt),
-	 cl_sparse(cl_sparse),g_m(g_m)
+					 openfpm::array<ids_type,dim,cnt_type> & div_c,
+					 openfpm::array<ids_type,dim,cnt_type> & off,
+					 const transform & t,
+					 unsigned int g_m,
+					 SpaceBox<dim,T> box_unit,
+					 grid_sm<dim,void> gr_cell,
+					 Point<dim,long int> cell_shift)
+
+	:CellDecomposer_gpu_ker<dim,T,cnt_type,ids_type,transform>(spacing_c,div_c,off,t,box_unit,gr_cell,cell_shift),
+	cell_nn(cell_nn),cell_nn_list(cell_nn_list),srt(srt),dprt(dprt),cl_sparse(cl_sparse),g_m(g_m)
 	{
 	}
 
diff --git a/src/NN/CellList/cuda/Cuda_cell_list_util_func.hpp b/src/NN/CellList/cuda/Cuda_cell_list_util_func.hpp
index 0d7c86a9..e50335e7 100644
--- a/src/NN/CellList/cuda/Cuda_cell_list_util_func.hpp
+++ b/src/NN/CellList/cuda/Cuda_cell_list_util_func.hpp
@@ -258,18 +258,17 @@ __device__ __host__ cnt_type encode_phase_id(cnt_type ph_id,cnt_type pid)
 #ifdef __NVCC__
 
 template<bool is_sparse,unsigned int dim, typename pos_type,
-         typename cnt_type, typename ids_type, typename transform>
+         typename cnt_type, typename ids_type, typename transform,
+		 typename vector_pos_type, typename vector_cnt_type, typename vector_pids_type>
 __global__ void subindex(openfpm::array<ids_type,dim,cnt_type> div,
 						 openfpm::array<pos_type,dim,cnt_type> spacing,
 						 openfpm::array<ids_type,dim,cnt_type> off,
 						 transform t,
-						 int n_cap,
 						 int n_part,
-						 int n_cap2,
 						 cnt_type start,
-						 pos_type * p_pos,
-						 cnt_type *counts,
-						 cnt_type * p_ids)
+						 vector_pos_type p_pos,
+						 vector_cnt_type counts,
+						 vector_pids_type p_ids)
 {
     cnt_type i, cid, ins;
     ids_type e[dim+1];
@@ -281,23 +280,23 @@ __global__ void subindex(openfpm::array<ids_type,dim,cnt_type> div,
     pos_type p[dim];
 
     for (size_t k = 0 ; k < dim ; k++)
-    {p[k] = p_pos[i+k*n_cap];}
+    {p[k] = p_pos.template get<0>(i)[k];}
 
     cid = cid_<dim,cnt_type,ids_type,transform>::get_cid(div,spacing,off,t,p,e);
 
     if (is_sparse == false)
     {
-    	e[dim] = atomicAdd(counts + cid, 1);
+    	e[dim] = atomicAdd(&counts.template get<0>(cid), 1);
 
-    	p_ids[ins] = cid;
-        {p_ids[ins+1*(n_cap2)] = e[dim];}
+    	p_ids.template get<0>(ins)[0] = cid;
+        {p_ids.template get<0>(ins)[1] = e[dim];}
     }
     else
     {
-        p_ids[ins] = cid;
-        {p_ids[ins+1*(n_cap2)] = e[dim];}
+        p_ids.template get<0>(ins)[0] = cid;
+        {p_ids.template get<0>(ins)[1] = e[dim];}
 
-        counts[ins] = cid;
+        counts.template get<0>(ins) = cid;
     }
 }
 
@@ -353,43 +352,42 @@ __global__ void subindex_without_count(openfpm::array<ids_type,dim,cnt_type> div
 
 #ifdef MAKE_CELLLIST_DETERMINISTIC
 
-template<unsigned int dim, typename cnt_type, typename ids_type, typename ph>
+template<unsigned int dim, typename cnt_type, typename ids_type, typename ph, typename vector_cells_type>
 __global__ void fill_cells(cnt_type phase_id ,
 						   cnt_type n,
-		                   cnt_type *cells)
+		                   vector_cells_type cells)
 {
     cnt_type i;
 
     i = threadIdx.x + blockIdx.x * blockDim.x;
     if (i >= n) return;
 
-    cells[i] = encode_phase_id<cnt_type,ph>(phase_id,i);
+    cells.template get<0>(i) = encode_phase_id<cnt_type,ph>(phase_id,i);
 }
 
 #else
 
-template<unsigned int dim, typename cnt_type, typename ids_type, typename ph>
+template<unsigned int dim, typename cnt_type, typename ids_type, typename ph,typename vector_starts_type, typename vector_pids_type, typename vector_cells_type>
 __global__ void fill_cells(cnt_type phase_id ,
 		                   openfpm::array<ids_type,dim,cnt_type> div_c,
 		                   openfpm::array<ids_type,dim,cnt_type> off,
 		                   cnt_type n,
-		                   cnt_type n_cap,
 		                   cnt_type start_p,
-		                   const cnt_type *starts,
-		                   const cnt_type * p_ids,
-		                   cnt_type *cells)
+		                   vector_starts_type starts,
+		                   vector_pids_type p_ids,
+		                   vector_cells_type cells)
 {
     cnt_type i, cid, id, start;
 
     i = threadIdx.x + blockIdx.x * blockDim.x;
     if (i >= n) return;
 
-    cid = p_ids[i];
+    cid = p_ids.template get<0>(i)[0];
 
-    start = starts[cid];
-    id = start + p_ids[1*n_cap+i];
+    start = starts.template get<0>(cid);
+    id = start + p_ids.template get<0>(i)[1];
 
-    cells[id] = encode_phase_id<cnt_type,ph>(phase_id,i + start_p);
+    cells.template get<0>(id) = encode_phase_id<cnt_type,ph>(phase_id,i + start_p);
 }
 
 #endif
@@ -424,7 +422,7 @@ __device__ inline void reorder_wprp(const vector_prp & input,
 	output.template set<prp ...>(dst_id,input,src_id);
 }
 
-template <typename vector_prp, typename vector_pos, typename vector_ns, typename cnt_type, typename sh>
+template <typename vector_prp, typename vector_pos, typename vector_ns, typename vector_cells_type, typename cnt_type, typename sh>
 __global__ void reorder_parts(int n,
 		                      const vector_prp input,
 		                      vector_prp output,
@@ -432,12 +430,12 @@ __global__ void reorder_parts(int n,
 		                      vector_pos output_pos,
 		                      vector_ns sorted_non_sorted,
 		                      vector_ns non_sorted_to_sorted,
-		                      const cnt_type * cells)
+		                      const vector_cells_type cells)
 {
     cnt_type i = threadIdx.x + blockIdx.x * blockDim.x;
     if (i >= n) return;
 
-    cnt_type code = cells[i];
+    cnt_type code = cells.template get<0>(i);
 
     reorder(input, output, code,i);
     reorder(input_pos,output_pos,code,i);
@@ -446,7 +444,7 @@ __global__ void reorder_parts(int n,
     non_sorted_to_sorted.template get<0>(code) = i;
 }
 
-template <typename vector_prp, typename vector_pos, typename vector_ns, typename cnt_type, typename sh, unsigned int ... prp>
+template <typename vector_prp, typename vector_pos, typename vector_ns, typename vector_cells_type, typename cnt_type, typename sh, unsigned int ... prp>
 __global__ void reorder_parts_wprp(int n,
 		                      const vector_prp input,
 		                      vector_prp output,
@@ -454,12 +452,12 @@ __global__ void reorder_parts_wprp(int n,
 		                      vector_pos output_pos,
 		                      vector_ns sorted_non_sorted,
 		                      vector_ns non_sorted_to_sorted,
-		                      const cnt_type * cells)
+		                      const vector_cells_type cells)
 {
     cnt_type i = threadIdx.x + blockIdx.x * blockDim.x;
     if (i >= n) return;
 
-    cnt_type code = cells[i];
+    cnt_type code = cells.template get<0>(i);
     reorder_wprp<vector_prp,cnt_type,prp...>(input, output, code,i);
     reorder(input_pos,output_pos,code,i);
 
diff --git a/src/Space/Shape/Point.hpp b/src/Space/Shape/Point.hpp
index 91f62252..6e49d0e9 100644
--- a/src/Space/Shape/Point.hpp
+++ b/src/Space/Shape/Point.hpp
@@ -416,7 +416,7 @@ template<unsigned int dim ,typename T> class Point
 	 * \return the reference
 	 *
 	 */
-	T & value(size_t i)
+	__device__ __host__ T & value(size_t i)
 	{
 		return get(i);
 	}
diff --git a/src/SparseGridGpu/performance/performancePlots.cpp b/src/SparseGridGpu/performance/performancePlots.cpp
index 01a70924..3cd3b0e4 100644
--- a/src/SparseGridGpu/performance/performancePlots.cpp
+++ b/src/SparseGridGpu/performance/performancePlots.cpp
@@ -45,6 +45,8 @@ void write_test_report(report_sparse_grid_tests &report_sparsegrid_funcs, std::s
     plotGetSingle2D(report_sparsegrid_funcs, testSet, plotCounter);
     plotGetNeighbourhood2D(report_sparsegrid_funcs, testSet, plotCounter);
 
+
+
     std::string file_xml_results(test_dir);
     file_xml_results += std::string("/") + std::string(perfResultsXmlFile);
 
@@ -310,7 +312,7 @@ void plotDense3D(report_sparse_grid_tests &report_sparsegrid_funcs, std::set<std
                                            base + ".gridScaling(#).GFlops.mean");
         report_sparsegrid_funcs.graphs.add("graphs.graph(" + std::to_string(plotCounter) + ").x.data(0).source",
                                            base + ".gridScaling(#).gridSize.x");
-   report_sparsegrid_funcs.graphs.add("graphs.graph(" + std::to_string(plotCounter) + ").options.log_x", true);
+        report_sparsegrid_funcs.graphs.add("graphs.graph(" + std::to_string(plotCounter) + ").options.log_x", true);
         int bes = static_cast<int>( report_sparsegrid_funcs.graphs.template get<double>(
                 base + ".gridScaling(0).blockSize"));
         report_sparsegrid_funcs.graphs.add("graphs.graph(" + std::to_string(plotCounter) + ").y.data(0).title",
@@ -330,7 +332,7 @@ void plotDense3D(report_sparse_grid_tests &report_sparsegrid_funcs, std::set<std
                                            base + ".blockScaling(#).GFlops.mean");
         report_sparsegrid_funcs.graphs.add("graphs.graph(" + std::to_string(plotCounter) + ").x.data(0).source",
                                            base + ".blockScaling(#).blockSize");
-   report_sparsegrid_funcs.graphs.add("graphs.graph(" + std::to_string(plotCounter) + ").options.log_x",true);
+        report_sparsegrid_funcs.graphs.add("graphs.graph(" + std::to_string(plotCounter) + ").options.log_x",true);
         int ges = static_cast<int>( report_sparsegrid_funcs.graphs.template get<double>(
                 base + ".blockScaling(0).gridSize.x"));
         report_sparsegrid_funcs.graphs.add("graphs.graph(" + std::to_string(plotCounter) + ").y.data(0).title",
diff --git a/src/Vector/cuda/map_vector_cuda_ker.cuh b/src/Vector/cuda/map_vector_cuda_ker.cuh
index ea202a7f..fef74515 100644
--- a/src/Vector/cuda/map_vector_cuda_ker.cuh
+++ b/src/Vector/cuda/map_vector_cuda_ker.cuh
@@ -172,6 +172,11 @@ namespace openfpm
 			return v_size;
 		}
 
+        __host__ __device__ size_t size_local() const
+        {
+            return size();
+        }
+
 		/*! \brief return the maximum capacity of the vector before reallocation
 		 *
 		 * \return the capacity of the vector
@@ -204,6 +209,38 @@ namespace openfpm
 
 			return base.template get<p>(key);
 		}
+        /*! \brief Get an element of the vector
+		 *
+		 * Get an element of the vector
+		 *
+		 * \tparam p Property to get
+		 * \param id Element to get
+		 *
+		 * \return the element value requested
+		 *
+		 */
+        template <unsigned int p>
+        __device__ __host__ inline auto getProp(unsigned int id) const -> decltype(base.template get<p>(grid_key_dx<1>(0)))
+        {
+            return this->template get<p>(id);
+        }
+
+
+        /*! \brief Get an element of the vector
+		 *
+		 * Get an element of the vector
+		 *
+		 * \tparam p Property to get
+		 * \param id Element to get
+		 *
+		 * \return the element value requested
+		 *
+		 */
+        template <unsigned int p, typename key_type>
+        __device__ __host__ inline auto getProp(key_type id) const -> decltype(base.template get<p>(grid_key_dx<1>(0)))
+        {
+            return this->template get<p>(id.getKey());
+        }
 
 		/*! \brief Get an element of the vector
 		 *
@@ -488,7 +525,22 @@ namespace openfpm
 
 			return base.getGPUIterator(start,stop,n_thr);
 		}
-
+        /*! \brief Get a domain iterator for the GPU
+         *
+         *
+         */
+        ite_gpu<1> getDomainIteratorGPU(size_t n_thr = default_kernel_wg_threads_) const
+        {
+            return getGPUIterator(n_thr);
+        }
+
+        // no-op stub required so this object can appear in vector expressions
+        void init() const {}
+
+        __host__ __device__ auto value(unsigned int p) -> decltype(base.template get<0>(grid_key_dx<1>(0)))
+        {
+            return get<0>(p);
+        }
 		/*! \brief Get an iterator for the GPU
 		 *
 		 *
@@ -501,6 +553,7 @@ namespace openfpm
 			return base.getGPUIterator(start,stop_,n_thr);
 		}
 
+
 		/*! \brief operator= this operator absorb the pointers, consider that this object wrap device pointers
 		 *
 		 * \param object to copy
@@ -514,6 +567,16 @@ namespace openfpm
 			return *this;
 		}
 
+        __device__ __host__ vector_gpu_ker<T,layout_base> & getVector()
+        {
+            return *this;
+        }
+
+        __device__ __host__ const vector_gpu_ker<T,layout_base> & getVector() const
+        {
+            return *this;
+        }
+
 		/*! \brief Return the base
 		 *
 		 * \return the base
@@ -596,6 +659,11 @@ namespace openfpm
 			return vref.size();
 		}
 
+        __host__ __device__ size_t size_local() const
+        {
+            return size();
+        }
+
 		__device__ __host__ unsigned int capacity() const
 		{
 			return vref.capacity;
@@ -692,6 +760,16 @@ namespace openfpm
 			return vref.getGPUItertatorTo(stop,n_thr);
 		}
 
+        vector_gpu_ker<T,layout_base> & getVector()
+        {
+         return *this;
+        }
+
+        const vector_gpu_ker<T,layout_base> & getVector() const
+        {
+            return *this;
+        }
+
 		__host__ vector_gpu_ker_ref<T,layout_base> & operator=(const vector_gpu_ker<T,layout_base> & v)
 		{
 			vref.operator=(v);
diff --git a/src/Vector/cuda/map_vector_std_cuda.hpp b/src/Vector/cuda/map_vector_std_cuda.hpp
index 319add7b..d384a51f 100644
--- a/src/Vector/cuda/map_vector_std_cuda.hpp
+++ b/src/Vector/cuda/map_vector_std_cuda.hpp
@@ -277,8 +277,11 @@ public:
 	 *
 	 */
 	vector(const std::initializer_list<T> & v)
-	:base(v)
+	// :base(v)
 	{
+		// passing the initializer_list directly to base causes an error, so add the elements one by one instead
+		for (T t: v)
+			add(t);
 	}
 
 	//! Constructor from another vector
@@ -288,6 +291,16 @@ public:
 		base.swap(v.base);
 	}
 
+	//! Constructor from iterators
+	template< class InputIt >
+	vector(InputIt first, InputIt last)
+	{
+		while(first != last) {
+			add(*first);
+			++first;
+		}
+	}
+
 	//! destructor
 	~vector() noexcept
 	{
diff --git a/src/Vector/cuda/map_vector_std_cuda_ker.cuh b/src/Vector/cuda/map_vector_std_cuda_ker.cuh
index cb1b054f..ea583826 100644
--- a/src/Vector/cuda/map_vector_std_cuda_ker.cuh
+++ b/src/Vector/cuda/map_vector_std_cuda_ker.cuh
@@ -126,6 +126,27 @@ public:
 		return base.template get<0>(id);
 	}
 
+	/*! \brief Return the pointer to the chunk of memory
+	 *
+	 * \return the pointer to the chunk of memory
+	 *
+	 */
+	__device__ __host__ void * getPointer()
+	{
+		return &base.template get<0>(0);
+	}
+
+	/*! \brief Return the pointer to the chunk of memory
+	 *
+	 * \return the pointer to the chunk of memory
+	 *
+	 */
+	__device__ __host__ const void * getPointer() const
+	{
+		return &base.template get<0>(0);
+	}
+
+
 	vector_custd_ker()
 	{}
 
diff --git a/src/Vector/map_vector.hpp b/src/Vector/map_vector.hpp
index 953fdeab..078ffd17 100644
--- a/src/Vector/map_vector.hpp
+++ b/src/Vector/map_vector.hpp
@@ -336,7 +336,7 @@ namespace openfpm
         *
         * \return the size
         *
-        */
+        */ //remove host device
         size_t size_local() const
         {
             return v_size;
@@ -1359,7 +1359,21 @@ namespace openfpm
 
 			return base.get_o(key);
 		}
-
+        /*! \brief Get an element of the vector
+         *
+         * Get an element of the vector
+         *
+         * \tparam p Property to get
+         * \param id Element to get
+         *
+         * \return the element value requested
+         *
+         */ //remove device host
+        template <unsigned int p>
+        inline auto getProp(const unsigned int & id) -> decltype(base.template get<p>(grid_key_dx<1>(0)))
+        {   //uncomment this
+            return this->template get<p>(id);
+        }
 		/*! \brief Get an element of the vector
 		 *
 		 * Get an element of the vector
@@ -1369,12 +1383,12 @@ namespace openfpm
 		 *
 		 * \return the element value requested
 		 *
-		 */
-
-		template <unsigned int p,typename KeyType>
-		inline auto getProp(const KeyType & id) -> decltype(base.template get<p>(grid_key_dx<1>(0)))
+		 *///remove host device
+        template <unsigned int p,typename KeyType>
+        inline auto getProp(const KeyType & id) -> decltype(base.template get<p>(grid_key_dx<1>(0)))
 		{
-			return this->template get<p>(id.getKey());
+            //uncomment this
+            return this->template get<p>(id.getKey());
 		}
 
 		/*! \brief Get an element of the vector
@@ -1386,10 +1400,10 @@ namespace openfpm
 		 *
 		 * \return the element value requested
 		 *
-		 */
+		 */ //remove device host
 		template <unsigned int p, typename keyType>
-		inline auto getProp(const keyType & id) const -> decltype(base.template get<p>(grid_key_dx<1>(0)))
-		{
+         inline auto getProp(const keyType & id) const -> decltype(base.template get<p>(grid_key_dx<1>(0)))
+		{   //uncomment this
 			return this->template get<p>(id.getKey());
 		}
 
@@ -1930,6 +1944,7 @@ namespace openfpm
 			return base.getGPUIterator(start,stop_,n_thr);
 		}
 
+
 #endif
 
 		/*! \brief Get the vector elements iterator
@@ -1987,6 +2002,14 @@ namespace openfpm
 
 			return base.getGPUIterator(start,stop,n_thr);
 		}
+        /*! \brief Get a domain iterator for the GPU
+         *
+         *
+         */
+        ite_gpu<1> getDomainIteratorGPU(size_t n_thr = default_kernel_wg_threads_) const
+        {
+            return getGPUIterator(n_thr);
+        }
 
 #endif
 		/*! \brief Return the size of the message needed to pack this object
diff --git a/src/Vector/map_vector_std.hpp b/src/Vector/map_vector_std.hpp
index 2cf717e2..ae0c08f1 100644
--- a/src/Vector/map_vector_std.hpp
+++ b/src/Vector/map_vector_std.hpp
@@ -644,6 +644,16 @@ public:
 		base.swap(v.base);
 	}
 
+	//! Constructor from iterators
+	template< class InputIt >
+	vector(InputIt first, InputIt last)
+	{
+		while(first != last) {
+			add(*first);
+			++first;
+		}
+	}
+
 	//! destructor
 	~vector() noexcept
 	{
diff --git a/src/Vector/vector_subset.hpp b/src/Vector/vector_subset.hpp
new file mode 100644
index 00000000..d6d78b69
--- /dev/null
+++ b/src/Vector/vector_subset.hpp
@@ -0,0 +1,67 @@
+#ifndef VECTOR_SUBSET_HPP
+#define VECTOR_SUBSET_HPP
+
+namespace openfpm
+{
+
+    template<typename prop,
+             template<typename> class layout_base = memory_traits_inte>
+    class vector_subset_ker
+    {
+        //! kernel-side view of the full vector
+        mutable openfpm::vector_gpu_ker<typename apply_transform<layout_base,prop>::type,layout_base> v_all;
+
+        //! indices of the elements that belong to the subset
+        mutable openfpm::vector_gpu_ker<aggregate<int>,layout_base> indexes;
+
+        public:
+
+        vector_subset_ker(openfpm::vector_gpu_ker<typename apply_transform<layout_base,prop>::type,layout_base> v_all,
+                          openfpm::vector_gpu_ker<aggregate<int>,layout_base> indexes)
+        :v_all(v_all),indexes(indexes)
+        {}
+
+        //! get property p of the id-th element of the subset (indirected through indexes)
+        template <unsigned int p>
+        __device__ __host__ inline auto get(size_t id) const -> decltype(v_all.template get<p>(0))
+        {
+            return v_all.template get<p>(indexes.template get<0>(id));
+        }
+    };
+
+    /*! \brief Host-side wrapper exposing a subset of an openfpm vector through a list of indices
+     *
+     */
+    template<typename T, typename Memory, template<typename> class layout_base, typename grow_p, unsigned int impl>
+    class vector_sub
+    {
+        //! the full vector
+        vector<T,Memory,layout_base,grow_p,impl> & v_all;
+
+        //! indices of the elements that belong to the subset
+        vector<aggregate<int>,Memory,layout_base,grow_p,impl> & indexes;
+
+        public:
+
+            vector_sub(vector<T,Memory,layout_base,grow_p,impl> & v_all,
+                       vector<aggregate<int>,Memory,layout_base,grow_p,impl> & indexes)
+            :v_all(v_all),indexes(indexes)
+            {}
+
+            //! return a view of the subset usable inside a GPU kernel
+            vector_subset_ker<T,layout_base> toKernel()
+            {
+                return vector_subset_ker<T,layout_base>(v_all.toKernel(), indexes.toKernel());
+            }
+
+            //! get property p of the id-th element of the subset (indirected through indexes)
+            template <unsigned int p>
+            inline auto get(size_t id) const -> decltype(v_all.template get<p>(0))
+            {
+                return v_all.template get<p>(indexes.template get<0>(id));
+            }
+    };
+
+}
+
+#endif /* VECTOR_SUBSET_HPP */
diff --git a/src/util/mathutil.hpp b/src/util/mathutil.hpp
index 1b05785a..e813f30b 100644
--- a/src/util/mathutil.hpp
+++ b/src/util/mathutil.hpp
@@ -19,7 +19,7 @@ namespace openfpm
 		 *
 		 */
 		template <typename T>
-		int sgn(T val)
+		__host__ __device__ int sgn(T val)
 		{
 			return (T(0) < val) - (val < T(0));
 		}
@@ -35,7 +35,8 @@ namespace openfpm
 		 * \return the factorial
 		 *
 		 */
-		static inline size_t factorial(size_t f)
+		__host__ __device__ static inline
+		size_t factorial(size_t f)
 		{
 			size_t fc = 1;
 
@@ -60,7 +61,8 @@ namespace openfpm
 		 * \param k
 		 *
 		 */
-		static inline size_t C(size_t n, size_t k)
+		__host__ __device__ static inline
+		size_t C(size_t n, size_t k)
 		{
 			return factorial(n)/(factorial(k)*factorial(n-k));
 		}
@@ -75,7 +77,8 @@ namespace openfpm
 		 *
 		 *
 		 */
-		static inline size_t round_big_2(size_t n)
+		__host__ __device__ static inline
+		size_t round_big_2(size_t n)
 		{
 			n--;
 			n |= n >> 1;   // Divide by 2^k for consecutive doublings of k up to 32,
@@ -102,14 +105,15 @@ namespace openfpm
 		 */
 
 		template<class T>
-		inline constexpr size_t pow(const T base, unsigned const exponent)
+		__host__ __device__ inline constexpr
+		size_t pow(const T base, unsigned const exponent)
 		{
 			// (parentheses not required in next line)
 			return (exponent == 0) ? 1 : (base * pow(base, exponent-1));
 		}
 
         template<class T>
-        double intpowlog(const T x, unsigned const e)
+        __host__ __device__ double intpowlog(const T x, unsigned const e)
         {
             if (e == 0) return 1.0;
             if (e % 2 == 0)
@@ -135,7 +139,8 @@ namespace openfpm
 		 * \param n modulo
 		 *
 		 */
-		static inline long int positive_modulo(long int i, long int n)
+		__host__ __device__ static inline
+		long int positive_modulo(long int i, long int n)
 		{
 		    return (i % n + n) % n;
 		}
@@ -155,7 +160,9 @@ namespace openfpm
 		 * \return the bound number
 		 *
 		 */
-		template<typename T> static inline T periodic(const T & pos, const T & p2, const T & p1)
+		template<typename T>
+		__host__ __device__ static inline
+		T periodic(const T & pos, const T & p2, const T & p1)
 		{
 			T pos_tmp;
 
@@ -182,7 +189,9 @@ namespace openfpm
 		 * \return the bound number
 		 *
 		 */
-		template<typename T> __device__ __host__ static inline T periodic_l(const T & pos, const T & p2, const T & p1)
+		template<typename T>
+		__device__ __host__ static inline
+		T periodic_l(const T & pos, const T & p2, const T & p1)
 		{
 			T pos_tmp = pos;
 
@@ -207,7 +216,7 @@ namespace openfpm
 		 *
 		 *
 		 */
-		inline long int size_t_floor(double x)
+		__host__ __device__ inline long int size_t_floor(double x)
 		{
 		  size_t i = (long int)x; /* truncate */
 		  return i - ( i > x ); /* convert trunc to floor */
@@ -217,7 +226,7 @@ namespace openfpm
 		 *
 		 *
 		 */
-		inline long int size_t_floor(float x)
+		__host__ __device__ inline long int size_t_floor(float x)
 		{
 		  size_t i = (long int)x; /* truncate */
 		  return i - ( i > x ); /* convert trunc to floor */
@@ -258,7 +267,7 @@ namespace openfpm
 		 * \param value
 		 *
 		 */
-		inline int log2_64 (uint64_t value)
+		__host__ __device__ inline int log2_64 (uint64_t value)
 		{
 		    value |= value >> 1;
 		    value |= value >> 2;
@@ -276,7 +285,7 @@ namespace openfpm
 		 *
 		 *
 		 */
-		inline long int size_t_floor(boost::multiprecision::float128 x)
+		__host__ __device__ inline long int size_t_floor(boost::multiprecision::float128 x)
 		{
 		  size_t i = (long int)x; /* truncate */
 		  return i - ( i > x ); /* convert trunc to floor */
-- 
GitLab