From c1c3e0c4e8586bf5061dbabbfa48dd0c2dc86960 Mon Sep 17 00:00:00 2001
From: Serhii Yaskovets <yaskovet@mpi-cbg.de>
Date: Tue, 11 Apr 2023 14:54:52 +0200
Subject: [PATCH] Revert "Merge remote-tracking branch 'origin/FD_solver'"

This reverts commit 94125e792a7edc2d3927453d6563a66342ed5ebc, reversing
changes made to f61deb2e21ac22abfb96d75e6874a8842bbb38e8.
---
 src/Grid/grid_base_implementation.hpp         |  2 +-
 src/Grid/grid_key.hpp                         |  2 +-
 src/Grid/grid_sm.hpp                          | 27 ++----
 src/NN/CellList/CellList_gpu_test.cu          | 30 +++---
 .../CellList/cuda/CellDecomposer_gpu_ker.cuh  | 95 ++-----------------
 src/NN/CellList/cuda/CellList_gpu.hpp         | 89 ++++++++---------
 src/NN/CellList/cuda/CellList_gpu_ker.cuh     | 29 +++---
 .../cuda/Cuda_cell_list_util_func.hpp         | 60 ++++++------
 src/Space/Shape/Point.hpp                     |  2 +-
 .../performance/performancePlots.cpp          |  6 +-
 src/Vector/cuda/map_vector_cuda_ker.cuh       | 80 +---------------
 src/Vector/cuda/map_vector_std_cuda.hpp       | 15 +--
 src/Vector/cuda/map_vector_std_cuda_ker.cuh   | 21 ----
 src/Vector/map_vector.hpp                     | 43 ++-------
 src/Vector/map_vector_std.hpp                 | 10 --
 src/Vector/vector_subset.hpp                  | 67 -------------
 src/util/mathutil.hpp                         | 35 +++----
 17 files changed, 140 insertions(+), 473 deletions(-)
 delete mode 100644 src/Vector/vector_subset.hpp

diff --git a/src/Grid/grid_base_implementation.hpp b/src/Grid/grid_base_implementation.hpp
index 7a1a6693..60392952 100644
--- a/src/Grid/grid_base_implementation.hpp
+++ b/src/Grid/grid_base_implementation.hpp
@@ -752,7 +752,7 @@ public:
 		{
 			auto key = it.get();
 
-			if (this->get_o(key) != g.get_o(key))
+			if (this->get_o(key) != this->get_o(key))
 				return false;
 
 			++it;
diff --git a/src/Grid/grid_key.hpp b/src/Grid/grid_key.hpp
index 90fab130..106f7ee5 100644
--- a/src/Grid/grid_key.hpp
+++ b/src/Grid/grid_key.hpp
@@ -397,7 +397,7 @@ public:
 	 *
 	 */
 	template<typename a, typename ...T>
-	__device__ __host__ inline void set(a v, T...t)
+	inline void set(a v, T...t)
 	{
 #ifdef SE_CLASS1
 		if (sizeof...(t) != dim -1)
diff --git a/src/Grid/grid_sm.hpp b/src/Grid/grid_sm.hpp
index 651608b2..e4511d29 100644
--- a/src/Grid/grid_sm.hpp
+++ b/src/Grid/grid_sm.hpp
@@ -186,7 +186,7 @@ class grid_sm
 	 *
 	 */
 
-	__device__ __host__ inline void Initialize(const size_t sz)
+	inline void Initialize(const size_t sz)
 	{
 		//! Initialize the basic structure for each dimension
 		sz_s[0] = sz;
@@ -217,7 +217,7 @@ class grid_sm
 	 *
 	 */
 
-	__device__ __host__ inline void Initialize(const size_t (& sz)[N])
+	inline void Initialize(const size_t (& sz)[N])
 	{
 		//! Initialize the basic structure for each dimension
 		sz_s[0] = sz[0];
@@ -353,8 +353,7 @@ public:
 	 *
 	 */
 
-	template<typename S>
-	__device__ __host__ inline grid_sm(const grid_sm<N,S> & g)
+	template<typename S> inline grid_sm(const grid_sm<N,S> & g)
 	{
 		size_tot = g.size_tot;
 
@@ -381,7 +380,7 @@ public:
 
 	// Static element to calculate total size
 
-	__device__ __host__ inline size_t totalSize(const size_t (& sz)[N])
+	inline size_t totalSize(const size_t (& sz)[N])
 	{
 		size_t tSz = 1;
 
@@ -416,7 +415,7 @@ public:
 	 *
 	 */
 
-	__device__ __host__ inline grid_sm(const size_t (& sz)[N])
+	inline grid_sm(const size_t (& sz)[N])
 	: size_tot(totalSize(sz))
 	{
 		Initialize(sz);
@@ -626,7 +625,7 @@ public:
 	}
 
 	//! Destructor
-	__device__ __host__ ~grid_sm() {};
+	~grid_sm() {};
 
 	/*! \brief Return the size of the grid
 	 *
@@ -714,7 +713,7 @@ public:
 	 *
 	 */
 
-	__device__ __host__ inline size_t size_s(unsigned int i) const
+	inline size_t size_s(unsigned int i) const
 	{
 		return sz_s[i];
 	}
@@ -738,21 +737,11 @@ public:
 	 * \return get the size of the grid as an array
 	 *
 	 */
-	__device__ __host__ inline const size_t (& getSize() const)[N]
+	inline const size_t (& getSize() const)[N]
 	{
 		return sz;
 	}
 
-	/*! \brief Returns the size of the grid in the passed array
-	 *
-	 * \return the size of the grid in the passed array
-	 *
-	 */
-	__device__ __host__ inline void getSize(size_t (&size) [N]) const
-	{
-		for (size_t i = 0; i < N; ++i) size[i] = sz[i];
-	}
-
 	/*! \brief Return a sub-grid iterator
 	 *
 	 * Return a sub-grid iterator, to iterate through the grid
diff --git a/src/NN/CellList/CellList_gpu_test.cu b/src/NN/CellList/CellList_gpu_test.cu
index 2d19dd2a..55ad9a05 100644
--- a/src/NN/CellList/CellList_gpu_test.cu
+++ b/src/NN/CellList/CellList_gpu_test.cu
@@ -83,16 +83,17 @@ void test_sub_index()
 
 	no_transform_only<dim,T> t(mt,pt);
 
-
 	CUDA_LAUNCH_DIM3((subindex<false,dim,T,cnt_type,ids_type,no_transform_only<dim,T>>),ite.wthr,ite.thr,div,
 																	spacing,
 																	off,
 																	t,
+																	pl.capacity(),
 																	pl.size(),
+																	part_ids.capacity(),
 																	0,
-																	pl.toKernel(),
-																	cl_n.toKernel(),
-																	part_ids.toKernel());
+																	static_cast<T *>(pl.template getDeviceBuffer<0>()),
+																	static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()),
+																	static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()));
 
 	cl_n.template deviceToHost<0>();
 	part_ids.template deviceToHost<0>();
@@ -202,16 +203,17 @@ void test_sub_index2()
 
 	shift_only<dim,T> t(mt,pt);
 
-
 	CUDA_LAUNCH_DIM3((subindex<false,dim,T,cnt_type,ids_type,shift_only<dim,T>>),ite.wthr,ite.thr,div,
 																	spacing,
 																	off,
 																	t,
+																	pl.capacity(),
 																	pl.size(),
+																	part_ids.capacity(),
 																	0,
-																	pl.toKernel(),
-																	cl_n.toKernel(),
-																	part_ids.toKernel());
+																	static_cast<T *>(pl.template getDeviceBuffer<0>()),
+																	static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()),
+																	static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()));
 
 	cl_n.template deviceToHost<0>();
 	part_ids.template deviceToHost<0>();
@@ -385,10 +387,11 @@ void test_fill_cell()
 																				   div_c,
 																				   off,
 																				   part_ids.size(),
+																				   part_ids.capacity(),
 																				   0,
-																				   starts.toKernel(),
-																				   part_ids.toKernel(),
-																				   cells.toKernel() );
+																				   static_cast<cnt_type *>(starts.template getDeviceBuffer<0>()),
+																				   static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()),
+																				   static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()) );
 
 	cells.template deviceToHost<0>();
 
@@ -663,8 +666,7 @@ void test_reorder_parts(size_t n_part)
 	// Here we test fill cell
 	CUDA_LAUNCH_DIM3((reorder_parts<decltype(parts_prp.toKernel()),
 			      decltype(pl.toKernel()),
-				  decltype(sort_to_not_sort.toKernel()),
-				  decltype(cells_out.toKernel()),
+			      decltype(sort_to_not_sort.toKernel()),
 			      cnt_type,
 			      shift_ph<0,cnt_type>>),ite.wthr,ite.thr,pl.size(),
 			                                                  parts_prp.toKernel(),
@@ -673,7 +675,7 @@ void test_reorder_parts(size_t n_part)
 			                                                  pl_out.toKernel(),
 			                                                  sort_to_not_sort.toKernel(),
 			                                                  non_sort_to_sort.toKernel(),
-			                                                  cells_out.toKernel());
+			                                                  static_cast<cnt_type *>(cells_out.template getDeviceBuffer<0>()));
 
 	bool check = true;
 	parts_prp_out.template deviceToHost<0>();
diff --git a/src/NN/CellList/cuda/CellDecomposer_gpu_ker.cuh b/src/NN/CellList/cuda/CellDecomposer_gpu_ker.cuh
index 3981ce07..fc108d88 100644
--- a/src/NN/CellList/cuda/CellDecomposer_gpu_ker.cuh
+++ b/src/NN/CellList/cuda/CellDecomposer_gpu_ker.cuh
@@ -29,69 +29,28 @@ class CellDecomposer_gpu_ker
 	//! transformation
 	transform t;
 
-	//! Unit box of the Cell list
-	SpaceBox<dim,T> box_unit;
-
-	//! Grid structure of the Cell list
-	grid_sm<dim,void> gr_cell;
-
-	//! cell_shift
-	Point<dim,long int> cell_shift;
 public:
 
 	__device__ __host__ CellDecomposer_gpu_ker()
 	{}
 
-	__device__ __host__ CellDecomposer_gpu_ker(
-		openfpm::array<T,dim,cnt_type> & spacing_c,
-		openfpm::array<ids_type,dim,cnt_type> & div_c,
-		openfpm::array<ids_type,dim,cnt_type> & off,
-		const transform & t)
-	: spacing_c(spacing_c),div_c(div_c),off(off),t(t)
+	__device__ __host__ CellDecomposer_gpu_ker(openfpm::array<T,dim,cnt_type> & spacing_c,
+	         	 	 	  openfpm::array<ids_type,dim,cnt_type> & div_c,
+	         	 	 	  openfpm::array<ids_type,dim,cnt_type> & off,
+	         	 	 	  const transform & t)
+	:spacing_c(spacing_c),div_c(div_c),off(off),t(t)
 	{}
 
-	__device__ __host__ CellDecomposer_gpu_ker(
-		openfpm::array<T,dim,cnt_type> & spacing_c,
-		openfpm::array<ids_type,dim,cnt_type> & div_c,
-		openfpm::array<ids_type,dim,cnt_type> & off,
-		const transform & t,
-		SpaceBox<dim,T> box_unit,
-		grid_sm<dim,void> gr_cell,
-		Point<dim,long int> cell_shift)
-	: spacing_c(spacing_c),div_c(div_c),off(off),t(t),
-	box_unit(box_unit), gr_cell(gr_cell), cell_shift(cell_shift)
-	{}
-
-	__device__ __host__ grid_sm<dim,void> getGrid()
+	__host__ grid_sm<dim,void> getGrid()
 	{
 		size_t sz[dim];
 
-		for (size_t i = 0 ; i < dim ; i++)
-		{
-			sz[i] = div_c[i];
-		}
-
-		return grid_sm<dim,void> (sz);
-	}
-
-	__device__ __host__ void getGridSize(size_t (& sz)[dim]) const
-	{
 		for (size_t i = 0 ; i < dim ; i++)
 		{
 			sz[i] = div_c[i] + 2*off[i];
 		}
-	}
 
-	template<typename ids_type2>
-	__device__ __host__ mem_id getGridLinId(const grid_key_dx<dim,ids_type2> & gk) const
-	{
-		mem_id lid = gk.get(0);
-		for (mem_id i = 1 ; i < dim ; i++)
-		{
-			lid += gk.get(i) * (div_c[i-1] + 2*off[i-1]);
-		}
-
-		return lid;
+		return grid_sm<dim,void> (sz);
 	}
 
 	__device__ __host__ inline grid_key_dx<dim,ids_type> getCell(const Point<dim,T> & xp) const
@@ -123,46 +82,6 @@ public:
 	{
 		return t;
 	}
-
-	__device__ __host__ inline grid_key_dx<dim> getCellGrid(const T (& pos)[dim]) const
-	{
-		grid_key_dx<dim> key;
-		key.set_d(0,ConvertToID(pos,0));
-
-		for (size_t s = 1 ; s < dim ; s++)
-		{
-			key.set_d(s,ConvertToID(pos,s));
-		}
-
-		return key;
-	}
-
-	__device__ __host__ inline grid_key_dx<dim> getCellGrid(const Point<dim,T> & pos) const
-	{
-		grid_key_dx<dim> key;
-		key.set_d(0,ConvertToID(pos,0));
-
-		for (size_t s = 1 ; s < dim ; s++)
-		{
-			key.set_d(s,ConvertToID(pos,s));
-		}
-
-		return key;
-	}
-
-	__device__ __host__ inline size_t ConvertToID(const T (&x)[dim] ,size_t s) const
-	{
-		size_t id = openfpm::math::size_t_floor(t.transform(x,s) / box_unit.getHigh(s)) + off[s];
-		id = (id >= gr_cell.size(s))?(gr_cell.size(s)-1-cell_shift.get(s)):id-cell_shift.get(s);
-		return id;
-	}
-
-	__device__ __host__ inline size_t ConvertToID(const Point<dim,T> & x ,size_t s, size_t sc = 0) const
-	{
-		size_t id = openfpm::math::size_t_floor(t.transform(x,s) / box_unit.getHigh(s)) + off[s];
-		id = (id >= gr_cell.size(s))?(gr_cell.size(s)-1-cell_shift.get(s)):id-cell_shift.get(s);
-		return id;
-	}
 };
 
 #endif /* CELLDECOMPOSER_GPU_KER_HPP_ */
diff --git a/src/NN/CellList/cuda/CellList_gpu.hpp b/src/NN/CellList/cuda/CellList_gpu.hpp
index 5ee0e2b0..c7623583 100644
--- a/src/NN/CellList/cuda/CellList_gpu.hpp
+++ b/src/NN/CellList/cuda/CellList_gpu.hpp
@@ -44,10 +44,7 @@ struct CellList_gpu_ker_selector
 																			 openfpm::array<ids_type,dim,cnt_type> & div_c,
 																			 openfpm::array<ids_type,dim,cnt_type> & off,
 																			 const transform & t,
-																			 unsigned int g_m,
-																			 const SpaceBox<dim,T>& box_unit,
-																			 const grid_sm<dim,void>& gr_cell,
-																			 const Point<dim,long int>& cell_shift)
+																			 unsigned int g_m)
 	{
 		return CellList_gpu_ker<dim,T,cnt_type,ids_type,transform,is_sparse>(starts.toKernel(),
 																			sorted_to_not_sorted.toKernel(),
@@ -57,10 +54,7 @@ struct CellList_gpu_ker_selector
 																			div_c,
 																			off,
 																			t,
-																			g_m,
-																			box_unit,
-																			gr_cell,
-																			cell_shift);
+																			g_m);
 	}
 };
 
@@ -79,14 +73,10 @@ struct CellList_gpu_ker_selector<dim,T,cnt_type,ids_type,Memory,transform,vector
 			 vector_cnt_type & dprt,
 			 openfpm::vector<aggregate<int>,Memory,memory_traits_inte> & nnc_rad,
 			 openfpm::array<T,dim,cnt_type> & spacing_c,
-			 openfpm::array<ids_type,dim,cnt_type> & div_c,
-			 openfpm::array<ids_type,dim,cnt_type> & off,
-			 const transform & t,
-			 unsigned int g_m,
-			 const SpaceBox<dim,T>& box_unit,
-			 const grid_sm<dim,void>& gr_cell,
-			 const Point<dim,long int>& cell_shift)
-
+	         openfpm::array<ids_type,dim,cnt_type> & div_c,
+	         openfpm::array<ids_type,dim,cnt_type> & off,
+	         const transform & t,
+	         unsigned int g_m)
 	{
 		return CellList_gpu_ker<dim,T,cnt_type,ids_type,transform,true>(cell_nn.toKernel(),
 																		cell_nn_list.toKernel(),
@@ -96,11 +86,7 @@ struct CellList_gpu_ker_selector<dim,T,cnt_type,ids_type,Memory,transform,vector
 																		spacing_c,
 																		div_c,
 																		off,
-																		t,
-																		g_m,
-																		box_unit,
-																		gr_cell,
-																		cell_shift);
+																		t,g_m);
 	}
 };
 
@@ -261,11 +247,13 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 																		spacing_c,
 																		off,
 																		this->getTransform(),
+																		pl.capacity(),
 																		pl.size(),
+																		part_ids.capacity(),
 																		start,
-																		pl.toKernel(),
-																		starts.toKernel(),
-																		part_ids.toKernel());
+																		static_cast<T *>(pl.template getDeviceBuffer<0>()),
+																		static_cast<cnt_type *>(starts.template getDeviceBuffer<0>()),
+																		static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()));
 
 		// now we construct the cells
 
@@ -304,7 +292,6 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 		CUDA_LAUNCH((reorder_parts<decltype(pl_prp.toKernel()),
 				      decltype(pl.toKernel()),
 				      decltype(sorted_to_not_sorted.toKernel()),
-					  decltype(cells.toKernel()),
 				      cnt_type,shift_ph<0,cnt_type>>),ite,sorted_to_not_sorted.size(),
 				                                                           pl_prp.toKernel(),
 				                                                           pl_prp_out.toKernel(),
@@ -312,7 +299,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 				                                                           pl_out.toKernel(),
 				                                                           sorted_to_not_sorted.toKernel(),
 				                                                           non_sorted_to_sorted.toKernel(),
-				                                                           cells.toKernel());
+				                                                           static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()));
 
 		if (opt == cl_construct_opt::Full)
 		{
@@ -390,11 +377,13 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 																		spacing_c,
 																		off,
 																		this->getTransform(),
+																		pl.capacity(),
 																		stop,
+																		part_ids.capacity(),
 																		start,
-																		pl.toKernel(),
-																		cl_n.toKernel(),
-																		part_ids.toKernel());
+																		static_cast<T *>(pl.template getDeviceBuffer<0>()),
+																		static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()),
+																		static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()));
 
 		// now we scan
 		starts.resize(cl_n.size());
@@ -410,7 +399,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 
                 CUDA_LAUNCH((fill_cells<dim,cnt_type,ids_type,shift_ph<0,cnt_type>>),itgg,0,
                                                                                             part_ids.size(),
-                                                                                            cells.toKernel()) );
+                                                                                            static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()) );
 
                 // sort
 
@@ -419,13 +408,14 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 #else
 
                 CUDA_LAUNCH((fill_cells<dim,cnt_type,ids_type,shift_ph<0,cnt_type>>),itgg,0,
-                                                                                    div_c,
-                                                                                    off,
-																					part_ids.size(),
-																					start,
-                                                                                    starts.toKernel(),
-																					part_ids.toKernel(),
-																					cells.toKernel() );
+                                                                                                                                                                           div_c,
+                                                                                                                                                                           off,
+                                                                                                                                                                           part_ids.size(),
+                                                                                                                                                                           part_ids.capacity(),
+																					   start,
+                                                                                                                                                                           static_cast<cnt_type *>(starts.template getDeviceBuffer<0>()),
+                                                                                                                                                                           static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()),
+                                                                                                                                                                           static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()) );
 
 #endif
 
@@ -441,7 +431,6 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 			CUDA_LAUNCH((reorder_parts<decltype(pl_prp.toKernel()),
 						  decltype(pl.toKernel()),
 						  decltype(sorted_to_not_sorted.toKernel()),
-						  decltype(cells.toKernel()),
 						  cnt_type,shift_ph<0,cnt_type>>),ite,sorted_to_not_sorted.size(),
 																			   pl_prp.toKernel(),
 																			   pl_prp_out.toKernel(),
@@ -449,7 +438,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 																			   pl_out.toKernel(),
 																			   sorted_to_not_sorted.toKernel(),
 																			   non_sorted_to_sorted.toKernel(),
-																			   cells.toKernel());
+																			   static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()));
 		}
 		else
 		{
@@ -457,7 +446,6 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 			CUDA_LAUNCH((reorder_parts_wprp<decltype(pl_prp.toKernel()),
 						  decltype(pl.toKernel()),
 						  decltype(sorted_to_not_sorted.toKernel()),
-						  decltype(cells.toKernel()),
 						  cnt_type,shift_ph<0,cnt_type>,prp...>),ite,sorted_to_not_sorted.size(),
 																			   pl_prp.toKernel(),
 																			   pl_prp_out.toKernel(),
@@ -465,7 +453,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 																			   pl_out.toKernel(),
 																			   sorted_to_not_sorted.toKernel(),
 																			   non_sorted_to_sorted.toKernel(),
-																			   cells.toKernel());
+																			   static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()));
 		}
 
 		if (opt == cl_construct_opt::Full)
@@ -660,17 +648,14 @@ public:
 				cells_nn,
 				cells_nn_list,
 				cl_sparse,
-				sorted_to_not_sorted,
-				sorted_domain_particles_ids,
-				nnc_rad,
-				spacing_c,
-				div_c,
-				off,
-				this->getTransform(),
-				g_m,
-				this->box_unit,
-				this->gr_cell,
-				this->cell_shift);
+		    	sorted_to_not_sorted,
+		    	sorted_domain_particles_ids,
+		    	nnc_rad,
+		        spacing_c,
+		        div_c,
+		        off,
+		        this->getTransform(),
+		        g_m);
 	}
 
 	/*! \brief Clear the structure
diff --git a/src/NN/CellList/cuda/CellList_gpu_ker.cuh b/src/NN/CellList/cuda/CellList_gpu_ker.cuh
index becb50c5..004716a7 100644
--- a/src/NN/CellList/cuda/CellList_gpu_ker.cuh
+++ b/src/NN/CellList/cuda/CellList_gpu_ker.cuh
@@ -449,14 +449,11 @@ public:
 					 openfpm::vector_gpu_ker<aggregate<cnt_type>,memory_traits_inte> dprt,
 					 openfpm::vector_gpu_ker<aggregate<int>,memory_traits_inte> rad_cells,
 					 openfpm::array<T,dim,cnt_type> & spacing_c,
-					 openfpm::array<ids_type,dim,cnt_type> & div_c,
-					 openfpm::array<ids_type,dim,cnt_type> & off,
-					 const transform & t,
-					 unsigned int g_m,
-					 SpaceBox<dim,T> box_unit,
-					 grid_sm<dim,void> gr_cell,
-					 Point<dim,long int> cell_shift)
-	:CellDecomposer_gpu_ker<dim,T,cnt_type,ids_type,transform>(spacing_c,div_c,off,t,box_unit,gr_cell,cell_shift),
+			         openfpm::array<ids_type,dim,cnt_type> & div_c,
+			         openfpm::array<ids_type,dim,cnt_type> & off,
+			         const transform & t,
+			         unsigned int g_m)
+	:CellDecomposer_gpu_ker<dim,T,cnt_type,ids_type,transform>(spacing_c,div_c,off,t),
 	 starts(starts),srt(srt),dprt(dprt),rad_cells(rad_cells),g_m(g_m)
 	{
 	}
@@ -619,16 +616,12 @@ public:
 					 openfpm::vector_gpu_ker<aggregate<cnt_type>,memory_traits_inte> srt,
 					 openfpm::vector_gpu_ker<aggregate<cnt_type>,memory_traits_inte> dprt,
 					 openfpm::array<T,dim,cnt_type> & spacing_c,
-					 openfpm::array<ids_type,dim,cnt_type> & div_c,
-					 openfpm::array<ids_type,dim,cnt_type> & off,
-					 const transform & t,
-					 unsigned int g_m,
-					 SpaceBox<dim,T> box_unit,
-					 grid_sm<dim,void> gr_cell,
-					 Point<dim,long int> cell_shift)
-
-	:CellDecomposer_gpu_ker<dim,T,cnt_type,ids_type,transform>(spacing_c,div_c,off,t,box_unit,gr_cell,cell_shift),
-	cell_nn(cell_nn),cell_nn_list(cell_nn_list),srt(srt),dprt(dprt),cl_sparse(cl_sparse),g_m(g_m)
+			         openfpm::array<ids_type,dim,cnt_type> & div_c,
+			         openfpm::array<ids_type,dim,cnt_type> & off,
+			         const transform & t,
+			         unsigned int g_m)
+	:CellDecomposer_gpu_ker<dim,T,cnt_type,ids_type,transform>(spacing_c,div_c,off,t),cell_nn(cell_nn),cell_nn_list(cell_nn_list),srt(srt),dprt(dprt),
+	 cl_sparse(cl_sparse),g_m(g_m)
 	{
 	}
 
diff --git a/src/NN/CellList/cuda/Cuda_cell_list_util_func.hpp b/src/NN/CellList/cuda/Cuda_cell_list_util_func.hpp
index e50335e7..0d7c86a9 100644
--- a/src/NN/CellList/cuda/Cuda_cell_list_util_func.hpp
+++ b/src/NN/CellList/cuda/Cuda_cell_list_util_func.hpp
@@ -258,17 +258,18 @@ __device__ __host__ cnt_type encode_phase_id(cnt_type ph_id,cnt_type pid)
 #ifdef __NVCC__
 
 template<bool is_sparse,unsigned int dim, typename pos_type,
-         typename cnt_type, typename ids_type, typename transform,
-		 typename vector_pos_type, typename vector_cnt_type, typename vector_pids_type>
+         typename cnt_type, typename ids_type, typename transform>
 __global__ void subindex(openfpm::array<ids_type,dim,cnt_type> div,
 						 openfpm::array<pos_type,dim,cnt_type> spacing,
 						 openfpm::array<ids_type,dim,cnt_type> off,
 						 transform t,
+						 int n_cap,
 						 int n_part,
+						 int n_cap2,
 						 cnt_type start,
-						 vector_pos_type p_pos,
-						 vector_cnt_type counts,
-						 vector_pids_type p_ids)
+						 pos_type * p_pos,
+						 cnt_type *counts,
+						 cnt_type * p_ids)
 {
     cnt_type i, cid, ins;
     ids_type e[dim+1];
@@ -280,23 +281,23 @@ __global__ void subindex(openfpm::array<ids_type,dim,cnt_type> div,
     pos_type p[dim];
 
     for (size_t k = 0 ; k < dim ; k++)
-    {p[k] = p_pos.template get<0>(i)[k];}
+    {p[k] = p_pos[i+k*n_cap];}
 
     cid = cid_<dim,cnt_type,ids_type,transform>::get_cid(div,spacing,off,t,p,e);
 
     if (is_sparse == false)
     {
-    	e[dim] = atomicAdd(&counts.template get<0>(cid), 1);
+    	e[dim] = atomicAdd(counts + cid, 1);
 
-    	p_ids.template get<0>(ins)[0] = cid;
-        {p_ids.template get<0>(ins)[1] = e[dim];}
+    	p_ids[ins] = cid;
+        {p_ids[ins+1*(n_cap2)] = e[dim];}
     }
     else
     {
-        p_ids.template get<0>(ins)[0] = cid;
-        {p_ids.template get<0>(ins)[1] = e[dim];}
+        p_ids[ins] = cid;
+        {p_ids[ins+1*(n_cap2)] = e[dim];}
 
-        counts.template get<0>(ins) = cid;
+        counts[ins] = cid;
     }
 }
 
@@ -352,42 +353,43 @@ __global__ void subindex_without_count(openfpm::array<ids_type,dim,cnt_type> div
 
 #ifdef MAKE_CELLLIST_DETERMINISTIC
 
-template<unsigned int dim, typename cnt_type, typename ids_type, typename ph, typename vector_cells_type>
+template<unsigned int dim, typename cnt_type, typename ids_type, typename ph>
 __global__ void fill_cells(cnt_type phase_id ,
 						   cnt_type n,
-		                   vector_cells_type cells)
+		                   cnt_type *cells)
 {
     cnt_type i;
 
     i = threadIdx.x + blockIdx.x * blockDim.x;
     if (i >= n) return;
 
-    cells.template get<0>(i) = encode_phase_id<cnt_type,ph>(phase_id,i);
+    cells[i] = encode_phase_id<cnt_type,ph>(phase_id,i);
 }
 
 #else
 
-template<unsigned int dim, typename cnt_type, typename ids_type, typename ph,typename vector_starts_type, typename vector_pids_type, typename vector_cells_type>
+template<unsigned int dim, typename cnt_type, typename ids_type, typename ph>
 __global__ void fill_cells(cnt_type phase_id ,
 		                   openfpm::array<ids_type,dim,cnt_type> div_c,
 		                   openfpm::array<ids_type,dim,cnt_type> off,
 		                   cnt_type n,
+		                   cnt_type n_cap,
 		                   cnt_type start_p,
-		                   vector_starts_type starts,
-		                   vector_pids_type p_ids,
-		                   vector_cells_type cells)
+		                   const cnt_type *starts,
+		                   const cnt_type * p_ids,
+		                   cnt_type *cells)
 {
     cnt_type i, cid, id, start;
 
     i = threadIdx.x + blockIdx.x * blockDim.x;
     if (i >= n) return;
 
-    cid = p_ids.template get<0>(i)[0];
+    cid = p_ids[i];
 
-    start = starts.template get<0>(cid);
-    id = start + p_ids.template get<0>(i)[1];
+    start = starts[cid];
+    id = start + p_ids[1*n_cap+i];
 
-    cells.template get<0>(id) = encode_phase_id<cnt_type,ph>(phase_id,i + start_p);
+    cells[id] = encode_phase_id<cnt_type,ph>(phase_id,i + start_p);
 }
 
 #endif
@@ -422,7 +424,7 @@ __device__ inline void reorder_wprp(const vector_prp & input,
 	output.template set<prp ...>(dst_id,input,src_id);
 }
 
-template <typename vector_prp, typename vector_pos, typename vector_ns, typename vector_cells_type, typename cnt_type, typename sh>
+template <typename vector_prp, typename vector_pos, typename vector_ns, typename cnt_type, typename sh>
 __global__ void reorder_parts(int n,
 		                      const vector_prp input,
 		                      vector_prp output,
@@ -430,12 +432,12 @@ __global__ void reorder_parts(int n,
 		                      vector_pos output_pos,
 		                      vector_ns sorted_non_sorted,
 		                      vector_ns non_sorted_to_sorted,
-		                      const vector_cells_type cells)
+		                      const cnt_type * cells)
 {
     cnt_type i = threadIdx.x + blockIdx.x * blockDim.x;
     if (i >= n) return;
 
-    cnt_type code = cells.template get<0>(i);
+    cnt_type code = cells[i];
 
     reorder(input, output, code,i);
     reorder(input_pos,output_pos,code,i);
@@ -444,7 +446,7 @@ __global__ void reorder_parts(int n,
     non_sorted_to_sorted.template get<0>(code) = i;
 }
 
-template <typename vector_prp, typename vector_pos, typename vector_ns, typename vector_cells_type, typename cnt_type, typename sh, unsigned int ... prp>
+template <typename vector_prp, typename vector_pos, typename vector_ns, typename cnt_type, typename sh, unsigned int ... prp>
 __global__ void reorder_parts_wprp(int n,
 		                      const vector_prp input,
 		                      vector_prp output,
@@ -452,12 +454,12 @@ __global__ void reorder_parts_wprp(int n,
 		                      vector_pos output_pos,
 		                      vector_ns sorted_non_sorted,
 		                      vector_ns non_sorted_to_sorted,
-		                      const vector_cells_type cells)
+		                      const cnt_type * cells)
 {
     cnt_type i = threadIdx.x + blockIdx.x * blockDim.x;
     if (i >= n) return;
 
-    cnt_type code = cells.template get<0>(i);
+    cnt_type code = cells[i];
     reorder_wprp<vector_prp,cnt_type,prp...>(input, output, code,i);
     reorder(input_pos,output_pos,code,i);
 
diff --git a/src/Space/Shape/Point.hpp b/src/Space/Shape/Point.hpp
index 6e49d0e9..91f62252 100644
--- a/src/Space/Shape/Point.hpp
+++ b/src/Space/Shape/Point.hpp
@@ -416,7 +416,7 @@ template<unsigned int dim ,typename T> class Point
 	 * \return the reference
 	 *
 	 */
-	__device__ __host__ T & value(size_t i)
+	T & value(size_t i)
 	{
 		return get(i);
 	}
diff --git a/src/SparseGridGpu/performance/performancePlots.cpp b/src/SparseGridGpu/performance/performancePlots.cpp
index 3cd3b0e4..01a70924 100644
--- a/src/SparseGridGpu/performance/performancePlots.cpp
+++ b/src/SparseGridGpu/performance/performancePlots.cpp
@@ -45,8 +45,6 @@ void write_test_report(report_sparse_grid_tests &report_sparsegrid_funcs, std::s
     plotGetSingle2D(report_sparsegrid_funcs, testSet, plotCounter);
     plotGetNeighbourhood2D(report_sparsegrid_funcs, testSet, plotCounter);
 
-
-
     std::string file_xml_results(test_dir);
     file_xml_results += std::string("/") + std::string(perfResultsXmlFile);
 
@@ -312,7 +310,7 @@ void plotDense3D(report_sparse_grid_tests &report_sparsegrid_funcs, std::set<std
                                            base + ".gridScaling(#).GFlops.mean");
         report_sparsegrid_funcs.graphs.add("graphs.graph(" + std::to_string(plotCounter) + ").x.data(0).source",
                                            base + ".gridScaling(#).gridSize.x");
-        report_sparsegrid_funcs.graphs.add("graphs.graph(" + std::to_string(plotCounter) + ").options.log_x", true);
+   report_sparsegrid_funcs.graphs.add("graphs.graph(" + std::to_string(plotCounter) + ").options.log_x", true);
         int bes = static_cast<int>( report_sparsegrid_funcs.graphs.template get<double>(
                 base + ".gridScaling(0).blockSize"));
         report_sparsegrid_funcs.graphs.add("graphs.graph(" + std::to_string(plotCounter) + ").y.data(0).title",
@@ -332,7 +330,7 @@ void plotDense3D(report_sparse_grid_tests &report_sparsegrid_funcs, std::set<std
                                            base + ".blockScaling(#).GFlops.mean");
         report_sparsegrid_funcs.graphs.add("graphs.graph(" + std::to_string(plotCounter) + ").x.data(0).source",
                                            base + ".blockScaling(#).blockSize");
-        report_sparsegrid_funcs.graphs.add("graphs.graph(" + std::to_string(plotCounter) + ").options.log_x",true);
+   report_sparsegrid_funcs.graphs.add("graphs.graph(" + std::to_string(plotCounter) + ").options.log_x",true);
         int ges = static_cast<int>( report_sparsegrid_funcs.graphs.template get<double>(
                 base + ".blockScaling(0).gridSize.x"));
         report_sparsegrid_funcs.graphs.add("graphs.graph(" + std::to_string(plotCounter) + ").y.data(0).title",
diff --git a/src/Vector/cuda/map_vector_cuda_ker.cuh b/src/Vector/cuda/map_vector_cuda_ker.cuh
index fef74515..ea202a7f 100644
--- a/src/Vector/cuda/map_vector_cuda_ker.cuh
+++ b/src/Vector/cuda/map_vector_cuda_ker.cuh
@@ -172,11 +172,6 @@ namespace openfpm
 			return v_size;
 		}
 
-        __host__ __device__ size_t size_local() const
-        {
-            return size();
-        }
-
 		/*! \brief return the maximum capacity of the vector before reallocation
 		 *
 		 * \return the capacity of the vector
@@ -209,38 +204,6 @@ namespace openfpm
 
 			return base.template get<p>(key);
 		}
-        /*! \brief Get an element of the vector
-		 *
-		 * Get an element of the vector
-		 *
-		 * \tparam p Property to get
-		 * \param id Element to get
-		 *
-		 * \return the element value requested
-		 *
-		 */
-        template <unsigned int p>
-        __device__ __host__ inline auto getProp(unsigned int id) const -> decltype(base.template get<p>(grid_key_dx<1>(0)))
-        {
-            return this->get<p>(id);
-        }
-
-
-        /*! \brief Get an element of the vector
-		 *
-		 * Get an element of the vector
-		 *
-		 * \tparam p Property to get
-		 * \param id Element to get
-		 *
-		 * \return the element value requested
-		 *
-		 */
-        template <unsigned int p, typename key_type>
-        __device__ __host__ inline auto getProp(key_type id) const -> decltype(base.template get<p>(grid_key_dx<1>(0)))
-        {
-            return this->get<p>(id.getKey());
-        }
 
 		/*! \brief Get an element of the vector
 		 *
@@ -525,22 +488,7 @@ namespace openfpm
 
 			return base.getGPUIterator(start,stop,n_thr);
 		}
-        /*! \brief Get a domain iterator for the GPU
-         *
-         *
-         */
-        ite_gpu<1> getDomainIteratorGPU(size_t n_thr = default_kernel_wg_threads_) const
-        {
-            return getGPUIterator(n_thr);
-        }
-
-        //Stub for some expression
-        void init() const {}
-
-        __host__ __device__ auto value(unsigned int p) -> decltype(base.template get<0>(grid_key_dx<1>(0)))
-        {
-            return get<0>(p);
-        }
+
 		/*! \brief Get an iterator for the GPU
 		 *
 		 *
@@ -553,7 +501,6 @@ namespace openfpm
 			return base.getGPUIterator(start,stop_,n_thr);
 		}
 
-
 		/*! \brief operator= this operator absorb the pointers, consider that this object wrap device pointers
 		 *
 		 * \param object to copy
@@ -567,16 +514,6 @@ namespace openfpm
 			return *this;
 		}
 
-        __device__ __host__ vector_gpu_ker<T,layout_base> & getVector()
-        {
-            return *this;
-        }
-
-        __device__ __host__ const vector_gpu_ker<T,layout_base> & getVector() const
-        {
-            return *this;
-        }
-
 		/*! \brief Return the base
 		 *
 		 * \return the base
@@ -659,11 +596,6 @@ namespace openfpm
 			return vref.size();
 		}
 
-        __host__ __device__ size_t size_local() const
-        {
-            return size();
-        }
-
 		__device__ __host__ unsigned int capacity() const
 		{
 			return vref.capacity;
@@ -760,16 +692,6 @@ namespace openfpm
 			return vref.getGPUItertatorTo(stop,n_thr);
 		}
 
-        vector_gpu_ker<T,layout_base> & getVector()
-        {
-         return *this;
-        }
-
-        const vector_gpu_ker<T,layout_base> & getVector() const
-        {
-            return *this;
-        }
-
 		__host__ vector_gpu_ker_ref<T,layout_base> & operator=(const vector_gpu_ker<T,layout_base> & v)
 		{
 			vref.operator=(v);
diff --git a/src/Vector/cuda/map_vector_std_cuda.hpp b/src/Vector/cuda/map_vector_std_cuda.hpp
index d384a51f..319add7b 100644
--- a/src/Vector/cuda/map_vector_std_cuda.hpp
+++ b/src/Vector/cuda/map_vector_std_cuda.hpp
@@ -277,11 +277,8 @@ public:
 	 *
 	 */
 	vector(const std::initializer_list<T> & v)
-	// :base(v)
+	:base(v)
 	{
-		// causes error if simply passed to base
-		for (T t: v)
-			add(t);
 	}
 
 	//! Constructor from another vector
@@ -291,16 +288,6 @@ public:
 		base.swap(v.base);
 	}
 
-	//! Constructor from iterators
-	template< class InputIt >
-	vector(InputIt first, InputIt last)
-	{
-		while(first != last) {
-			add(*first);
-			++first;
-		}
-	}
-
 	//! destructor
 	~vector() noexcept
 	{
diff --git a/src/Vector/cuda/map_vector_std_cuda_ker.cuh b/src/Vector/cuda/map_vector_std_cuda_ker.cuh
index ea583826..cb1b054f 100644
--- a/src/Vector/cuda/map_vector_std_cuda_ker.cuh
+++ b/src/Vector/cuda/map_vector_std_cuda_ker.cuh
@@ -126,27 +126,6 @@ public:
 		return base.template get<0>(id);
 	}
 
-	/*! \brief Return the pointer to the chunk of memory
-	 *
-	 * \return the pointer to the chunk of memory
-	 *
-	 */
-	__device__ __host__ void * getPointer()
-	{
-		return &base.template get<0>(0);
-	}
-
-	/*! \brief Return the pointer to the chunk of memory
-	 *
-	 * \return the pointer to the chunk of memory
-	 *
-	 */
-	__device__ __host__ const void * getPointer() const
-	{
-		return &base.template get<0>(0);
-	}
-
-
 	vector_custd_ker()
 	{}
 
diff --git a/src/Vector/map_vector.hpp b/src/Vector/map_vector.hpp
index 078ffd17..953fdeab 100644
--- a/src/Vector/map_vector.hpp
+++ b/src/Vector/map_vector.hpp
@@ -336,7 +336,7 @@ namespace openfpm
         *
         * \return the size
         *
-        */ //remove host device
+        */
         size_t size_local() const
         {
             return v_size;
@@ -1359,21 +1359,7 @@ namespace openfpm
 
 			return base.get_o(key);
 		}
-        /*! \brief Get an element of the vector
-         *
-         * Get an element of the vector
-         *
-         * \tparam p Property to get
-         * \param id Element to get
-         *
-         * \return the element value requested
-         *
-         */ //remove device host
-        template <unsigned int p>
-        inline auto getProp(const unsigned int & id) -> decltype(base.template get<p>(grid_key_dx<1>(0)))
-        {   //uncomment this
-            return this->template get<p>(id);
-        }
+
 		/*! \brief Get an element of the vector
 		 *
 		 * Get an element of the vector
@@ -1383,12 +1369,12 @@ namespace openfpm
 		 *
 		 * \return the element value requested
 		 *
-		 *///remove host device
-        template <unsigned int p,typename KeyType>
-        inline auto getProp(const KeyType & id) -> decltype(base.template get<p>(grid_key_dx<1>(0)))
+		 */
+
+		template <unsigned int p,typename KeyType>
+		inline auto getProp(const KeyType & id) -> decltype(base.template get<p>(grid_key_dx<1>(0)))
 		{
-            //uncomment this
-            return this->template get<p>(id.getKey());
+			return this->template get<p>(id.getKey());
 		}
 
 		/*! \brief Get an element of the vector
@@ -1400,10 +1386,10 @@ namespace openfpm
 		 *
 		 * \return the element value requested
 		 *
-		 */ //remove device host
+		 */
 		template <unsigned int p, typename keyType>
-         inline auto getProp(const keyType & id) const -> decltype(base.template get<p>(grid_key_dx<1>(0)))
-		{   //uncomment this
+		inline auto getProp(const keyType & id) const -> decltype(base.template get<p>(grid_key_dx<1>(0)))
+		{
 			return this->template get<p>(id.getKey());
 		}
 
@@ -1944,7 +1930,6 @@ namespace openfpm
 			return base.getGPUIterator(start,stop_,n_thr);
 		}
 
-
 #endif
 
 		/*! \brief Get the vector elements iterator
@@ -2002,14 +1987,6 @@ namespace openfpm
 
 			return base.getGPUIterator(start,stop,n_thr);
 		}
-        /*! \brief Get a domain iterator for the GPU
-         *
-         *
-         */
-        ite_gpu<1> getDomainIteratorGPU(size_t n_thr = default_kernel_wg_threads_) const
-        {
-            return getGPUIterator(n_thr);
-        }
 
 #endif
 		/*! \brief Return the size of the message needed to pack this object
diff --git a/src/Vector/map_vector_std.hpp b/src/Vector/map_vector_std.hpp
index ae0c08f1..2cf717e2 100644
--- a/src/Vector/map_vector_std.hpp
+++ b/src/Vector/map_vector_std.hpp
@@ -644,16 +644,6 @@ public:
 		base.swap(v.base);
 	}
 
-	//! Constructor from iterators
-	template< class InputIt >
-	vector(InputIt first, InputIt last)
-	{
-		while(first != last) {
-			add(*first);
-			++first;
-		}
-	}
-
 	//! destructor
 	~vector() noexcept
 	{
diff --git a/src/Vector/vector_subset.hpp b/src/Vector/vector_subset.hpp
deleted file mode 100644
index d6d78b69..00000000
--- a/src/Vector/vector_subset.hpp
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef VECTOR_SUBSET_HPP
-#define VECTOR_SUBSET_HPP
-
-namespace openfpm
-{
-
-template<unsigned int dim,
-         typename prop,
-         template<typename> class layout_base = memory_traits_inte>
-    class vector_subset_ker
-    {
-        mutable openfpm::vector_gpu_ker<typename apply_transform<layout_base,prop>::type,layout_base> v_all;
-
-        mutable openfpm::vector_gpu_ker<aggregate<int>,layout_base> indexes;
-
-        public:
-
-        vector_subset_ker(openfpm::vector_gpu_ker<typename apply_transform<layout_base,prop>::type,layout_base> & v_all,
-                          openfpm::vector_gpu_ker<aggregate<int>,layout_base> & indexes)
-        :v_all(v_all),indexes(indexes)
-        {}
-
-        // get the
-
-		template <unsigned int p>
-		__device__ __host__ inline auto get(size_t id) const -> decltype(v_all.template get<p>(0))
-		{
-            return v_all.template get<p>(indexes.template get<0>(id));
-        }
-    }
-
-public:
-
-    template<typename T, typename Memory, template<typename> class layout_base, typename grow_p, unsigned int impl>
-    class vector_sub
-    {
-        vector<T,Memory,layout_base,grow_p,impl> & v_all;
-
-        vector<aggregate<int>,Memory,layout_base,grow_p,impl> & indexes;
-
-        public:
-
-            vector(vector<T,Memory,layout_base,grow_p,impl> & v_all, 
-                   vector<aggregate<int>,Memory,layout_base,grow_p,impl> & indexes)
-            :v_all(v_all),indexes(indexes)
-            {}
-
-
-            // To kernel
-            template<unsigned int ... prp> vector_dist_ker<dim,St,prop,layout_base> toKernel()
-            {
-                vector_subset_ker<dim,St,prop,layout_base> v(v_all.toKernel(), indexes.toKernel());
-
-                return v;
-            }
-
-		    template <unsigned int p>
-		    inline auto get(size_t id) const -> decltype(v_all.template get<p>(0))
-		    {
-                return v_all.template get<p>(indexes.template get<0>(id));
-            }
-
-    }
-
-};
-
-#endif
\ No newline at end of file
diff --git a/src/util/mathutil.hpp b/src/util/mathutil.hpp
index e813f30b..1b05785a 100644
--- a/src/util/mathutil.hpp
+++ b/src/util/mathutil.hpp
@@ -19,7 +19,7 @@ namespace openfpm
 		 *
 		 */
 		template <typename T>
-		__host__ __device__ int sgn(T val)
+		int sgn(T val)
 		{
 			return (T(0) < val) - (val < T(0));
 		}
@@ -35,8 +35,7 @@ namespace openfpm
 		 * \return the factorial
 		 *
 		 */
-		__host__ __device__ static inline
-		size_t factorial(size_t f)
+		static inline size_t factorial(size_t f)
 		{
 			size_t fc = 1;
 
@@ -61,8 +60,7 @@ namespace openfpm
 		 * \param k
 		 *
 		 */
-		__host__ __device__ static inline
-		size_t C(size_t n, size_t k)
+		static inline size_t C(size_t n, size_t k)
 		{
 			return factorial(n)/(factorial(k)*factorial(n-k));
 		}
@@ -77,8 +75,7 @@ namespace openfpm
 		 *
 		 *
 		 */
-		__host__ __device__ static inline
-		size_t round_big_2(size_t n)
+		static inline size_t round_big_2(size_t n)
 		{
 			n--;
 			n |= n >> 1;   // Divide by 2^k for consecutive doublings of k up to 32,
@@ -105,15 +102,14 @@ namespace openfpm
 		 */
 
 		template<class T>
-		__host__ __device__ inline constexpr
-		size_t pow(const T base, unsigned const exponent)
+		inline constexpr size_t pow(const T base, unsigned const exponent)
 		{
 			// (parentheses not required in next line)
 			return (exponent == 0) ? 1 : (base * pow(base, exponent-1));
 		}
 
         template<class T>
-        __host__ __device__ double intpowlog(const T x, unsigned const e)
+        double intpowlog(const T x, unsigned const e)
         {
             if (e == 0) return 1.0;
             if (e % 2 == 0)
@@ -139,8 +135,7 @@ namespace openfpm
 		 * \param n modulo
 		 *
 		 */
-		__host__ __device__ static inline
-		long int positive_modulo(long int i, long int n)
+		static inline long int positive_modulo(long int i, long int n)
 		{
 		    return (i % n + n) % n;
 		}
@@ -160,9 +155,7 @@ namespace openfpm
 		 * \return the bound number
 		 *
 		 */
-		template<typename T>
-		__host__ __device__ static inline
-		T periodic(const T & pos, const T & p2, const T & p1)
+		template<typename T> static inline T periodic(const T & pos, const T & p2, const T & p1)
 		{
 			T pos_tmp;
 
@@ -189,9 +182,7 @@ namespace openfpm
 		 * \return the bound number
 		 *
 		 */
-		template<typename T>
-		__device__ __host__ static inline
-		T periodic_l(const T & pos, const T & p2, const T & p1)
+		template<typename T> __device__ __host__ static inline T periodic_l(const T & pos, const T & p2, const T & p1)
 		{
 			T pos_tmp = pos;
 
@@ -216,7 +207,7 @@ namespace openfpm
 		 *
 		 *
 		 */
-		__host__ __device__ inline long int size_t_floor(double x)
+		inline long int size_t_floor(double x)
 		{
 		  size_t i = (long int)x; /* truncate */
 		  return i - ( i > x ); /* convert trunc to floor */
@@ -226,7 +217,7 @@ namespace openfpm
 		 *
 		 *
 		 */
-		__host__ __device__ inline long int size_t_floor(float x)
+		inline long int size_t_floor(float x)
 		{
 		  size_t i = (long int)x; /* truncate */
 		  return i - ( i > x ); /* convert trunc to floor */
@@ -267,7 +258,7 @@ namespace openfpm
 		 * \param value
 		 *
 		 */
-		__host__ __device__ inline int log2_64 (uint64_t value)
+		inline int log2_64 (uint64_t value)
 		{
 		    value |= value >> 1;
 		    value |= value >> 2;
@@ -285,7 +276,7 @@ namespace openfpm
 		 *
 		 *
 		 */
-		__host__ __device__ inline long int size_t_floor(boost::multiprecision::float128 x)
+		inline long int size_t_floor(boost::multiprecision::float128 x)
 		{
 		  size_t i = (long int)x; /* truncate */
 		  return i - ( i > x ); /* convert trunc to floor */
-- 
GitLab