From 5e99cd3b07f8745c0e3d9228211be49f2f34b754 Mon Sep 17 00:00:00 2001
From: Pietro Incardona <incardon@mpi-cbg.de>
Date: Tue, 5 Jun 2018 01:57:22 +0200
Subject: [PATCH] Bringing up GPU grid computation

---
 src/Grid/gpu_test/cuda_gpu_func.cpp        |  97 ++++++++
 src/Grid/gpu_test/cuda_grid_unit_tests.cu  |  95 ++++++++
 src/Grid/gpu_test/cuda_grid_unit_tests.cuh |  19 ++
 src/Grid/grid_base_impl_layout.hpp         |   5 -
 src/Grid/grid_base_implementation.hpp      |  79 ++++++-
 src/Grid/grid_gpu.hpp                      | 190 ---------------
 src/Grid/grid_key.hpp                      |  23 +-
 src/Grid/grid_sm.hpp                       |  10 +-
 src/Grid/grid_unit_tests.hpp               | 162 ++++++++++++-
 src/Grid/map_grid.hpp                      | 261 ++++++++++++++++++++-
 src/Makefile.am                            |   6 +-
 src/Vector/map_vector.hpp                  |   7 +-
 src/memory_ly/memory_array.hpp             |  10 +
 src/memory_ly/memory_c.hpp                 | 114 ++++++++-
 src/memory_ly/memory_conf.hpp              |   4 +-
 src/util/boost_multi_array_openfpm.hpp     | 114 ++++++++-
 src/util/ct_array.hpp                      |   6 +-
 17 files changed, 975 insertions(+), 227 deletions(-)
 create mode 100644 src/Grid/gpu_test/cuda_gpu_func.cpp
 create mode 100644 src/Grid/gpu_test/cuda_grid_unit_tests.cu
 create mode 100644 src/Grid/gpu_test/cuda_grid_unit_tests.cuh
 delete mode 100644 src/Grid/grid_gpu.hpp

diff --git a/src/Grid/gpu_test/cuda_gpu_func.cpp b/src/Grid/gpu_test/cuda_gpu_func.cpp
new file mode 100644
index 00000000..6c89ffed
--- /dev/null
+++ b/src/Grid/gpu_test/cuda_gpu_func.cpp
@@ -0,0 +1,97 @@
+/*
+ * cuda_gpu_func.cpp
+ *
+ *  Created on: Jun 3, 2018
+ *      Author: i-bird
+ */
+
+#include "config.h"
+#define BOOST_TEST_DYN_LINK
+#include <boost/test/unit_test.hpp>
+#include "Grid/map_grid.hpp"
+#include "Point_test.hpp"
+
+BOOST_AUTO_TEST_SUITE( grid_gpu_func_test )
+
+
+BOOST_AUTO_TEST_CASE (gpu_computation_func)
+{
+#ifdef CUDA_GPU
+
+	size_t sz[3] = {64,64,64};
+	grid_gpu<3, Point_test<float> > c3(sz);
+
+	grid_key_dx<3> k1({1,1,1});
+	grid_key_dx<3> k2({62,62,62});
+
+	c3.setMemory();
+
+	auto gcf = c3.getGPUIterator(k1,k2);
+
+	BOOST_REQUIRE_EQUAL(gcf.thr.x,16ul);
+	BOOST_REQUIRE_EQUAL(gcf.thr.y,8ul);
+	BOOST_REQUIRE_EQUAL(gcf.thr.z,8ul);
+
+	BOOST_REQUIRE_EQUAL(gcf.wthr.x,4ul);
+	BOOST_REQUIRE_EQUAL(gcf.wthr.y,8ul);
+	BOOST_REQUIRE_EQUAL(gcf.wthr.z,8ul);
+
+	grid_key_dx<3> k3({50,50,50});
+	grid_key_dx<3> k4({62,62,62});
+	grid_key_dx<3> k5({60,61,62});
+
+	auto gcf2 = c3.getGPUIterator(k3,k4);
+
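+	// The range {50..62}^3 spans 13 points per axis. With the default
+	// n_thr = 1024 the iterator picks a 16x8x8 block, so the block grid
+	// must be ceil(13/16) x ceil(13/8) x ceil(13/8) = 1x2x2, which is
+	// what the checks below verify.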
+
+	BOOST_REQUIRE_EQUAL(gcf2.thr.x,16ul);
+	BOOST_REQUIRE_EQUAL(gcf2.thr.y,8ul);
+	BOOST_REQUIRE_EQUAL(gcf2.thr.z,8ul);
+
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.x,1ul);
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.y,2ul);
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.z,2ul);
+
+	gcf2 = c3.getGPUIterator(k3,k4,511);
+
+	BOOST_REQUIRE_EQUAL(gcf2.thr.x,8ul);
+	BOOST_REQUIRE_EQUAL(gcf2.thr.y,8ul);
+	BOOST_REQUIRE_EQUAL(gcf2.thr.z,4ul);
+
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.x,2ul);
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.y,2ul);
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.z,4ul);
+
+	gcf2 = c3.getGPUIterator(k3,k4,1);
+
+	BOOST_REQUIRE_EQUAL(gcf2.thr.x,1ul);
+	BOOST_REQUIRE_EQUAL(gcf2.thr.y,1ul);
+	BOOST_REQUIRE_EQUAL(gcf2.thr.z,1ul);
+
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.x,13ul);
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.y,13ul);
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.z,13ul);
+
+	gcf2 = c3.getGPUIterator(k3,k5,32);
+
+	BOOST_REQUIRE_EQUAL(gcf2.thr.x,4ul);
+	BOOST_REQUIRE_EQUAL(gcf2.thr.y,4ul);
+	BOOST_REQUIRE_EQUAL(gcf2.thr.z,2ul);
+
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.x,3ul);
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.y,3ul);
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.z,7ul);
+
+	gcf2 = c3.getGPUIterator(k3,k5,1);
+
+	BOOST_REQUIRE_EQUAL(gcf2.thr.x,1ul);
+	BOOST_REQUIRE_EQUAL(gcf2.thr.y,1ul);
+	BOOST_REQUIRE_EQUAL(gcf2.thr.z,1ul);
+
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.x,11ul);
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.y,12ul);
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.z,13ul);
+
+#endif
+}
+
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/Grid/gpu_test/cuda_grid_unit_tests.cu b/src/Grid/gpu_test/cuda_grid_unit_tests.cu
new file mode 100644
index 00000000..4218dc74
--- /dev/null
+++ b/src/Grid/gpu_test/cuda_grid_unit_tests.cu
@@ -0,0 +1,95 @@
+#include "config.h"
+#include <Grid/map_grid.hpp>
+#include "Point_test.hpp"
+
+// note: kernel arguments must be passed by value, a host-side reference is
+// not dereferenceable on the device
+__global__ void compute_stencil_grid(grid_gpu_ker<3,Point_test<float>> g1, grid_gpu_ker<3,Point_test<float>> g2, ite_gpu ite)
+{
+	GRID_ID_3(ite);
+
+	g2.template get<0>(key) = g1.template get<0>(key.move(0,1)) + g1.template get<0>(key.move(0,-1)) +
+	                          g1.template get<0>(key.move(1,1)) + g1.template get<0>(key.move(1,-1)) +
+	                          g1.template get<0>(key.move(2,1)) + g1.template get<0>(key.move(2,-1)) -
+	                          6.0*g1.template get<0>(key);
+}
+
+__global__ void compute_stencil(float * prp_0, float * prp_1, int sz, grid_key_dx<3> start, grid_key_dx<3> stop)
+{
+	GRID_ID_3_TRAW(start,stop);
+
+	prp_1[tz*sz*sz + ty*sz + tx] = prp_0[tz*sz*sz + ty*sz + tx + 1] + prp_0[tz*sz*sz + ty*sz + tx - 1] +
+	                               prp_0[tz*sz*sz + (ty + 1)*sz + tx] + prp_0[tz*sz*sz + (ty - 1)*sz + tx] +
+	                               prp_0[(tz + 1)*sz*sz + ty*sz + tx] + prp_0[(tz - 1)*sz*sz + ty*sz + tx] -
+	                               6.0*prp_0[tz*sz*sz + ty*sz + tx];
+}
+
+__global__ void fill_one(float * prp_0,int sz)
+{
+	// Thread index
+	int tx = threadIdx.x + blockIdx.x * blockDim.x;
+	int ty = threadIdx.y + blockIdx.y * blockDim.y;
+	int tz = threadIdx.z + blockIdx.z * blockDim.z;
+
+	prp_0[tz*sz*sz + ty*sz + tx] = 1.0f;
+}
+
+__global__ void fill_count(float * prp_0,int sz)
+{
+	// Thread index
+	int tx = threadIdx.x + blockIdx.x * blockDim.x;
+	int ty = threadIdx.y + blockIdx.y * blockDim.y;
+	int tz = threadIdx.z + blockIdx.z * blockDim.z;
+
+	prp_0[tz*sz*sz + ty*sz + tx] = tz*sz*sz + ty*sz + tx;
+}
+
+// call compute
+
+void gpu_grid_3D_one(grid_gpu<3,Point_test<float>> & g)
+{
+	// Setup execution parameters
+	dim3 threads(8,8,8);
+	dim3 grid(8,8,8);
+
+	float * prp_0 = (float *)g.getDeviceBuffer<0>();
+
+	fill_one<<< grid, threads >>>(prp_0,64);
+}
+
+// call compute
+
+void gpu_grid_3D_compute(grid_gpu<3,Point_test<float>> & g)
+{
+	// Setup execution parameters
+	dim3 threads(8,8,8);
+	dim3 grid(8,8,8);
+
+	float * prp_0 = (float *)g.getDeviceBuffer<0>();
+
+	fill_count<<< grid, threads >>>(prp_0,64);
+}
+
+void gpu_grid_3D_compute_stencil(grid_gpu<3,Point_test<float>> & g1, grid_gpu<3,Point_test<float>> & g2,
+                                 grid_key_dx<3> & start, grid_key_dx<3> & stop)
+{
+	// Setup execution parameters
+
+	float * prp_0 = (float *)g1.getDeviceBuffer<0>();
+	float * prp_1 = (float *)g2.getDeviceBuffer<0>();
+
+	auto gpu_it = g2.getGPUIterator(start,stop);
+
+	// launch configuration is <<< blocks, threads per block >>>
+	compute_stencil<<< gpu_it.wthr, gpu_it.thr >>>(prp_0,prp_1,64,start,stop);
+}
+
+void gpu_grid_3D_compute_grid_stencil(grid_gpu<3,Point_test<float>> & g1, grid_gpu<3,Point_test<float>> & g2,
+                                      grid_key_dx<3> & start, grid_key_dx<3> & stop)
+{
+	auto gpu_it = g2.getGPUIterator(start,stop);
+
+	auto g1k = g1.toGPU();
+	auto g2k = g2.toGPU();
+
+	compute_stencil_grid<<< gpu_it.wthr, gpu_it.thr >>>(g1k,g2k,gpu_it);
+}
+
diff --git a/src/Grid/gpu_test/cuda_grid_unit_tests.cuh b/src/Grid/gpu_test/cuda_grid_unit_tests.cuh
new file mode 100644
index 00000000..98f068b8
--- /dev/null
+++ b/src/Grid/gpu_test/cuda_grid_unit_tests.cuh
@@ -0,0 +1,19 @@
+/*
+ * cuda_grid_unit_tests.cuh
+ *
+ *  Created on: Sep 29, 2017
+ *      Author: i-bird
+ */
+
+#ifndef OPENFPM_DATA_SRC_GRID_GPU_TEST_CUDA_GRID_UNIT_TESTS_CUH_
+#define OPENFPM_DATA_SRC_GRID_GPU_TEST_CUDA_GRID_UNIT_TESTS_CUH_
+
+
+void gpu_grid_3D_compute(grid_gpu<3,Point_test<float>> & g);
+void gpu_grid_3D_compute_stencil(grid_gpu<3,Point_test<float>> & g1, grid_gpu<3,Point_test<float>> & g2,
+                                 grid_key_dx<3> & start, grid_key_dx<3> & stop);
+void gpu_grid_3D_one(grid_gpu<3,Point_test<float>> & g);
+void gpu_grid_3D_compute_grid_stencil(grid_gpu<3,Point_test<float>> & g1, grid_gpu<3,Point_test<float>> & g2,
+                                      grid_key_dx<3> & start, grid_key_dx<3> & stop);
+
+#endif /* OPENFPM_DATA_SRC_GRID_GPU_TEST_CUDA_GRID_UNIT_TESTS_CUH_ */
diff --git a/src/Grid/grid_base_impl_layout.hpp b/src/Grid/grid_base_impl_layout.hpp
index 2eb3b31c..8fe5f29d 100644
--- a/src/Grid/grid_base_impl_layout.hpp
+++ b/src/Grid/grid_base_impl_layout.hpp
@@ -68,11 +68,6 @@ struct frswap
 	}
 };
 
-#ifdef __NVCC__
-#else
-#define __host__
-#define __device__
-#endif
 
 //! Case memory_traits_lin
 template<unsigned int p, typename layout, typename data_type, typename g1_type, typename key_type, unsigned int sel = 2*is_layout_mlin<layout>::value + is_layout_inte<layout>::value >
diff --git a/src/Grid/grid_base_implementation.hpp b/src/Grid/grid_base_implementation.hpp
index 1482644d..6f52caf8 100644
--- a/src/Grid/grid_base_implementation.hpp
+++ b/src/Grid/grid_base_implementation.hpp
@@ -10,7 +10,45 @@
 
 #include "grid_base_impl_layout.hpp"
 
-#ifdef __NVCC__
+
+#ifdef CUDA_GPU
+
+#ifndef __NVCC__
+#undef __host__
+#undef __device__
+#include <vector_types.h>
+#endif
+
+#define GRID_ID_3_RAW(start,stop) int x[3] = {threadIdx.x + blockIdx.x * blockDim.x + start.get(0),\
+                                              threadIdx.y + blockIdx.y * blockDim.y + start.get(1),\
+                                              threadIdx.z + blockIdx.z * blockDim.z + start.get(2)};\
+                                  \
+                                  if (x[0] > stop.get(0) || x[1] > stop.get(1) || x[2] > stop.get(2))\
+                                  {return;}
+
+#define GRID_ID_3_TRAW(start,stop) int tx = threadIdx.x + blockIdx.x * blockDim.x + start.get(0);\
+                                   int ty = threadIdx.y + blockIdx.y * blockDim.y + start.get(1);\
+                                   int tz = threadIdx.z + blockIdx.z * blockDim.z + start.get(2);\
+                                   \
+                                   if (tx > stop.get(0) || ty > stop.get(1) || tz > stop.get(2))\
+                                   {return;}
+
+#define GRID_ID_3(ite_gpu) grid_key_dx<3> key;\
+                           key.set_d(0,threadIdx.x + blockIdx.x * blockDim.x + ite_gpu.start.get(0));\
+                           key.set_d(1,threadIdx.y + blockIdx.y * blockDim.y + ite_gpu.start.get(1));\
+                           key.set_d(2,threadIdx.z + blockIdx.z * blockDim.z + ite_gpu.start.get(2));\
+                           \
+                           if (key.get(0) > ite_gpu.stop.get(0) || key.get(1) > ite_gpu.stop.get(1) || key.get(2) > ite_gpu.stop.get(2))\
+                           {return;}
+
+struct ite_gpu
+{
+	dim3 thr;
+	dim3 wthr;
+
+	grid_key_dx<3> start;
+	grid_key_dx<3> stop;
+};
 #else
 #define __host__
 #define __device__
@@ -328,6 +366,45 @@ public:
 		return grid_new;
 	}
 
+#ifdef CUDA_GPU
+
+	/*! \brief Get an object that describes how to decompose the points
+	 *         between key1 and key2 into CUDA blocks and threads
+	 *
+	 * \param key1 start point of the sub-grid
+	 * \param key2 stop point of the sub-grid (inclusive)
+	 * \param n_thr maximum number of threads per block
+	 *
+	 */
+	struct ite_gpu getGPUIterator(grid_key_dx<dim> & key1, grid_key_dx<dim> & key2, size_t n_thr = 1024)
+	{
+		size_t n = n_thr;
+
+		// Work to do
+		ite_gpu ig;
+
+		ig.thr.x = 1;
+		ig.thr.y = 1;
+		ig.thr.z = 1;
+
+		int dir = 0;
+
+		// distribute one factor of two at a time across x, y and z
+		while (n != 1)
+		{
+			if (dir % 3 == 0)
+			{ig.thr.x = ig.thr.x << 1;}
+			else if (dir % 3 == 1)
+			{ig.thr.y = ig.thr.y << 1;}
+			else if (dir % 3 == 2)
+			{ig.thr.z = ig.thr.z << 1;}
+
+			n = n >> 1;
+
+			dir++;
+		}
+
+		// number of blocks: extent on each axis divided by the block size,
+		// rounded up
+		ig.wthr.x = (key2.get(0) - key1.get(0) + 1) / ig.thr.x + (((key2.get(0) - key1.get(0) + 1)%ig.thr.x != 0)?1:0);
+		ig.wthr.y = (key2.get(1) - key1.get(1) + 1) / ig.thr.y + (((key2.get(1) - key1.get(1) + 1)%ig.thr.y != 0)?1:0);
+		ig.wthr.z = (key2.get(2) - key1.get(2) + 1) / ig.thr.z + (((key2.get(2) - key1.get(2) + 1)%ig.thr.z != 0)?1:0);
+
+		ig.start = key1;
+		ig.stop = key2;
+
+		return ig;
+	}
+
+#endif
+
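+	/* Usage sketch (illustrative, my_kernel is a hypothetical name): the
+	 * returned ite_gpu pairs with the GRID_ID_3 macro above.
+	 *
+	 *   auto ite = g.getGPUIterator(k1,k2);
+	 *   my_kernel<<< ite.wthr, ite.thr >>>(g.toGPU(),ite); // blocks, then threads per block
+	 *
+	 * Inside my_kernel, GRID_ID_3(ite) materializes the grid_key_dx<3> key of
+	 * the point assigned to the calling thread and returns early for threads
+	 * that fall outside [start,stop].
+	 */
+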
 	/*! \brief Return the internal grid information
 	 *
 	 * Return the internal grid information
diff --git a/src/Grid/grid_gpu.hpp b/src/Grid/grid_gpu.hpp
deleted file mode 100644
index 9c59b641..00000000
--- a/src/Grid/grid_gpu.hpp
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * grid_gpu.hpp
- *
- *  Created on: Oct 31, 2015
- *      Author: i-bird
- */
-
-#ifndef OPENFPM_DATA_SRC_GRID_GRID_GPU_HPP_
-#define OPENFPM_DATA_SRC_GRID_GRID_GPU_HPP_
-
-/*! \brief this class is a functor for "for_each" algorithm
- *
- * This class is a functor for "for_each" algorithm. For each
- * element of the boost::vector the operator() is called.
- * Is mainly used to copy one encap into another encap object
- *
- * \tparam encap source
- * \tparam encap dst
- *
- */
-
-template<typename T_type>
-struct copy_memory_c
-{
-	//! encapsulated source object
-	const typename memory_traits_inte<T_type>::type & src;
-	//! encapsulated destination object
-	typename memory_traits_inte_red<T_type>::type & dst;
-
-
-	/*! \brief constructor
-	 *
-	 * \param src source encapsulated object
-	 * \param dst source encapsulated object
-	 *
-	 */
-	inline copy_memory_c(const typename memory_traits_inte<T_type>::type & src,
-		                 typename memory_traits_inte_red<T_type>::type & dst)
-	:src(src),dst(dst)
-	{
-	};
-
-
-	//! It call the copy function for each property
-	template<typename T>
-	inline void operator()(T& t) const
-	{
-		boost::fusion::at_c<T::value>(dst).mem_r = boost::fusion::at_c<T::value>(src).mem_r;
-	}
-};
-
-struct dim3_
-{
-	//! size in x dimension
-	unsigned int x;
-
-	//! size in y dimension
-	unsigned int y;
-
-	//! size in z dimension
-	unsigned int z;
-};
-
-template<unsigned int dim>
-struct device_grid
-{
-	//! number of treads in each block
-	dim3_ threads;
-
-	//! number of grid for the kernel execution
-	dim3_ grids;
-};
-
-
-/*! \brief This is an N-dimensional grid or an N-dimensional array with memory_traits_inte layout
- *
- * it is basically an N-dimensional Cartesian grid
- *
- * \tparam dim Dimensionality of the grid
- * \tparam T type of object the grid store
- * \tparam Mem memory layout
- *
- * ### Definition and allocation of a 3D grid on GPU memory
- * \snippet grid_unit_tests.hpp Definition and allocation of a 3D grid on GPU memory
- * ### Access a grid c3 of size sz on each direction
- * \snippet grid_unit_tests.hpp Access a grid c3 of size sz on each direction
- * ### Access to an N-dimensional grid with an iterator
- * \snippet grid_unit_tests.hpp Access to an N-dimensional grid with an iterator
- *
- */
-template<unsigned int dim, typename T, typename S>
-class grid_cpu<dim,T,S,typename memory_traits_inte<T>::type> : public grid_base_impl<dim,T,S,typename memory_traits_inte<T>::type, memory_traits_inte>
-{
-	//! grid layout
-	typedef typename memory_traits_inte<T>::type layout;
-
-public:
-
-	//! Object container for T, it is the return type of get_o it return a object type trough
-	// you can access all the properties of T
-	typedef typename grid_base_impl<dim,T,S,typename memory_traits_inte<T>::type, memory_traits_inte>::container container;
-
-	//! Default constructor
-	inline grid_cpu() THROW
-	:grid_base_impl<dim,T,S,layout,memory_traits_inte>()
-	{
-	}
-
-	/*! \brief create a grid from another grid
-	 *
-	 * \param g the grid to copy
-	 *
-	 */
-	inline grid_cpu(const grid_cpu & g) THROW
-	:grid_base_impl<dim,T,S,layout,memory_traits_inte>(g)
-	{
-	}
-
-	/*! \brief create a grid of size sz on each direction
-	 *
-	 * \param sz grid size in each direction
-	 *
-	 */
-	inline grid_cpu(const size_t & sz) THROW
-	:grid_base_impl<dim,T,S,layout,memory_traits_inte>(sz)
-	{
-	}
-
-	//! Constructor allocate memory and give them a representation
-	inline grid_cpu(const size_t (& sz)[dim]) THROW
-	:grid_base_impl<dim,T,S,layout,memory_traits_inte>(sz)
-	{
-	}
-
-	/*! \brief It return the properties arrays.
-	 *
-	 * In case of Cuda memory it return the device pointers to pass to the kernels
-	 *
-	 *
-	 */
-	template<unsigned int id> void * getDeviceBuffer()
-	{
-		return boost::fusion::at_c<id>(this->data_).mem->getDevicePointer();
-	}
-
-	/*! \brief Synchronize the memory buffer in the device with the memory in the host
-	 *
-	 *
-	 */
-	template<unsigned int id> void deviceToHost()
-	{
-		return boost::fusion::at_c<id>(this->data_).mem->deviceToHost();
-	}
-
-	/*! \brief Convert the grid into a data-structure compatible for computing into GPU
-	 *
-	 * The object created can be considered like a reference of the original
-	 *
-	 */
-	grid_cpu<dim,T,S,typename memory_traits_inte<T>::type> toGPU()
-	{
-		grid_cpu<dim,T,S,typename memory_traits_inte<T>::type> g;
-		copy_memory_c<T> cp_mc(this->data_,g.data_);
-
-		boost::mpl::for_each_ref< boost::mpl::range_c<int,0,T::max_prop> >(cp_mc);
-
-		return g;
-	}
-
-	/*! \brief When we switch to GPU mode the data-structure can be safely given to
-	 *         a kernel for computation
-	 *
-	 *
-	 */
-	void switchToGPU()
-	{
-
-	}
-
-	void switchToCPU()
-	{
-
-	}
-};
-
-//! short formula for a grid on gpu
-template <unsigned int dim, typename T> using grid_gpu = grid_cpu<dim,T,CudaMemory,typename memory_traits_inte<T>::type>;
-
-
-#endif /* OPENFPM_DATA_SRC_GRID_GRID_GPU_HPP_ */
diff --git a/src/Grid/grid_key.hpp b/src/Grid/grid_key.hpp
index 875f24fb..33225037 100644
--- a/src/Grid/grid_key.hpp
+++ b/src/Grid/grid_key.hpp
@@ -31,7 +31,7 @@ public:
 	}
 
 	//! Constructor
-	inline grid_key_dx()
+	__device__ __host__ inline grid_key_dx()
 	{}
 
 	/*! \brief Constructor from initializer list
@@ -56,7 +56,7 @@ public:
 	 * \param key copy constructor
 	 *
 	 */
-	inline grid_key_dx(const grid_key_dx<dim> & key)
+	__device__ __host__ inline grid_key_dx(const grid_key_dx<dim> & key)
 	:grid_key_dx(key.k)
 	{
 	}
@@ -66,7 +66,7 @@ public:
 	 * \param k reference buffer
 	 *
 	 */
-	inline grid_key_dx(const size_t (&k)[dim])
+	__device__ __host__ inline grid_key_dx(const size_t (&k)[dim])
 	{
 		for (size_t i = 0 ; i < dim ; i++)
 			this->k[i] = k[i];
@@ -77,7 +77,7 @@ public:
 	 * \param k reference buffer
 	 *
 	 */
-	inline grid_key_dx(const long int (&k)[dim])
+	__device__ __host__ inline grid_key_dx(const long int (&k)[dim])
 	{
 		for (size_t i = 0 ; i < dim ; i++)
 			this->k[i] = k[i];
@@ -110,6 +110,15 @@ public:
 		invert_assign(t...);
 	}
 
+	/*! \brief Return a key shifted by m along dimension i
+	 *
+	 * \param i dimension to move along
+	 * \param m displacement (can be negative)
+	 *
+	 */
+	__device__ __host__ inline grid_key_dx<dim> move(int i, int m)
+	{
+		grid_key_dx<dim> tmp = *this;
+
+		tmp.k[i] += m;
+
+		return tmp;
+	}
+
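+	// Example (illustrative): for a grid_key_dx<3> key at (x,y,z),
+	// key.move(0,1) is (x+1,y,z) and key.move(2,-1) is (x,y,z-1);
+	// stencil kernels use this as g.template get<0>(key.move(d,s))
+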
 	/*! \brief Set to zero the key
 	 *
 	 */
@@ -380,7 +389,7 @@ public:
 	 * \return the index value
 	 *
 	 */
-	mem_id get(size_t i) const
+	__device__ __host__ mem_id get(size_t i) const
 	{
 		return k[i];
 	}
@@ -393,9 +402,9 @@ public:
 	 * \param id value to set
 	 *
 	 */
-	void set_d(size_t i, mem_id id)
+	__device__ __host__ void set_d(size_t i, mem_id id)
 	{
-#ifdef DEBUG
+#ifdef SE_CLASS1
 		if (i >= dim)
 			std::cerr << "grid_key_dx error: " << __FILE__ << " " << __LINE__ << " try to access dimension " << i << " on a grid_key_dx of size " << dim << "\n";
diff --git a/src/Grid/grid_sm.hpp b/src/Grid/grid_sm.hpp
index 6ee9c259..40637246 100755
--- a/src/Grid/grid_sm.hpp
+++ b/src/Grid/grid_sm.hpp
@@ -295,7 +295,8 @@ public:
 	 * \return The linearization of the gk key shifted by c, or -1 if the check fail
 	 */
 
-	template<typename check=NoCheck> inline mem_id LinId(const grid_key_dx<N> & gk, const char sum_id[N]) const
+	template<typename check=NoCheck>
+	inline mem_id LinId(const grid_key_dx<N> & gk, const char sum_id[N]) const
 	{
 		mem_id lid;
 
@@ -330,7 +331,8 @@ public:
 	 * \return The linearization of the gk key shifted by c, or -1 if the check fail
 	 */
 
-	template<typename check=NoCheck> inline mem_id LinId(const grid_key_dx<N> & gk, const char sum_id[N], const size_t (&bc)[N]) const
+	template<typename check=NoCheck>
+	inline mem_id LinId(const grid_key_dx<N> & gk, const char sum_id[N], const size_t (&bc)[N]) const
 	{
 		mem_id lid;
 
@@ -397,7 +399,7 @@ public:
 	 *
 	 */
 
-	inline mem_id LinId(const size_t (& k)[N]) const
+	__device__ __host__ inline mem_id LinId(const size_t (& k)[N]) const
 	{
 		mem_id lid = k[0];
 		for (mem_id i = 1 ; i < N ; i++)
@@ -418,7 +420,7 @@ public:
 	 *
 	 */
 
-	inline mem_id LinId(const grid_key_dx<N> & gk) const
+	__device__ __host__ inline mem_id LinId(const grid_key_dx<N> & gk) const
 	{
 		mem_id lid = gk.k[0];
 		for (mem_id i = 1 ; i < N ; i++)
diff --git a/src/Grid/grid_unit_tests.hpp b/src/Grid/grid_unit_tests.hpp
index 49a95eb7..9e41a165 100644
--- a/src/Grid/grid_unit_tests.hpp
+++ b/src/Grid/grid_unit_tests.hpp
@@ -7,7 +7,7 @@
 #include "Space/Shape/HyperCube.hpp"
 #include "timer.hpp"
 #include "grid_util_test.hpp"
-#include "cuda_gpu_compute.cuh"
+#include "gpu_test/cuda_grid_unit_tests.cuh"
 
 #ifdef TEST_COVERAGE_MODE
 #define GS_SIZE 8
@@ -784,7 +784,165 @@ BOOST_AUTO_TEST_CASE (gpu_computation)
 	BOOST_REQUIRE_EQUAL(good,true);
 
-	gpu_grid_3D_fill(c3);
 	}
 
 	#endif
 }
+
+BOOST_AUTO_TEST_CASE (gpu_computation_stencil)
+{
+	#ifdef CUDA_GPU
+
+	{
+	size_t sz[3] = {64,64,64};
+	grid_gpu<3, Point_test<float> > c3(sz);
+	grid_gpu<3, Point_test<float> > c2(sz);
+	grid_key_dx<3> key1({1,1,1});
+	grid_key_dx<3> key2({62,62,62});
+
+
+	c3.setMemory();
+	c2.setMemory();
+	test_layout_gridNd<3>(c3,sz[0]);
+	test_layout_gridNd<3>(c2,sz[0]);
+
+	gpu_grid_3D_one(c2);
+
+	// Check that property 0 is 1.0
+	c2.deviceToHost<0>();
+
+	{
+	auto it = c2.getIterator();
+
+	bool good = true;
+	while(it.isNext())
+	{
+		auto key = it.get();
+
+		good &= c2.get<0>(key) == 1.0;
+
+		++it;
+	}
+
+	BOOST_REQUIRE_EQUAL(good,true);
+	}
+
+	gpu_grid_3D_compute(c3);
+	c3.deviceToHost<0>();
+
+	{
+	auto it = c3.getIterator();
+
+	bool good = true;
+	while(it.isNext())
+	{
+		auto key = it.get();
+
+		good &= c3.getGrid().LinId(key) == c3.get<0>(key);
+
+		++it;
+	}
+
+	BOOST_REQUIRE_EQUAL(good,true);
+	}
+
+	gpu_grid_3D_compute_stencil(c3,c2,key1,key2);
+
+	c2.deviceToHost<0>();
+
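+	// The interior must now be exactly zero: property 0 of c3 holds the
+	// linearization i = z*sz*sz + y*sz + x, which is linear in x, y and z,
+	// and the 6-point stencil annihilates linear data:
+	// (i+1)+(i-1)+(i+sz)+(i-sz)+(i+sz*sz)+(i-sz*sz) - 6*i = 0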
+	auto it = c2.getIterator(key1,key2);
+
+	bool good = true;
+	while(it.isNext())
+	{
+		auto key = it.get();
+
+		good &= c2.get<0>(key) == 0;
+
+		++it;
+	}
+
+	BOOST_REQUIRE_EQUAL(good,true);
+
+	}
+
+	#endif
+}
+
+BOOST_AUTO_TEST_CASE (gpu_computation_grid_stencil)
+{
+	#ifdef CUDA_GPU
+
+	{
+	size_t sz[3] = {64,64,64};
+	grid_gpu<3, Point_test<float> > c3(sz);
+	grid_gpu<3, Point_test<float> > c2(sz);
+	grid_key_dx<3> key1({1,1,1});
+	grid_key_dx<3> key2({62,62,62});
+
+
+	c3.setMemory();
+	c2.setMemory();
+	test_layout_gridNd<3>(c3,sz[0]);
+	test_layout_gridNd<3>(c2,sz[0]);
+
+	gpu_grid_3D_one(c2);
+
+	// Check that property 0 is 1.0
+	c2.deviceToHost<0>();
+
+	{
+	auto it = c2.getIterator();
+
+	bool good = true;
+	while(it.isNext())
+	{
+		auto key = it.get();
+
+		good &= c2.get<0>(key) == 1.0;
+
+		++it;
+	}
+
+	BOOST_REQUIRE_EQUAL(good,true);
+	}
+
+	gpu_grid_3D_compute(c3);
+	c3.deviceToHost<0>();
+
+	{
+	auto it = c3.getIterator();
+
+	bool good = true;
+	while(it.isNext())
+	{
+		auto key = it.get();
+
+		good &= c3.getGrid().LinId(key) == c3.get<0>(key);
+
+		++it;
+	}
+
+	BOOST_REQUIRE_EQUAL(good,true);
+	}
+
+	gpu_grid_3D_compute_grid_stencil(c3,c2,key1,key2);
+
+	c2.deviceToHost<0>();
+
+	auto it = c2.getIterator(key1,key2);
+
+	bool good = true;
+	while(it.isNext())
+	{
+		auto key = it.get();
+
+		good &= c2.get<0>(key) == 0;
+
+		++it;
+	}
+
+	BOOST_REQUIRE_EQUAL(good,true);
+
+	}
+
+	#endif
+}
diff --git a/src/Grid/map_grid.hpp b/src/Grid/map_grid.hpp
index 185d538d..3ae30d18 100755
--- a/src/Grid/map_grid.hpp
+++ b/src/Grid/map_grid.hpp
@@ -3,6 +3,12 @@
 
 #include "config.h"
 
+#ifndef CUDA_GPU
+#include <boost/config/compiler/nvcc.hpp>
+#define BOOST_FUSION_GPU_ENABLED
+#define BOOST_GPU_ENABLED
+#endif
+
 //! Warning: apparently you cannot used nested boost::mpl with boost::fusion
 //! can create template circularity, this include avoid the problem
 #include "util/object_util.hpp"
@@ -58,6 +64,7 @@
 typedef HeapMemory CudaMemory;
 
 #endif
 
+
 /*! Stub grid class
  *
 */
@@ -173,7 +180,259 @@ public:
 	}
 };
 
-#include "grid_gpu.hpp"
+
+/*! \brief this class is a functor for "for_each" algorithm
+ *
+ * This class is a functor for "for_each" algorithm. For each
+ * element of the boost::vector the operator() is called.
+ * It is mainly used to bind the buffers of one object to another,
+ * switching them to their device pointers
+ *
+ * \tparam T_type type of the aggregate of properties
+ *
+ */
+template<typename T_type>
+struct copy_switch_memory_c
+{
+	//! encapsulated source object
+	const typename memory_traits_inte<T_type>::type & src;
+	//! encapsulated destination object
+	typename memory_traits_inte<T_type>::type & dst;
+
+
+	/*! \brief constructor
+	 *
+	 * \param src source encapsulated object
+	 * \param dst destination encapsulated object
+	 *
+	 */
+	inline copy_switch_memory_c(const typename memory_traits_inte<T_type>::type & src,
+		                        typename memory_traits_inte<T_type>::type & dst)
+	:src(src),dst(dst)
+	{
+	};
+
+
+	//! It call the copy function for each property
+	template<typename T>
+	inline void operator()(T& t) const
+	{
+		boost::fusion::at_c<T::value>(dst).mem = boost::fusion::at_c<T::value>(src).mem;
+		boost::fusion::at_c<T::value>(dst).mem_r.bind_ref(boost::fusion::at_c<T::value>(src).mem_r);
+		boost::fusion::at_c<T::value>(dst).switchToDevicePtr();
+	}
+};
+
+/*! \brief grid interface available when on gpu
+ *
+ * \tparam dim Dimensionality of the grid
+ * \tparam T type of object the grid store
+ *
+ */
+template<unsigned int dim, typename T>
+struct grid_gpu_ker
+{
+	//! grid information
+	grid_sm<dim,void> g1;
+
+	//! type of layout of the structure
+	typedef typename memory_traits_inte<T>::type layout;
+
+	//! layout data
+	layout data_;
+
+	grid_gpu_ker()
+	{}
+
+	grid_gpu_ker(const grid_gpu_ker & cpy)
+	:g1(cpy.g1)
+	{
+		copy_switch_memory_c<T> bp_mc(cpy.data_,this->data_);
+
+		boost::mpl::for_each_ref< boost::mpl::range_c<int,0,T::max_prop> >(bp_mc);
+	}
+
+	/*! \brief Get the reference of the selected element
+	 *
+	 * \param v1 grid_key that identify the element in the grid
+	 *
+	 * \return the reference of the element
+	 *
+	 */
+	template <unsigned int p, typename r_type=decltype(mem_get<p,memory_traits_inte<T>,layout,grid_sm<dim,T>,grid_key_dx<dim>>::get(data_,g1,grid_key_dx<dim>()))>
+	__device__ inline r_type get(const grid_key_dx<dim> & v1)
+	{
+		return mem_get<p,memory_traits_inte<T>,decltype(this->data_),decltype(this->g1),decltype(v1)>::get(data_,g1,v1);
+	}
+
+	/*! \brief Get the const reference of the selected element
+	 *
+	 * \param v1 grid_key that identify the element in the grid
+	 *
+	 * \return the const reference of the element
+	 *
+	 */
+	template <unsigned int p, typename r_type=decltype(mem_get<p,memory_traits_inte<T>,layout,grid_sm<dim,T>,grid_key_dx<dim>>::get(data_,g1,grid_key_dx<dim>()))>
+	__device__ inline const r_type get(const grid_key_dx<dim> & v1) const
+	{
+		return mem_get<p,memory_traits_inte<T>,decltype(this->data_),decltype(this->g1),decltype(v1)>::get(data_,g1,v1);
+	}
+
+	/*! \brief Get the reference of the selected element
+	 *
+	 * \param lin_id linearized element that identify the element in the grid
+	 *
+	 * \return the reference of the element
+	 *
+	 */
+	template <unsigned int p, typename r_type=decltype(mem_get<p,memory_traits_inte<T>,layout,grid_sm<dim,T>,grid_key_dx<dim>>::get_lin(data_,g1,0))>
+	__device__ inline r_type get(const size_t lin_id)
+	{
+		return mem_get<p,memory_traits_inte<T>,decltype(this->data_),decltype(this->g1),grid_key_dx<dim>>::get_lin(data_,g1,lin_id);
+	}
+
+	/*! \brief Get the const reference of the selected element
+	 *
+	 * \param lin_id linearized element that identify the element in the grid
+	 *
+	 * \return the const reference of the element
+	 *
+	 */
+	template <unsigned int p, typename r_type=decltype(mem_get<p,memory_traits_inte<T>,layout,grid_sm<dim,T>,grid_key_dx<dim>>::get_lin(data_,g1,0))>
+	__device__ inline const r_type get(size_t lin_id) const
+	{
+		return mem_get<p,memory_traits_inte<T>,decltype(this->data_),decltype(this->g1),grid_key_dx<dim>>::get_lin(data_,g1,lin_id);
+	}
+};
+
+struct dim3_
+{
+	//! size in x dimension
+	unsigned int x;
+
+	//! size in y dimension
+	unsigned int y;
+
+	//! size in z dimension
+	unsigned int z;
+};
+
+template<unsigned int dim>
+struct device_grid
+{
+	//! number of threads in each block
+	dim3_ threads;
+
+	//! number of blocks for the kernel execution
+	dim3_ grids;
+};
+
+
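+/* Usage sketch (illustrative): a grid_gpu_ker is the kernel-side view of a
+ * grid_gpu; it is obtained with toGPU() and passed to kernels by value, the
+ * copy constructor above rebinding every property buffer to its device
+ * pointer. A hypothetical kernel:
+ *
+ *   __global__ void set_one(grid_gpu_ker<3,Point_test<float>> g, ite_gpu ite)
+ *   {
+ *       GRID_ID_3(ite);
+ *       g.template get<0>(key) = 1.0f;
+ *   }
+ *
+ *   set_one<<< ite.wthr, ite.thr >>>(g.toGPU(),ite);
+ */
+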
+/*! \brief This is an N-dimensional grid or an N-dimensional array with memory_traits_inte layout
+ *
+ * it is basically an N-dimensional Cartesian grid
+ *
+ * \tparam dim Dimensionality of the grid
+ * \tparam T type of object the grid store
+ * \tparam Mem memory layout
+ *
+ * ### Definition and allocation of a 3D grid on GPU memory
+ * \snippet grid_unit_tests.hpp Definition and allocation of a 3D grid on GPU memory
+ * ### Access a grid c3 of size sz on each direction
+ * \snippet grid_unit_tests.hpp Access a grid c3 of size sz on each direction
+ * ### Access to an N-dimensional grid with an iterator
+ * \snippet grid_unit_tests.hpp Access to an N-dimensional grid with an iterator
+ *
+ */
+template<unsigned int dim, typename T, typename S>
+class grid_cpu<dim,T,S,typename memory_traits_inte<T>::type> : public grid_base_impl<dim,T,S,typename memory_traits_inte<T>::type, memory_traits_inte>
+{
+	//! grid layout
+	typedef typename memory_traits_inte<T>::type layout;
+
+public:
+
+	//! Object container for T, it is the return type of get_o; through it
+	//! you can access all the properties of T
+	typedef typename grid_base_impl<dim,T,S,typename memory_traits_inte<T>::type, memory_traits_inte>::container container;
+
+	//! Default constructor
+	inline grid_cpu() THROW
+	:grid_base_impl<dim,T,S,layout,memory_traits_inte>()
+	{
+	}
+
+	/*! \brief create a grid from another grid
+	 *
+	 * \param g the grid to copy
+	 *
+	 */
+	inline grid_cpu(const grid_cpu & g) THROW
+	:grid_base_impl<dim,T,S,layout,memory_traits_inte>(g)
+	{
+	}
+
+	/*! \brief create a grid of size sz on each direction
+	 *
+	 * \param sz grid size in each direction
+	 *
+	 */
+	inline grid_cpu(const size_t & sz) THROW
+	:grid_base_impl<dim,T,S,layout,memory_traits_inte>(sz)
+	{
+	}
+
+	//! Constructor allocate memory and give them a representation
+	inline grid_cpu(const size_t (& sz)[dim]) THROW
+	:grid_base_impl<dim,T,S,layout,memory_traits_inte>(sz)
+	{
+	}
+
+	/*! \brief It returns the property arrays.
+	 *
+	 * In case of CUDA memory it returns the device pointers to pass to the kernels
+	 *
+	 *
+	 */
+	template<unsigned int id> void * getDeviceBuffer()
+	{
+		return boost::fusion::at_c<id>(this->data_).mem->getDevicePointer();
+	}
+
+	/*! \brief Synchronize the memory buffer in the device with the memory in the host
+	 *
+	 *
+	 */
+	template<unsigned int id> void deviceToHost()
+	{
+		return boost::fusion::at_c<id>(this->data_).mem->deviceToHost();
+	}
+
+
+
+#ifdef CUDA_GPU
+
+	/*! \brief Convert the grid into a data-structure compatible for computing into GPU
+	 *
+	 * The object created can be considered like a reference of the original
+	 *
+	 */
+	grid_gpu_ker<dim,T> toGPU()
+	{
+		grid_gpu_ker<dim,T> g;
+		copy_switch_memory_c<T> cp_mc(this->data_,g.data_);
+
+		boost::mpl::for_each_ref< boost::mpl::range_c<int,0,T::max_prop> >(cp_mc);
+
+		return g;
+	}
+
+#endif
+};
+
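+/* Typical host-side round trip (sketch; property 0 is assumed to be a float
+ * scalar, as in Point_test<float>):
+ *
+ *   grid_gpu<3,Point_test<float>> g(sz);
+ *   g.setMemory();
+ *   float * d_prp0 = (float *)g.template getDeviceBuffer<0>(); // device pointer
+ *   // ... launch a kernel writing into d_prp0 ...
+ *   g.template deviceToHost<0>();                              // read the result back
+ */
+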
+//! short formula for a grid on gpu
+template <unsigned int dim, typename T> using grid_gpu = grid_cpu<dim,T,CudaMemory,typename memory_traits_inte<T>::type>;
 
 #endif
diff --git a/src/Makefile.am b/src/Makefile.am
index 598077cc..ba342189 100755
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,7 +1,7 @@
 LINKLIBS = $(LIBHILBERT_LIB) $(PTHREAD_LIBS) $(OPT_LIBS) $(BOOST_LDFLAGS) $(BOOST_PROGRAM_OPTIONS_LIB) $(CUDA_LIBS) $(BOOST_IOSTREAMS_LIB) $(LOCAL_LIBS) $(BOOST_UNIT_TEST_FRAMEWORK_LIB) $(BOOST_CHRONO_LIB) $(BOOST_TIMER_LIB) $(BOOST_SYSTEM_LIB)
 
-NVCCFLAGS:=$(NVCCFLAGS) -ccbin=g++-5.4.0 --std=c++11
+NVCCFLAGS:=$(NVCCFLAGS) -g --std=c++11
 
 if BUILDCUDA
 CUDA_SOURCES=../../openfpm_devices/src/memory/CudaMemory.cu
@@ -10,14 +10,14 @@ else
 endif
 
 noinst_PROGRAMS = mem_map
-mem_map_SOURCES = ../../openfpm_devices/src/Memleak_check.cpp main.cpp $(CUDA_SOURCES) ../../openfpm_devices/src/memory/HeapMemory.cpp ../../openfpm_devices/src/memory/PtrMemory.cpp Grid/cuda_gpu_compute.cu
+mem_map_SOURCES = ../../openfpm_devices/src/Memleak_check.cpp main.cpp Grid/gpu_test/cuda_gpu_func.cpp $(CUDA_SOURCES) ../../openfpm_devices/src/memory/HeapMemory.cpp ../../openfpm_devices/src/memory/PtrMemory.cpp Grid/gpu_test/cuda_grid_unit_tests.cu
 mem_map_CXXFLAGS = $(AM_CXXFLAGS) $(LIBHILBERT_INCLUDE) $(CUDA_CFLAGS) $(INCLUDES_PATH) $(BOOST_CPPFLAGS) -I/usr/local/include -I/usr/local/libhilbert/include
 mem_map_CFLAGS = $(CUDA_CFLAGS)
 mem_map_LDADD = $(LINKLIBS)
 
 nobase_include_HEADERS= data_type/scalar.hpp data_type/aggregate.hpp \
 Graph/graph_unit_tests.hpp Graph/map_graph.hpp \
-Grid/comb.hpp Grid/grid_base_implementation.hpp Grid/grid_pack_unpack.ipp Grid/grid_base_impl_layout.hpp Grid/grid_common.hpp Grid/grid_gpu.hpp Grid/Encap.hpp Grid/grid_key.hpp Grid/grid_key_dx_expression_unit_tests.hpp Grid/grid_key_expression.hpp Grid/grid_sm.hpp Grid/grid_unit_tests.hpp Grid/grid_util_test.hpp Grid/map_grid.hpp Grid/se_grid.hpp Grid/util.hpp \
+Grid/comb.hpp Grid/grid_base_implementation.hpp Grid/grid_pack_unpack.ipp Grid/grid_base_impl_layout.hpp Grid/grid_common.hpp Grid/Encap.hpp Grid/grid_key.hpp Grid/grid_key_dx_expression_unit_tests.hpp Grid/grid_key_expression.hpp Grid/grid_sm.hpp Grid/grid_unit_tests.hpp Grid/grid_util_test.hpp Grid/map_grid.hpp Grid/se_grid.hpp Grid/util.hpp \
 Grid/iterators/grid_key_dx_iterator_sp.hpp Grid/grid_key_dx_iterator_hilbert.hpp Grid/iterators/stencil_type.hpp Grid/iterators/grid_key_dx_iterator_sub_bc.hpp Grid/iterators/grid_key_dx_iterator_sub.hpp Grid/iterators/grid_key_dx_iterator.hpp Grid/iterators/grid_skin_iterator.hpp \
 Point_test.hpp \
 Point_orig.hpp \
diff --git a/src/Vector/map_vector.hpp b/src/Vector/map_vector.hpp
index f7200ff1..3ec5727d 100644
--- a/src/Vector/map_vector.hpp
+++ b/src/Vector/map_vector.hpp
@@ -653,8 +653,8 @@ namespace openfpm
 	 *
 	 */
-	template <unsigned int p, typename r_type=decltype(std::declval<const grid_cpu<1,T,Memory,layout>>().template get<p>(grid_key_dx<1>(0)))>
-	inline const r_type get(size_t id) const
+	template <unsigned int p>
+	inline auto get(size_t id) const -> decltype(base.template get<p>(grid_key_dx<1>(0)))
 	{
 #ifdef SE_CLASS2
 		check_valid(this,8);
@@ -763,7 +763,8 @@ namespace openfpm
 	 *
 	 */
-	template <unsigned int p, typename r_type=decltype(std::declval<grid_cpu<1,T,Memory,layout>>().template get<p>(grid_key_dx<1>(0)))> inline r_type get(size_t id)
+	template <unsigned int p>
+	inline auto get(size_t id) -> decltype(base.template get<p>(grid_key_dx<1>(0)))
 	{
 #ifdef SE_CLASS2
 		check_valid(this,8);
diff --git a/src/memory_ly/memory_array.hpp b/src/memory_ly/memory_array.hpp
index 2e8e9256..c1b7f036 100644
--- a/src/memory_ly/memory_array.hpp
+++ b/src/memory_ly/memory_array.hpp
@@ -98,6 +98,16 @@ class memory_array
 		return ptr;
 	}
 
+	/*! \brief Set from another memory array
+	 *
+	 * \param ref the other memory_array to bind to
+	 *
+	 */
+	void bind_ref(const memory_array<T> & ref)
+	{
+		ptr = ref.ptr;
+	}
+
 	/*! \brief Access element an element of the array
 	 *
 	 * \param i element to access
diff --git a/src/memory_ly/memory_c.hpp b/src/memory_ly/memory_c.hpp
index c4c7bb16..92cc4c08 100644
--- a/src/memory_ly/memory_c.hpp
+++ b/src/memory_ly/memory_c.hpp
@@ -76,6 +76,23 @@ class memory_c<T,MEMORY_C_STANDARD,D>
 		this->mem = &mem;
 	}
 
+	/*! \brief This function binds this memory_c to another memory_c as a reference
+	 *
+	 * Binding as reference means that this object does not create new memory,
+	 * but uses the one from ref instead.
+	 *
+	 */
+	bool bind_ref(const memory_c<T,MEMORY_C_STANDARD,D> & ref)
+	{
+		mem = ref.mem;
+		mem->incRef();
+
+		//! we create the representation for the memory buffer
+		mem_r = ref.mem_r;
+
+		return true;
+	}
+
 	/*! \brief This function get the object that allocate memory
 	 *
 	 * \return memory object to allocate memory
@@ -87,6 +104,14 @@ class memory_c<T,MEMORY_C_STANDARD,D>
 		return *this->mem;
 	}
 
+	/*! \brief Switch the pointer to device pointer
+	 *
+	 */
+	void switchToDevicePtr()
+	{
+		mem_r.set_pointer(mem->getDevicePointer());
+	}
+
 	/*! \brief This function get the object that allocate memory
 	 *
 	 * \return memory object to allocate memory
@@ -276,6 +301,52 @@ static inline std::array<size_type ,dim> zero_dims()
 	return dimensions;
 }
 
+template<unsigned int dim, typename size_type>
+struct array_ord
+{
+};
+
+template<typename size_type>
+struct array_ord<4,size_type>
+{
+	static constexpr size_type data[4] = {0,3,2,1};
+};
+
+template<typename size_type>
+struct array_ord<3,size_type>
+{
+	static constexpr size_type data[3] = {0,2,1};
+};
+
+template<typename size_type>
+struct array_ord<2,size_type>
+{
+	static constexpr size_type data[2] = {0,1};
+};
+
+template<unsigned int dim>
+struct array_asc
+{
+};
+
+template<>
+struct array_asc<4>
+{
+	static constexpr bool data[4] = {true,true,true,true};
+};
+
+template<>
+struct array_asc<3>
+{
+	static constexpr bool data[3] = {true,true,true};
+};
+
+template<>
+struct array_asc<2>
+{
+	static constexpr bool data[2] = {true,true};
+};
+
 
 /*! \brief Specialization of memory_c for multi_array
  *
@@ -331,12 +402,21 @@ class memory_c<multi_array<T>, MEMORY_C_STANDARD, D>
 	//! define size_type
 	typedef typename boost::multi_array_ref<base,size_p::value>::size_type size_type;
 
+//#ifdef __NVCC__
+
+	array_ord<size_p::value,typename boost::multi_array<T,size_p::value>::size_type> ord;
+
+	array_asc<size_p::value> asc;
+
+//#else
+
 	//! we generate the ordering buffer ord::data = {0,N-1 ...... 1 }
-	typedef typename generate_array<typename boost::multi_array<T,size_p::value>::size_type,size_p::value, ordering>::result ord;
+//	typedef typename generate_array<typename boost::multi_array<T,size_p::value>::size_type,size_p::value, ordering>::result ord;
 
 	// we generate the ascending buffer
-	typedef typename generate_array<bool,size_p::value, ascending>::result asc;
+//	typedef typename generate_array<bool,size_p::value, ascending>::result asc;
+
+//#endif
 
 public:
 
@@ -370,6 +450,31 @@ class memory_c<multi_array<T>, MEMORY_C_STANDARD, D>
 		return *this->mem;
 	}
 
+	/*! \brief Switch the pointer to device pointer
+	 *
+	 */
+	void switchToDevicePtr()
+	{
+		mem_r.set_pointer(mem->getDevicePointer());
+	}
+
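+	/* Layout note (illustrative): with the {0,N-1,...,1} ordering used by
+	 * this specialization, dimension 0 (the grid linearization) varies
+	 * fastest, so every tensor component of the property occupies a
+	 * contiguous span of memory; e.g. a vector property v[d] over a grid of
+	 * sz points is stored as d contiguous arrays of sz elements each.
+	 */
+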
+	/*! \brief This function binds this memory_c to another memory_c as a reference
+	 *
+	 * Binding as reference means that this object does not create new memory,
+	 * but uses the one from ref instead.
+	 *
+	 */
+	bool bind_ref(const memory_c<multi_array<T>, MEMORY_C_STANDARD, D> & ref)
+	{
+		mem = ref.mem;
+		mem->incRef();
+
+		//! we create the representation for the memory buffer
+		mem_r = ref.mem_r;
+
+		return true;
+	}
+
 	/*! \brief This function allocate memory and associate the representation to mem_r
 	 *
 	 * This function allocate memory and associate the representation of that chunk of
@@ -394,7 +499,7 @@ class memory_c<multi_array<T>, MEMORY_C_STANDARD, D>
 		for (int i = 0 ; i < size_p::value-1 ; i++)
 		{dimensions[i+1] = dims::data[i];}
 
-		boost::multi_array_ref_openfpm<base,size_p::value> tmp(static_cast<base *>(mem->getPointer()),dimensions,boost::general_storage_order<size_p::value>(ord::data,asc::data));
+		boost::multi_array_ref_openfpm<base,size_p::value> tmp(static_cast<base *>(mem->getPointer()),dimensions,boost::general_storage_order<size_p::value>(boost::ofp_storage_order()));
 
 		//! we create the representation for the memory buffer
 		mem_r.swap(tmp);
@@ -415,7 +520,8 @@ class memory_c<multi_array<T>, MEMORY_C_STANDARD, D>
 	//! constructor
 	memory_c()
 	:mem(NULL),
-	 mem_r(static_cast<base *>(NULL),zero_dims<size_type,size_p::value>(),boost::general_storage_order<size_p::value>(ord::data,asc::data))
+	 mem_r(static_cast<base *>(NULL),zero_dims<size_type,size_p::value>(),
+		   boost::ofp_storage_order())
 	{}
 
 	//! destructor
diff --git a/src/memory_ly/memory_conf.hpp b/src/memory_ly/memory_conf.hpp
index 41bdfdde..1553523d 100644
--- a/src/memory_ly/memory_conf.hpp
+++ b/src/memory_ly/memory_conf.hpp
@@ -94,7 +94,7 @@ struct memory_traits_inte
  *
  *
  */
-template<typename T>
+/*template<typename T>
 struct memory_traits_inte_red
 {
 	//! for each element in the vector interleave memory_c
@@ -102,7 +102,7 @@ struct memory_traits_inte_red
 
 	//! indicate that it change the memory layout from the original
 	typedef int yes_is_inte;
-};
+};*/
 
 /*! \brief small meta-function to get the type of the memory
  *
diff --git a/src/util/boost_multi_array_openfpm.hpp b/src/util/boost_multi_array_openfpm.hpp
index f72bb5df..8aa9b899 100644
--- a/src/util/boost_multi_array_openfpm.hpp
+++ b/src/util/boost_multi_array_openfpm.hpp
@@ -42,6 +42,90 @@
 namespace boost {
 
+  // RG - This is to make things work with VC++. So sad, so sad.
+  class c_storage_order;
+  class fortran_storage_order;
+  class ofp_storage_order;
+
+  template <std::size_t NumDims>
+  class general_storage_order_ofp
+  {
+  public:
+    typedef detail::multi_array::size_type size_type;
+    template <typename OrderingIter, typename AscendingIter>
+    general_storage_order_ofp(OrderingIter ordering,
+                              AscendingIter ascending) {
+      boost::detail::multi_array::copy_n(ordering,NumDims,ordering_.begin());
+      boost::detail::multi_array::copy_n(ascending,NumDims,ascending_.begin());
+    }
+
+    // RG - ideally these would not be necessary, but some compilers
+    // don't like template conversion operators.  I suspect that not
+    // too many folk will feel the need to use customized
+    // storage_order objects, I sacrifice that feature for compiler support.
+    general_storage_order_ofp(const c_storage_order&) {
+      for (size_type i=0; i != NumDims; ++i) {
+        ordering_[i] = NumDims - 1 - i;
+      }
+      ascending_.assign(true);
+    }
+
+    general_storage_order_ofp(const fortran_storage_order&) {
+      for (size_type i=0; i != NumDims; ++i) {
+        ordering_[i] = i;
+      }
+      ascending_.assign(true);
+    }
+
+    general_storage_order_ofp(const ofp_storage_order&) {
+      // ordering {0, NumDims-1, ..., 1}: dimension 0 varies fastest
+      ordering_[0] = 0;
+
+      for (size_type i=1; i != NumDims; ++i) {
+        ordering_[i] = NumDims - i;
+      }
+      ascending_.assign(true);
+    }
+
+    size_type ordering(size_type dim) const { return ordering_[dim]; }
+    bool ascending(size_type dim) const { return ascending_[dim]; }
+
+    bool all_dims_ascending() const {
+      return std::accumulate(ascending_.begin(),ascending_.end(),true,
+                             std::logical_and<bool>());
+    }
+
+    bool operator==(general_storage_order_ofp const& rhs) const {
+      return (ordering_ == rhs.ordering_) &&
+             (ascending_ == rhs.ascending_);
+    }
+
+  protected:
+    boost::array<size_type,NumDims> ordering_;
+    boost::array<bool,NumDims> ascending_;
+  };
+
+  class ofp_storage_order
+  {
+    typedef detail::multi_array::size_type size_type;
+  public:
+    // This is the idiom for creating your own custom storage orders.
+    // Not supported by all compilers though!
+#ifndef __MWERKS__ // Metrowerks screams "ambiguity!"
+    template <std::size_t NumDims>
+    operator general_storage_order<NumDims>() const {
+      boost::array<size_type,NumDims> ordering;
+      boost::array<bool,NumDims> ascending;
+
+      ordering[0] = 0;
+      ascending[0] = true;
+
+      for (size_type i=1; i != NumDims; ++i) {
+        ordering[i] = NumDims - i;
+        ascending[i] = true;
+      }
+      return general_storage_order<NumDims>(ordering.begin(),
+                                            ascending.begin());
+    }
+#endif
+  };
+
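+  /* Example (illustrative): for NumDims = 4 the two paths above both yield
+   * the ordering {0,3,2,1} with every dimension ascending, i.e. dimension 0
+   * is the fastest-varying index. This matches the array_ord tables in
+   * memory_c.hpp and gives the interleaved (SoA-like) grid layout of
+   * memory_traits_inte.
+   */
+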
 
 template <typename T, std::size_t NumDims>
 class multi_array_ref_openfpm :
@@ -102,7 +186,7 @@ public:
 
                           const detail::multi_array::
                           extent_gen<NumDims>& ranges,
-                          const general_storage_order<NumDims>& so) :
+                          const general_storage_order_ofp<NumDims>& so) :
     super_type(base,ranges,so) {
   }
 
@@ -136,9 +220,35 @@ public:
     return *this;
  }
 
+  multi_array_ref_openfpm & bind_ref(const multi_array_ref_openfpm & other) {
+    if (&other != this) {
+
+      this->base_ = other.base_;
+      this->storage_ = other.storage_;
+      this->extent_list_ = other.extent_list_;
+      this->stride_list_ = other.stride_list_;
+      this->index_base_list_ = other.index_base_list_;
+      this->origin_offset_ = other.origin_offset_;
+      this->directional_offset_ = other.directional_offset_;
+      this->num_elements_ = other.num_elements_;
+
+    }
+    return *this;
+  }
+
+  /* \brief Set the internal pointer
+   *
+   * \param base internal pointer
+   *
+   */
+  void set_pointer(void * base)
+  {
+    this->base_ = static_cast<T *>(base);
+  }
+
   multi_array_ref_openfpm & operator=(multi_array_ref_openfpm && other)
   {
-    this->base_ = other.base;
+    this->base_ = other.base_;
     this->storage_ = other.storage_;
     this->extent_list_ = other.extent_list_;
    this->stride_list_ = other.stride_list_;
diff --git a/src/util/ct_array.hpp b/src/util/ct_array.hpp
index 93e21ce1..a66b84b4 100644
--- a/src/util/ct_array.hpp
+++ b/src/util/ct_array.hpp
@@ -18,6 +18,7 @@
 #ifndef CT_ARRAY_HPP_
 #define CT_ARRAY_HPP_
 
+
 #include <boost/fusion/mpl.hpp>
 
 /////////////////////////////////////////////////// Make indexes ///////////////////////
@@ -73,14 +74,14 @@ struct generate_indexes {
 ///////////////////////////////////////////////////
 
 #ifndef COVERTY_SCAN
-#if __CUDACC_VER_MAJOR__ >= 9
 
 //! the array itself
 template<class T, unsigned long... args>
 struct ArrayHolder_constexpr {
 	//! compile-time array
-	static constexpr T data[sizeof...(args)] = { args... };
+	constexpr static T data[sizeof...(args)] = { args... };
 };
 
+
 //! recursive meta-function to generate compile-time array
 template<class T,size_t N, size_t orig_N, template<size_t,size_t> class F, unsigned... args>
 struct generate_array_constexpr_impl {
@@ -121,7 +122,6 @@ struct generate_array_constexpr {
 	typedef typename generate_array_constexpr_impl<T,N-1, N, F>::result result;
 };
 
-#endif
 #endif
 
 //////////////////////////////////////////////////
-- 
GitLab