From 5e99cd3b07f8745c0e3d9228211be49f2f34b754 Mon Sep 17 00:00:00 2001
From: Pietro Incardona <incardon@mpi-cbg.de>
Date: Tue, 5 Jun 2018 01:57:22 +0200
Subject: [PATCH] Bringing up GPU grid computation

---
 src/Grid/gpu_test/cuda_gpu_func.cpp        |  97 ++++++++
 src/Grid/gpu_test/cuda_grid_unit_tests.cu  |  95 ++++++++
 src/Grid/gpu_test/cuda_grid_unit_tests.cuh |  19 ++
 src/Grid/grid_base_impl_layout.hpp         |   5 -
 src/Grid/grid_base_implementation.hpp      |  79 ++++++-
 src/Grid/grid_gpu.hpp                      | 190 ---------------
 src/Grid/grid_key.hpp                      |  23 +-
 src/Grid/grid_sm.hpp                       |  10 +-
 src/Grid/grid_unit_tests.hpp               | 162 ++++++++++++-
 src/Grid/map_grid.hpp                      | 261 ++++++++++++++++++++-
 src/Makefile.am                            |   6 +-
 src/Vector/map_vector.hpp                  |   7 +-
 src/memory_ly/memory_array.hpp             |  10 +
 src/memory_ly/memory_c.hpp                 | 114 ++++++++-
 src/memory_ly/memory_conf.hpp              |   4 +-
 src/util/boost_multi_array_openfpm.hpp     | 114 ++++++++-
 src/util/ct_array.hpp                      |   6 +-
 17 files changed, 975 insertions(+), 227 deletions(-)
 create mode 100644 src/Grid/gpu_test/cuda_gpu_func.cpp
 create mode 100644 src/Grid/gpu_test/cuda_grid_unit_tests.cu
 create mode 100644 src/Grid/gpu_test/cuda_grid_unit_tests.cuh
 delete mode 100644 src/Grid/grid_gpu.hpp

diff --git a/src/Grid/gpu_test/cuda_gpu_func.cpp b/src/Grid/gpu_test/cuda_gpu_func.cpp
new file mode 100644
index 00000000..6c89ffed
--- /dev/null
+++ b/src/Grid/gpu_test/cuda_gpu_func.cpp
@@ -0,0 +1,97 @@
+/*
+ * cuda_gpu_func.cpp
+ *
+ *  Created on: Jun 3, 2018
+ *      Author: i-bird
+ */
+
+#include "config.h"
+#define BOOST_TEST_DYN_LINK
+#include <boost/test/unit_test.hpp>
+#include "Grid/map_grid.hpp"
+#include "Point_test.hpp"
+
+BOOST_AUTO_TEST_SUITE( grid_gpu_func_test )
+
+
+BOOST_AUTO_TEST_CASE (gpu_computation_func)
+{
+#ifdef CUDA_GPU
+
+	size_t sz[3] = {64,64,64};
+	grid_gpu<3, Point_test<float> > c3(sz);
+
+	grid_key_dx<3> k1({1,1,1});
+	grid_key_dx<3> k2({62,62,62});
+
+	c3.setMemory();
+
+	auto gcf = c3.getGPUIterator(k1,k2);
+
+	BOOST_REQUIRE_EQUAL(gcf.thr.x,16ul);
+	BOOST_REQUIRE_EQUAL(gcf.thr.y,8ul);
+	BOOST_REQUIRE_EQUAL(gcf.thr.z,8ul);
+
+	BOOST_REQUIRE_EQUAL(gcf.wthr.x,4ul);
+	BOOST_REQUIRE_EQUAL(gcf.wthr.y,8ul);
+	BOOST_REQUIRE_EQUAL(gcf.wthr.z,8ul);
+
+	grid_key_dx<3> k3({50,50,50});
+	grid_key_dx<3> k4({62,62,62});
+	grid_key_dx<3> k5({60,61,62});
+
+	auto gcf2 = c3.getGPUIterator(k3,k4);
+
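+	// The range {50..62}^3 spans 13 points per axis. With the default
+	// n_thr = 1024 the iterator picks a 16x8x8 block, so the block grid
+	// must be ceil(13/16) x ceil(13/8) x ceil(13/8) = 1x2x2, which is
+	// what the checks below verify.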
+
+	BOOST_REQUIRE_EQUAL(gcf2.thr.x,16ul);
+	BOOST_REQUIRE_EQUAL(gcf2.thr.y,8ul);
+	BOOST_REQUIRE_EQUAL(gcf2.thr.z,8ul);
+
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.x,1ul);
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.y,2ul);
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.z,2ul);
+
+	gcf2 = c3.getGPUIterator(k3,k4,511);
+
+	BOOST_REQUIRE_EQUAL(gcf2.thr.x,8ul);
+	BOOST_REQUIRE_EQUAL(gcf2.thr.y,8ul);
+	BOOST_REQUIRE_EQUAL(gcf2.thr.z,4ul);
+
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.x,2ul);
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.y,2ul);
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.z,4ul);
+
+	gcf2 = c3.getGPUIterator(k3,k4,1);
+
+	BOOST_REQUIRE_EQUAL(gcf2.thr.x,1ul);
+	BOOST_REQUIRE_EQUAL(gcf2.thr.y,1ul);
+	BOOST_REQUIRE_EQUAL(gcf2.thr.z,1ul);
+
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.x,13ul);
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.y,13ul);
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.z,13ul);
+
+	gcf2 = c3.getGPUIterator(k3,k5,32);
+
+	BOOST_REQUIRE_EQUAL(gcf2.thr.x,4ul);
+	BOOST_REQUIRE_EQUAL(gcf2.thr.y,4ul);
+	BOOST_REQUIRE_EQUAL(gcf2.thr.z,2ul);
+
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.x,3ul);
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.y,3ul);
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.z,7ul);
+
+	gcf2 = c3.getGPUIterator(k3,k5,1);
+
+	BOOST_REQUIRE_EQUAL(gcf2.thr.x,1ul);
+	BOOST_REQUIRE_EQUAL(gcf2.thr.y,1ul);
+	BOOST_REQUIRE_EQUAL(gcf2.thr.z,1ul);
+
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.x,11ul);
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.y,12ul);
+	BOOST_REQUIRE_EQUAL(gcf2.wthr.z,13ul);
+
+#endif
+}
+
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/Grid/gpu_test/cuda_grid_unit_tests.cu b/src/Grid/gpu_test/cuda_grid_unit_tests.cu
new file mode 100644
index 00000000..4218dc74
--- /dev/null
+++ b/src/Grid/gpu_test/cuda_grid_unit_tests.cu
@@ -0,0 +1,95 @@
+#include "config.h"
+#include <Grid/map_grid.hpp>
+#include "Point_test.hpp"
+
+// note: kernel arguments must be passed by value, a host-side reference is
+// not dereferenceable on the device
+__global__ void compute_stencil_grid(grid_gpu_ker<3,Point_test<float>> g1, grid_gpu_ker<3,Point_test<float>> g2, ite_gpu ite)
+{
+	GRID_ID_3(ite);
+
+	g2.template get<0>(key) = g1.template get<0>(key.move(0,1)) + g1.template get<0>(key.move(0,-1)) +
+	                          g1.template get<0>(key.move(1,1)) + g1.template get<0>(key.move(1,-1)) +
+	                          g1.template get<0>(key.move(2,1)) + g1.template get<0>(key.move(2,-1)) -
+	                          6.0*g1.template get<0>(key);
+}
+
+__global__ void compute_stencil(float * prp_0, float * prp_1, int sz, grid_key_dx<3> start, grid_key_dx<3> stop)
+{
+	GRID_ID_3_TRAW(start,stop);
+
+	prp_1[tz*sz*sz + ty*sz + tx] = prp_0[tz*sz*sz + ty*sz + tx + 1] + prp_0[tz*sz*sz + ty*sz + tx - 1] +
+	                               prp_0[tz*sz*sz + (ty + 1)*sz + tx] + prp_0[tz*sz*sz + (ty - 1)*sz + tx] +
+	                               prp_0[(tz + 1)*sz*sz + ty*sz + tx] + prp_0[(tz - 1)*sz*sz + ty*sz + tx] -
+	                               6.0*prp_0[tz*sz*sz + ty*sz + tx];
+}
+
+__global__ void fill_one(float * prp_0,int sz)
+{
+	// Thread index
+	int tx = threadIdx.x + blockIdx.x * blockDim.x;
+	int ty = threadIdx.y + blockIdx.y * blockDim.y;
+	int tz = threadIdx.z + blockIdx.z * blockDim.z;
+
+	prp_0[tz*sz*sz + ty*sz + tx] = 1.0f;
+}
+
+__global__ void fill_count(float * prp_0,int sz)
+{
+	// Thread index
+	int tx = threadIdx.x + blockIdx.x * blockDim.x;
+	int ty = threadIdx.y + blockIdx.y * blockDim.y;
+	int tz = threadIdx.z + blockIdx.z * blockDim.z;
+
+	prp_0[tz*sz*sz + ty*sz + tx] = tz*sz*sz + ty*sz + tx;
+}
+
+// call compute
+
+void gpu_grid_3D_one(grid_gpu<3,Point_test<float>> & g)
+{
+	// Setup execution parameters
+	dim3 threads(8,8,8);
+	dim3 grid(8,8,8);
+
+	float * prp_0 = (float *)g.getDeviceBuffer<0>();
+
+	fill_one<<< grid, threads >>>(prp_0,64);
+}
+
+// call compute
+
+void gpu_grid_3D_compute(grid_gpu<3,Point_test<float>> & g)
+{
+	// Setup execution parameters
+	dim3 threads(8,8,8);
+	dim3 grid(8,8,8);
+
+	float * prp_0 = (float *)g.getDeviceBuffer<0>();
+
+	fill_count<<< grid, threads >>>(prp_0,64);
+}
+
+void gpu_grid_3D_compute_stencil(grid_gpu<3,Point_test<float>> & g1, grid_gpu<3,Point_test<float>> & g2,
+                                 grid_key_dx<3> & start, grid_key_dx<3> & stop)
+{
+	// Setup execution parameters
+
+	float * prp_0 = (float *)g1.getDeviceBuffer<0>();
+	float * prp_1 = (float *)g2.getDeviceBuffer<0>();
+
+	auto gpu_it = g2.getGPUIterator(start,stop);
+
+	// launch configuration is <<< blocks, threads per block >>>
+	compute_stencil<<< gpu_it.wthr, gpu_it.thr >>>(prp_0,prp_1,64,start,stop);
+}
+
+void gpu_grid_3D_compute_grid_stencil(grid_gpu<3,Point_test<float>> & g1, grid_gpu<3,Point_test<float>> & g2,
+                                      grid_key_dx<3> & start, grid_key_dx<3> & stop)
+{
+	auto gpu_it = g2.getGPUIterator(start,stop);
+
+	auto g1k = g1.toGPU();
+	auto g2k = g2.toGPU();
+
+	compute_stencil_grid<<< gpu_it.wthr, gpu_it.thr >>>(g1k,g2k,gpu_it);
+}
+
diff --git a/src/Grid/gpu_test/cuda_grid_unit_tests.cuh b/src/Grid/gpu_test/cuda_grid_unit_tests.cuh
new file mode 100644
index 00000000..98f068b8
--- /dev/null
+++ b/src/Grid/gpu_test/cuda_grid_unit_tests.cuh
@@ -0,0 +1,19 @@
+/*
+ * cuda_grid_unit_tests.cuh
+ *
+ *  Created on: Sep 29, 2017
+ *      Author: i-bird
+ */
+
+#ifndef OPENFPM_DATA_SRC_GRID_GPU_TEST_CUDA_GRID_UNIT_TESTS_CUH_
+#define OPENFPM_DATA_SRC_GRID_GPU_TEST_CUDA_GRID_UNIT_TESTS_CUH_
+
+
+void gpu_grid_3D_compute(grid_gpu<3,Point_test<float>> & g);
+void gpu_grid_3D_compute_stencil(grid_gpu<3,Point_test<float>> & g1, grid_gpu<3,Point_test<float>> & g2,
+                                 grid_key_dx<3> & start, grid_key_dx<3> & stop);
+void gpu_grid_3D_one(grid_gpu<3,Point_test<float>> & g);
+void gpu_grid_3D_compute_grid_stencil(grid_gpu<3,Point_test<float>> & g1, grid_gpu<3,Point_test<float>> & g2,
+                                      grid_key_dx<3> & start, grid_key_dx<3> & stop);
+
+#endif /* OPENFPM_DATA_SRC_GRID_GPU_TEST_CUDA_GRID_UNIT_TESTS_CUH_ */
diff --git a/src/Grid/grid_base_impl_layout.hpp b/src/Grid/grid_base_impl_layout.hpp
index 2eb3b31c..8fe5f29d 100644
--- a/src/Grid/grid_base_impl_layout.hpp
+++ b/src/Grid/grid_base_impl_layout.hpp
@@ -68,11 +68,6 @@ struct frswap
 	}
 };
 
-#ifdef __NVCC__
-#else
-#define __host__
-#define __device__
-#endif
 
 //! Case memory_traits_lin
 template<unsigned int p, typename layout, typename data_type, typename g1_type, typename key_type, unsigned int sel = 2*is_layout_mlin<layout>::value + is_layout_inte<layout>::value >
diff --git a/src/Grid/grid_base_implementation.hpp b/src/Grid/grid_base_implementation.hpp
index 1482644d..6f52caf8 100644
--- a/src/Grid/grid_base_implementation.hpp
+++ b/src/Grid/grid_base_implementation.hpp
@@ -10,7 +10,45 @@
 
 #include "grid_base_impl_layout.hpp"
 
-#ifdef __NVCC__
+
+#ifdef CUDA_GPU
+
+#ifndef __NVCC__
+#undef __host__
+#undef __device__
+#include <vector_types.h>
+#endif
+
+#define GRID_ID_3_RAW(start,stop) int x[3] = {threadIdx.x + blockIdx.x * blockDim.x + start.get(0),\
+                                              threadIdx.y + blockIdx.y * blockDim.y + start.get(1),\
+                                              threadIdx.z + blockIdx.z * blockDim.z + start.get(2)};\
+                                  \
+                                  if (x[0] > stop.get(0) || x[1] > stop.get(1) || x[2] > stop.get(2))\
+                                  {return;}
+
+#define GRID_ID_3_TRAW(start,stop) int tx = threadIdx.x + blockIdx.x * blockDim.x + start.get(0);\
+                                   int ty = threadIdx.y + blockIdx.y * blockDim.y + start.get(1);\
+                                   int tz = threadIdx.z + blockIdx.z * blockDim.z + start.get(2);\
+                                   \
+                                   if (tx > stop.get(0) || ty > stop.get(1) || tz > stop.get(2))\
+                                   {return;}
+
+#define GRID_ID_3(ite_gpu) grid_key_dx<3> key;\
+                           key.set_d(0,threadIdx.x + blockIdx.x * blockDim.x + ite_gpu.start.get(0));\
+                           key.set_d(1,threadIdx.y + blockIdx.y * blockDim.y + ite_gpu.start.get(1));\
+                           key.set_d(2,threadIdx.z + blockIdx.z * blockDim.z + ite_gpu.start.get(2));\
+                           \
+                           if (key.get(0) > ite_gpu.stop.get(0) || key.get(1) > ite_gpu.stop.get(1) || key.get(2) > ite_gpu.stop.get(2))\
+                           {return;}
+
+struct ite_gpu
+{
+	dim3 thr;
+	dim3 wthr;
+
+	grid_key_dx<3> start;
+	grid_key_dx<3> stop;
+};
 #else
 #define __host__
 #define __device__
@@ -328,6 +366,45 @@ public:
 		return grid_new;
 	}
 
+#ifdef CUDA_GPU
+
+	/*! \brief Get an object that describes how to decompose the points
+	 *         between key1 and key2 into CUDA blocks and threads
+	 *
+	 * \param key1 start point of the sub-grid
+	 * \param key2 stop point of the sub-grid (inclusive)
+	 * \param n_thr maximum number of threads per block
+	 *
+	 */
+	struct ite_gpu getGPUIterator(grid_key_dx<dim> & key1, grid_key_dx<dim> & key2, size_t n_thr = 1024)
+	{
+		size_t n = n_thr;
+
+		// Work to do
+		ite_gpu ig;
+
+		ig.thr.x = 1;
+		ig.thr.y = 1;
+		ig.thr.z = 1;
+
+		int dir = 0;
+
+		// distribute one factor of two at a time across x, y and z
+		while (n != 1)
+		{
+			if (dir % 3 == 0)
+			{ig.thr.x = ig.thr.x << 1;}
+			else if (dir % 3 == 1)
+			{ig.thr.y = ig.thr.y << 1;}
+			else if (dir % 3 == 2)
+			{ig.thr.z = ig.thr.z << 1;}
+
+			n = n >> 1;
+
+			dir++;
+		}
+
+		// number of blocks: extent on each axis divided by the block size,
+		// rounded up
+		ig.wthr.x = (key2.get(0) - key1.get(0) + 1) / ig.thr.x + (((key2.get(0) - key1.get(0) + 1)%ig.thr.x != 0)?1:0);
+		ig.wthr.y = (key2.get(1) - key1.get(1) + 1) / ig.thr.y + (((key2.get(1) - key1.get(1) + 1)%ig.thr.y != 0)?1:0);
+		ig.wthr.z = (key2.get(2) - key1.get(2) + 1) / ig.thr.z + (((key2.get(2) - key1.get(2) + 1)%ig.thr.z != 0)?1:0);
+
+		ig.start = key1;
+		ig.stop = key2;
+
+		return ig;
+	}
+
+#endif
+
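+	/* Usage sketch (illustrative, my_kernel is a hypothetical name): the
+	 * returned ite_gpu pairs with the GRID_ID_3 macro above.
+	 *
+	 *   auto ite = g.getGPUIterator(k1,k2);
+	 *   my_kernel<<< ite.wthr, ite.thr >>>(g.toGPU(),ite); // blocks, then threads per block
+	 *
+	 * Inside my_kernel, GRID_ID_3(ite) materializes the grid_key_dx<3> key of
+	 * the point assigned to the calling thread and returns early for threads
+	 * that fall outside [start,stop].
+	 */
+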
 	/*! \brief Return the internal grid information
 	 *
 	 * Return the internal grid information
diff --git a/src/Grid/grid_gpu.hpp b/src/Grid/grid_gpu.hpp
deleted file mode 100644
index 9c59b641..00000000
--- a/src/Grid/grid_gpu.hpp
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * grid_gpu.hpp
- *
- *  Created on: Oct 31, 2015
- *      Author: i-bird
- */
-
-#ifndef OPENFPM_DATA_SRC_GRID_GRID_GPU_HPP_
-#define OPENFPM_DATA_SRC_GRID_GRID_GPU_HPP_
-
-/*! \brief this class is a functor for "for_each" algorithm
- *
- * This class is a functor for "for_each" algorithm. For each
- * element of the boost::vector the operator() is called.
- * Is mainly used to copy one encap into another encap object
- *
- * \tparam encap source
- * \tparam encap dst
- *
- */
-
-template<typename T_type>
-struct copy_memory_c
-{
-	//! encapsulated source object
-	const typename memory_traits_inte<T_type>::type & src;
-	//! encapsulated destination object
-	typename memory_traits_inte_red<T_type>::type & dst;
-
-
-	/*! \brief constructor
-	 *
-	 * \param src source encapsulated object
-	 * \param dst source encapsulated object
-	 *
-	 */
-	inline copy_memory_c(const typename memory_traits_inte<T_type>::type & src,
-		                 typename memory_traits_inte_red<T_type>::type & dst)
-	:src(src),dst(dst)
-	{
-	};
-
-
-	//! It call the copy function for each property
-	template<typename T>
-	inline void operator()(T& t) const
-	{
-		boost::fusion::at_c<T::value>(dst).mem_r = boost::fusion::at_c<T::value>(src).mem_r;
-	}
-};
-
-struct dim3_
-{
-	//! size in x dimension
-	unsigned int x;
-
-	//! size in y dimension
-	unsigned int y;
-
-	//! size in z dimension
-	unsigned int z;
-};
-
-template<unsigned int dim>
-struct device_grid
-{
-	//! number of treads in each block
-	dim3_ threads;
-
-	//! number of grid for the kernel execution
-	dim3_ grids;
-};
-
-
-/*! \brief This is an N-dimensional grid or an N-dimensional array with memory_traits_inte layout
- *
- * it is basically an N-dimensional Cartesian grid
- *
- * \tparam dim Dimensionality of the grid
- * \tparam T type of object the grid store
- * \tparam Mem memory layout
- *
- * ### Definition and allocation of a 3D grid on GPU memory
- * \snippet grid_unit_tests.hpp Definition and allocation of a 3D grid on GPU memory
- * ### Access a grid c3 of size sz on each direction
- * \snippet grid_unit_tests.hpp Access a grid c3 of size sz on each direction
- * ### Access to an N-dimensional grid with an iterator
- * \snippet grid_unit_tests.hpp Access to an N-dimensional grid with an iterator
- *
- */
-template<unsigned int dim, typename T, typename S>
-class grid_cpu<dim,T,S,typename memory_traits_inte<T>::type> : public grid_base_impl<dim,T,S,typename memory_traits_inte<T>::type, memory_traits_inte>
-{
-	//! grid layout
-	typedef typename memory_traits_inte<T>::type layout;
-
-public:
-
-	//! Object container for T, it is the return type of get_o it return a object type trough
-	// you can access all the properties of T
-	typedef typename grid_base_impl<dim,T,S,typename memory_traits_inte<T>::type, memory_traits_inte>::container container;
-
-	//! Default constructor
-	inline grid_cpu() THROW
-	:grid_base_impl<dim,T,S,layout,memory_traits_inte>()
-	{
-	}
-
-	/*! \brief create a grid from another grid
-	 *
-	 * \param g the grid to copy
-	 *
-	 */
-	inline grid_cpu(const grid_cpu & g) THROW
-	:grid_base_impl<dim,T,S,layout,memory_traits_inte>(g)
-	{
-	}
-
-	/*! \brief create a grid of size sz on each direction
-	 *
-	 * \param sz grid size in each direction
-	 *
-	 */
-	inline grid_cpu(const size_t & sz) THROW
-	:grid_base_impl<dim,T,S,layout,memory_traits_inte>(sz)
-	{
-	}
-
-	//! Constructor allocate memory and give them a representation
-	inline grid_cpu(const size_t (& sz)[dim]) THROW
-	:grid_base_impl<dim,T,S,layout,memory_traits_inte>(sz)
-	{
-	}
-
-	/*! \brief It return the properties arrays.
-	 *
-	 * In case of Cuda memory it return the device pointers to pass to the kernels
-	 *
-	 *
-	 */
-	template<unsigned int id> void * getDeviceBuffer()
-	{
-		return boost::fusion::at_c<id>(this->data_).mem->getDevicePointer();
-	}
-
-	/*! \brief Synchronize the memory buffer in the device with the memory in the host
-	 *
-	 *
-	 */
-	template<unsigned int id> void deviceToHost()
-	{
-		return boost::fusion::at_c<id>(this->data_).mem->deviceToHost();
-	}
-
-	/*! \brief Convert the grid into a data-structure compatible for computing into GPU
-	 *
-	 * The object created can be considered like a reference of the original
-	 *
-	 */
-	grid_cpu<dim,T,S,typename memory_traits_inte<T>::type> toGPU()
-	{
-		grid_cpu<dim,T,S,typename memory_traits_inte<T>::type> g;
-		copy_memory_c<T> cp_mc(this->data_,g.data_);
-
-		boost::mpl::for_each_ref< boost::mpl::range_c<int,0,T::max_prop> >(cp_mc);
-
-		return g;
-	}
-
-	/*! \brief When we switch to GPU mode the data-structure can be safely given to
-	 *         a kernel for computation
-	 *
-	 *
-	 */
-	void switchToGPU()
-	{
-
-	}
-
-	void switchToCPU()
-	{
-
-	}
-};
-
-//! short formula for a grid on gpu
-template <unsigned int dim, typename T> using grid_gpu = grid_cpu<dim,T,CudaMemory,typename memory_traits_inte<T>::type>;
-
-
-#endif /* OPENFPM_DATA_SRC_GRID_GRID_GPU_HPP_ */
diff --git a/src/Grid/grid_key.hpp b/src/Grid/grid_key.hpp
index 875f24fb..33225037 100644
--- a/src/Grid/grid_key.hpp
+++ b/src/Grid/grid_key.hpp
@@ -31,7 +31,7 @@ public:
 	}
 
 	//! Constructor
-	inline grid_key_dx()
+	__device__ __host__ inline grid_key_dx()
 	{}
 
 	/*! \brief Constructor from initializer list
@@ -56,7 +56,7 @@ public:
 	 * \param key copy constructor
 	 *
 	 */
-	inline grid_key_dx(const grid_key_dx<dim> & key)
+	__device__ __host__ inline grid_key_dx(const grid_key_dx<dim> & key)
 	:grid_key_dx(key.k)
 	{
 	}
@@ -66,7 +66,7 @@ public:
 	 * \param k reference buffer
 	 *
 	 */
-	inline grid_key_dx(const size_t (&k)[dim])
+	__device__ __host__ inline grid_key_dx(const size_t (&k)[dim])
 	{
 		for (size_t i = 0 ; i < dim ; i++)
 			this->k[i] = k[i];
@@ -77,7 +77,7 @@ public:
 	 * \param k reference buffer
 	 *
 	 */
-	inline grid_key_dx(const long int (&k)[dim])
+	__device__ __host__ inline grid_key_dx(const long int (&k)[dim])
 	{
 		for (size_t i = 0 ; i < dim ; i++)
 			this->k[i] = k[i];
@@ -110,6 +110,15 @@ public:
 		invert_assign(t...);
 	}
 
+	/*! \brief Return a key shifted by m along dimension i
+	 *
+	 * \param i dimension to move along
+	 * \param m displacement (can be negative)
+	 *
+	 */
+	__device__ __host__ inline grid_key_dx<dim> move(int i, int m)
+	{
+		grid_key_dx<dim> tmp = *this;
+
+		tmp.k[i] += m;
+
+		return tmp;
+	}
+
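+	// Example (illustrative): for a grid_key_dx<3> key at (x,y,z),
+	// key.move(0,1) is (x+1,y,z) and key.move(2,-1) is (x,y,z-1);
+	// stencil kernels use this as g.template get<0>(key.move(d,s))
+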
 	/*! \brief Set to zero the key
 	 *
 	 */
@@ -380,7 +389,7 @@ public:
 	 * \return the index value
 	 *
 	 */
-	mem_id get(size_t i) const
+	__device__ __host__ mem_id get(size_t i) const
 	{
 		return k[i];
 	}
@@ -393,9 +402,9 @@ public:
 	 * \param id value to set
 	 *
 	 */
-	void set_d(size_t i, mem_id id)
+	__device__ __host__ void set_d(size_t i, mem_id id)
 	{
-#ifdef DEBUG
+#ifdef SE_CLASS1
 		if (i >= dim)
 			std::cerr << "grid_key_dx error: " << __FILE__ << " " << __LINE__ << " try to access dimension " << i << " on a grid_key_dx of size " << dim << "\n";
diff --git a/src/Grid/grid_sm.hpp b/src/Grid/grid_sm.hpp
index 6ee9c259..40637246 100755
--- a/src/Grid/grid_sm.hpp
+++ b/src/Grid/grid_sm.hpp
@@ -295,7 +295,8 @@ public:
 	 * \return The linearization of the gk key shifted by c, or -1 if the check fail
 	 */
 
-	template<typename check=NoCheck> inline mem_id LinId(const grid_key_dx<N> & gk, const char sum_id[N]) const
+	template<typename check=NoCheck>
+	inline mem_id LinId(const grid_key_dx<N> & gk, const char sum_id[N]) const
 	{
 		mem_id lid;
 
@@ -330,7 +331,8 @@ public:
 	 * \return The linearization of the gk key shifted by c, or -1 if the check fail
 	 */
 
-	template<typename check=NoCheck> inline mem_id LinId(const grid_key_dx<N> & gk, const char sum_id[N], const size_t (&bc)[N]) const
+	template<typename check=NoCheck>
+	inline mem_id LinId(const grid_key_dx<N> & gk, const char sum_id[N], const size_t (&bc)[N]) const
 	{
 		mem_id lid;
 
@@ -397,7 +399,7 @@ public:
 	 *
 	 */
 
-	inline mem_id LinId(const size_t (& k)[N]) const
+	__device__ __host__ inline mem_id LinId(const size_t (& k)[N]) const
 	{
 		mem_id lid = k[0];
 		for (mem_id i = 1 ; i < N ; i++)
@@ -418,7 +420,7 @@ public:
 	 *
 	 */
 
-	inline mem_id LinId(const grid_key_dx<N> & gk) const
+	__device__ __host__ inline mem_id LinId(const grid_key_dx<N> & gk) const
 	{
 		mem_id lid = gk.k[0];
 		for (mem_id i = 1 ; i < N ; i++)
diff --git a/src/Grid/grid_unit_tests.hpp b/src/Grid/grid_unit_tests.hpp
index 49a95eb7..9e41a165 100644
--- a/src/Grid/grid_unit_tests.hpp
+++ b/src/Grid/grid_unit_tests.hpp
@@ -7,7 +7,7 @@
 #include "Space/Shape/HyperCube.hpp"
 #include "timer.hpp"
 #include "grid_util_test.hpp"
-#include "cuda_gpu_compute.cuh"
+#include "gpu_test/cuda_grid_unit_tests.cuh"
 
 #ifdef TEST_COVERAGE_MODE
 #define GS_SIZE 8
@@ -784,7 +784,165 @@ BOOST_AUTO_TEST_CASE (gpu_computation)
 	BOOST_REQUIRE_EQUAL(good,true);
 
-	gpu_grid_3D_fill(c3);
 	}
 
 	#endif
 }
+
+BOOST_AUTO_TEST_CASE (gpu_computation_stencil)
+{
+	#ifdef CUDA_GPU
+
+	{
+	size_t sz[3] = {64,64,64};
+	grid_gpu<3, Point_test<float> > c3(sz);
+	grid_gpu<3, Point_test<float> > c2(sz);
+	grid_key_dx<3> key1({1,1,1});
+	grid_key_dx<3> key2({62,62,62});
+
+
+	c3.setMemory();
+	c2.setMemory();
+	test_layout_gridNd<3>(c3,sz[0]);
+	test_layout_gridNd<3>(c2,sz[0]);
+
+	gpu_grid_3D_one(c2);
+
+	// Check that property 0 is 1.0
+	c2.deviceToHost<0>();
+
+	{
+	auto it = c2.getIterator();
+
+	bool good = true;
+	while(it.isNext())
+	{
+		auto key = it.get();
+
+		good &= c2.get<0>(key) == 1.0;
+
+		++it;
+	}
+
+	BOOST_REQUIRE_EQUAL(good,true);
+	}
+
+	gpu_grid_3D_compute(c3);
+	c3.deviceToHost<0>();
+
+	{
+	auto it = c3.getIterator();
+
+	bool good = true;
+	while(it.isNext())
+	{
+		auto key = it.get();
+
+		good &= c3.getGrid().LinId(key) == c3.get<0>(key);
+
+		++it;
+	}
+
+	BOOST_REQUIRE_EQUAL(good,true);
+	}
+
+	gpu_grid_3D_compute_stencil(c3,c2,key1,key2);
+
+	c2.deviceToHost<0>();
+
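+	// The interior must now be exactly zero: property 0 of c3 holds the
+	// linearization i = z*sz*sz + y*sz + x, which is linear in x, y and z,
+	// and the 6-point stencil annihilates linear data:
+	// (i+1)+(i-1)+(i+sz)+(i-sz)+(i+sz*sz)+(i-sz*sz) - 6*i = 0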
+	auto it = c2.getIterator(key1,key2);
+
+	bool good = true;
+	while(it.isNext())
+	{
+		auto key = it.get();
+
+		good &= c2.get<0>(key) == 0;
+
+		++it;
+	}
+
+	BOOST_REQUIRE_EQUAL(good,true);
+
+	}
+
+	#endif
+}
+
+BOOST_AUTO_TEST_CASE (gpu_computation_grid_stencil)
+{
+	#ifdef CUDA_GPU
+
+	{
+	size_t sz[3] = {64,64,64};
+	grid_gpu<3, Point_test<float> > c3(sz);
+	grid_gpu<3, Point_test<float> > c2(sz);
+	grid_key_dx<3> key1({1,1,1});
+	grid_key_dx<3> key2({62,62,62});
+
+
+	c3.setMemory();
+	c2.setMemory();
+	test_layout_gridNd<3>(c3,sz[0]);
+	test_layout_gridNd<3>(c2,sz[0]);
+
+	gpu_grid_3D_one(c2);
+
+	// Check that property 0 is 1.0
+	c2.deviceToHost<0>();
+
+	{
+	auto it = c2.getIterator();
+
+	bool good = true;
+	while(it.isNext())
+	{
+		auto key = it.get();
+
+		good &= c2.get<0>(key) == 1.0;
+
+		++it;
+	}
+
+	BOOST_REQUIRE_EQUAL(good,true);
+	}
+
+	gpu_grid_3D_compute(c3);
+	c3.deviceToHost<0>();
+
+	{
+	auto it = c3.getIterator();
+
+	bool good = true;
+	while(it.isNext())
+	{
+		auto key = it.get();
+
+		good &= c3.getGrid().LinId(key) == c3.get<0>(key);
+
+		++it;
+	}
+
+	BOOST_REQUIRE_EQUAL(good,true);
+	}
+
+	gpu_grid_3D_compute_grid_stencil(c3,c2,key1,key2);
+
+	c2.deviceToHost<0>();
+
+	auto it = c2.getIterator(key1,key2);
+
+	bool good = true;
+	while(it.isNext())
+	{
+		auto key = it.get();
+
+		good &= c2.get<0>(key) == 0;
+
+		++it;
+	}
+
+	BOOST_REQUIRE_EQUAL(good,true);
+
+	}
+
+	#endif
+}
diff --git a/src/Grid/map_grid.hpp b/src/Grid/map_grid.hpp
index 185d538d..3ae30d18 100755
--- a/src/Grid/map_grid.hpp
+++ b/src/Grid/map_grid.hpp
@@ -3,6 +3,12 @@
 
 #include "config.h"
 
+#ifndef CUDA_GPU
+#include <boost/config/compiler/nvcc.hpp>
+#define BOOST_FUSION_GPU_ENABLED
+#define BOOST_GPU_ENABLED
+#endif
+
 //! Warning: apparently you cannot used nested boost::mpl with boost::fusion
 //! can create template circularity, this include avoid the problem
 #include "util/object_util.hpp"
@@ -58,6 +64,7 @@
 typedef HeapMemory CudaMemory;
 
 #endif
 
+
 /*! Stub grid class
  *
 */
@@ -173,7 +180,259 @@ public:
 	}
 };
 
-#include "grid_gpu.hpp"
+
+/*! \brief this class is a functor for "for_each" algorithm
+ *
+ * This class is a functor for "for_each" algorithm. For each
+ * element of the boost::vector the operator() is called.
+ * It is mainly used to bind the buffers of one object to another,
+ * switching them to their device pointers
+ *
+ * \tparam T_type type of the aggregate of properties
+ *
+ */
+template<typename T_type>
+struct copy_switch_memory_c
+{
+	//! encapsulated source object
+	const typename memory_traits_inte<T_type>::type & src;
+	//! encapsulated destination object
+	typename memory_traits_inte<T_type>::type & dst;
+
+
+	/*! \brief constructor
+	 *
+	 * \param src source encapsulated object
+	 * \param dst destination encapsulated object
+	 *
+	 */
+	inline copy_switch_memory_c(const typename memory_traits_inte<T_type>::type & src,
+		                        typename memory_traits_inte<T_type>::type & dst)
+	:src(src),dst(dst)
+	{
+	};
+
+
+	//! It call the copy function for each property
+	template<typename T>
+	inline void operator()(T& t) const
+	{
+		boost::fusion::at_c<T::value>(dst).mem = boost::fusion::at_c<T::value>(src).mem;
+		boost::fusion::at_c<T::value>(dst).mem_r.bind_ref(boost::fusion::at_c<T::value>(src).mem_r);
+		boost::fusion::at_c<T::value>(dst).switchToDevicePtr();
+	}
+};
+
+/*! \brief grid interface available when on gpu
+ *
+ * \tparam dim Dimensionality of the grid
+ * \tparam T type of object the grid store
+ *
+ */
+template<unsigned int dim, typename T>
+struct grid_gpu_ker
+{
+	//! grid information
+	grid_sm<dim,void> g1;
+
+	//! type of layout of the structure
+	typedef typename memory_traits_inte<T>::type layout;
+
+	//! layout data
+	layout data_;
+
+	grid_gpu_ker()
+	{}
+
+	grid_gpu_ker(const grid_gpu_ker & cpy)
+	:g1(cpy.g1)
+	{
+		copy_switch_memory_c<T> bp_mc(cpy.data_,this->data_);
+
+		boost::mpl::for_each_ref< boost::mpl::range_c<int,0,T::max_prop> >(bp_mc);
+	}
+
+	/*! \brief Get the reference of the selected element
+	 *
+	 * \param v1 grid_key that identify the element in the grid
+	 *
+	 * \return the reference of the element
+	 *
+	 */
+	template <unsigned int p, typename r_type=decltype(mem_get<p,memory_traits_inte<T>,layout,grid_sm<dim,T>,grid_key_dx<dim>>::get(data_,g1,grid_key_dx<dim>()))>
+	__device__ inline r_type get(const grid_key_dx<dim> & v1)
+	{
+		return mem_get<p,memory_traits_inte<T>,decltype(this->data_),decltype(this->g1),decltype(v1)>::get(data_,g1,v1);
+	}
+
+	/*! \brief Get the const reference of the selected element
+	 *
+	 * \param v1 grid_key that identify the element in the grid
+	 *
+	 * \return the const reference of the element
+	 *
+	 */
+	template <unsigned int p, typename r_type=decltype(mem_get<p,memory_traits_inte<T>,layout,grid_sm<dim,T>,grid_key_dx<dim>>::get(data_,g1,grid_key_dx<dim>()))>
+	__device__ inline const r_type get(const grid_key_dx<dim> & v1) const
+	{
+		return mem_get<p,memory_traits_inte<T>,decltype(this->data_),decltype(this->g1),decltype(v1)>::get(data_,g1,v1);
+	}
+
+	/*! \brief Get the reference of the selected element
+	 *
+	 * \param lin_id linearized element that identify the element in the grid
+	 *
+	 * \return the reference of the element
+	 *
+	 */
+	template <unsigned int p, typename r_type=decltype(mem_get<p,memory_traits_inte<T>,layout,grid_sm<dim,T>,grid_key_dx<dim>>::get_lin(data_,g1,0))>
+	__device__ inline r_type get(const size_t lin_id)
+	{
+		return mem_get<p,memory_traits_inte<T>,decltype(this->data_),decltype(this->g1),grid_key_dx<dim>>::get_lin(data_,g1,lin_id);
+	}
+
+	/*! \brief Get the const reference of the selected element
+	 *
+	 * \param lin_id linearized element that identify the element in the grid
+	 *
+	 * \return the const reference of the element
+	 *
+	 */
+	template <unsigned int p, typename r_type=decltype(mem_get<p,memory_traits_inte<T>,layout,grid_sm<dim,T>,grid_key_dx<dim>>::get_lin(data_,g1,0))>
+	__device__ inline const r_type get(size_t lin_id) const
+	{
+		return mem_get<p,memory_traits_inte<T>,decltype(this->data_),decltype(this->g1),grid_key_dx<dim>>::get_lin(data_,g1,lin_id);
+	}
+};
+
+struct dim3_
+{
+	//! size in x dimension
+	unsigned int x;
+
+	//! size in y dimension
+	unsigned int y;
+
+	//! size in z dimension
+	unsigned int z;
+};
+
+template<unsigned int dim>
+struct device_grid
+{
+	//! number of threads in each block
+	dim3_ threads;
+
+	//! number of blocks for the kernel execution
+	dim3_ grids;
+};
+
+
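+/* Usage sketch (illustrative): a grid_gpu_ker is the kernel-side view of a
+ * grid_gpu; it is obtained with toGPU() and passed to kernels by value, the
+ * copy constructor above rebinding every property buffer to its device
+ * pointer. A hypothetical kernel:
+ *
+ *   __global__ void set_one(grid_gpu_ker<3,Point_test<float>> g, ite_gpu ite)
+ *   {
+ *       GRID_ID_3(ite);
+ *       g.template get<0>(key) = 1.0f;
+ *   }
+ *
+ *   set_one<<< ite.wthr, ite.thr >>>(g.toGPU(),ite);
+ */
+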
+/*! \brief This is an N-dimensional grid or an N-dimensional array with memory_traits_inte layout
+ *
+ * it is basically an N-dimensional Cartesian grid
+ *
+ * \tparam dim Dimensionality of the grid
+ * \tparam T type of object the grid store
+ * \tparam Mem memory layout
+ *
+ * ### Definition and allocation of a 3D grid on GPU memory
+ * \snippet grid_unit_tests.hpp Definition and allocation of a 3D grid on GPU memory
+ * ### Access a grid c3 of size sz on each direction
+ * \snippet grid_unit_tests.hpp Access a grid c3 of size sz on each direction
+ * ### Access to an N-dimensional grid with an iterator
+ * \snippet grid_unit_tests.hpp Access to an N-dimensional grid with an iterator
+ *
+ */
+template<unsigned int dim, typename T, typename S>
+class grid_cpu<dim,T,S,typename memory_traits_inte<T>::type> : public grid_base_impl<dim,T,S,typename memory_traits_inte<T>::type, memory_traits_inte>
+{
+	//! grid layout
+	typedef typename memory_traits_inte<T>::type layout;
+
+public:
+
+	//! Object container for T, it is the return type of get_o; through it
+	//! you can access all the properties of T
+	typedef typename grid_base_impl<dim,T,S,typename memory_traits_inte<T>::type, memory_traits_inte>::container container;
+
+	//! Default constructor
+	inline grid_cpu() THROW
+	:grid_base_impl<dim,T,S,layout,memory_traits_inte>()
+	{
+	}
+
+	/*! \brief create a grid from another grid
+	 *
+	 * \param g the grid to copy
+	 *
+	 */
+	inline grid_cpu(const grid_cpu & g) THROW
+	:grid_base_impl<dim,T,S,layout,memory_traits_inte>(g)
+	{
+	}
+
+	/*! \brief create a grid of size sz on each direction
+	 *
+	 * \param sz grid size in each direction
+	 *
+	 */
+	inline grid_cpu(const size_t & sz) THROW
+	:grid_base_impl<dim,T,S,layout,memory_traits_inte>(sz)
+	{
+	}
+
+	//! Constructor allocate memory and give them a representation
+	inline grid_cpu(const size_t (& sz)[dim]) THROW
+	:grid_base_impl<dim,T,S,layout,memory_traits_inte>(sz)
+	{
+	}
+
+	/*! \brief It returns the property arrays.
+	 *
+	 * In case of CUDA memory it returns the device pointers to pass to the kernels
+	 *
+	 *
+	 */
+	template<unsigned int id> void * getDeviceBuffer()
+	{
+		return boost::fusion::at_c<id>(this->data_).mem->getDevicePointer();
+	}
+
+	/*! \brief Synchronize the memory buffer in the device with the memory in the host
+	 *
+	 *
+	 */
+	template<unsigned int id> void deviceToHost()
+	{
+		return boost::fusion::at_c<id>(this->data_).mem->deviceToHost();
+	}
+
+
+
+#ifdef CUDA_GPU
+
+	/*! \brief Convert the grid into a data-structure compatible for computing into GPU
+	 *
+	 * The object created can be considered like a reference of the original
+	 *
+	 */
+	grid_gpu_ker<dim,T> toGPU()
+	{
+		grid_gpu_ker<dim,T> g;
+		copy_switch_memory_c<T> cp_mc(this->data_,g.data_);
+
+		boost::mpl::for_each_ref< boost::mpl::range_c<int,0,T::max_prop> >(cp_mc);
+
+		return g;
+	}
+
+#endif
+};
+
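+/* Typical host-side round trip (sketch; property 0 is assumed to be a float
+ * scalar, as in Point_test<float>):
+ *
+ *   grid_gpu<3,Point_test<float>> g(sz);
+ *   g.setMemory();
+ *   float * d_prp0 = (float *)g.template getDeviceBuffer<0>(); // device pointer
+ *   // ... launch a kernel writing into d_prp0 ...
+ *   g.template deviceToHost<0>();                              // read the result back
+ */
+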
+//! short formula for a grid on gpu
+template <unsigned int dim, typename T> using grid_gpu = grid_cpu<dim,T,CudaMemory,typename memory_traits_inte<T>::type>;
 
 #endif
diff --git a/src/Makefile.am b/src/Makefile.am
index 598077cc..ba342189 100755
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,7 +1,7 @@
 LINKLIBS = $(LIBHILBERT_LIB) $(PTHREAD_LIBS) $(OPT_LIBS) $(BOOST_LDFLAGS) $(BOOST_PROGRAM_OPTIONS_LIB) $(CUDA_LIBS) $(BOOST_IOSTREAMS_LIB) $(LOCAL_LIBS) $(BOOST_UNIT_TEST_FRAMEWORK_LIB) $(BOOST_CHRONO_LIB) $(BOOST_TIMER_LIB) $(BOOST_SYSTEM_LIB)
 
-NVCCFLAGS:=$(NVCCFLAGS) -ccbin=g++-5.4.0 --std=c++11
+NVCCFLAGS:=$(NVCCFLAGS) -g --std=c++11
 
 if BUILDCUDA
 CUDA_SOURCES=../../openfpm_devices/src/memory/CudaMemory.cu
@@ -10,14 +10,14 @@ else
 endif
 
 noinst_PROGRAMS = mem_map
-mem_map_SOURCES = ../../openfpm_devices/src/Memleak_check.cpp main.cpp $(CUDA_SOURCES) ../../openfpm_devices/src/memory/HeapMemory.cpp ../../openfpm_devices/src/memory/PtrMemory.cpp Grid/cuda_gpu_compute.cu
+mem_map_SOURCES = ../../openfpm_devices/src/Memleak_check.cpp main.cpp Grid/gpu_test/cuda_gpu_func.cpp $(CUDA_SOURCES) ../../openfpm_devices/src/memory/HeapMemory.cpp ../../openfpm_devices/src/memory/PtrMemory.cpp Grid/gpu_test/cuda_grid_unit_tests.cu
 mem_map_CXXFLAGS = $(AM_CXXFLAGS) $(LIBHILBERT_INCLUDE) $(CUDA_CFLAGS) $(INCLUDES_PATH) $(BOOST_CPPFLAGS) -I/usr/local/include -I/usr/local/libhilbert/include
 mem_map_CFLAGS = $(CUDA_CFLAGS)
 mem_map_LDADD = $(LINKLIBS)
 
 nobase_include_HEADERS= data_type/scalar.hpp data_type/aggregate.hpp \
 Graph/graph_unit_tests.hpp Graph/map_graph.hpp \
-Grid/comb.hpp Grid/grid_base_implementation.hpp Grid/grid_pack_unpack.ipp Grid/grid_base_impl_layout.hpp Grid/grid_common.hpp Grid/grid_gpu.hpp Grid/Encap.hpp Grid/grid_key.hpp Grid/grid_key_dx_expression_unit_tests.hpp Grid/grid_key_expression.hpp Grid/grid_sm.hpp Grid/grid_unit_tests.hpp Grid/grid_util_test.hpp Grid/map_grid.hpp Grid/se_grid.hpp Grid/util.hpp \
+Grid/comb.hpp Grid/grid_base_implementation.hpp Grid/grid_pack_unpack.ipp Grid/grid_base_impl_layout.hpp Grid/grid_common.hpp Grid/Encap.hpp Grid/grid_key.hpp Grid/grid_key_dx_expression_unit_tests.hpp Grid/grid_key_expression.hpp Grid/grid_sm.hpp Grid/grid_unit_tests.hpp Grid/grid_util_test.hpp Grid/map_grid.hpp Grid/se_grid.hpp Grid/util.hpp \
 Grid/iterators/grid_key_dx_iterator_sp.hpp Grid/grid_key_dx_iterator_hilbert.hpp Grid/iterators/stencil_type.hpp Grid/iterators/grid_key_dx_iterator_sub_bc.hpp Grid/iterators/grid_key_dx_iterator_sub.hpp Grid/iterators/grid_key_dx_iterator.hpp Grid/iterators/grid_skin_iterator.hpp \
 Point_test.hpp \
 Point_orig.hpp \
diff --git a/src/Vector/map_vector.hpp b/src/Vector/map_vector.hpp
index f7200ff1..3ec5727d 100644
--- a/src/Vector/map_vector.hpp
+++ b/src/Vector/map_vector.hpp
@@ -653,8 +653,8 @@ namespace openfpm
 	 *
 	 */
-	template <unsigned int p, typename r_type=decltype(std::declval<const grid_cpu<1,T,Memory,layout>>().template get<p>(grid_key_dx<1>(0)))>
-	inline const r_type get(size_t id) const
+	template <unsigned int p>
+	inline auto get(size_t id) const -> decltype(base.template get<p>(grid_key_dx<1>(0)))
 	{
 #ifdef SE_CLASS2
 		check_valid(this,8);
@@ -763,7 +763,8 @@ namespace openfpm
 	 *
 	 */
-	template <unsigned int p, typename r_type=decltype(std::declval<grid_cpu<1,T,Memory,layout>>().template get<p>(grid_key_dx<1>(0)))> inline r_type get(size_t id)
+	template <unsigned int p>
+	inline auto get(size_t id) -> decltype(base.template get<p>(grid_key_dx<1>(0)))
 	{
 #ifdef SE_CLASS2
 		check_valid(this,8);
diff --git a/src/memory_ly/memory_array.hpp b/src/memory_ly/memory_array.hpp
index 2e8e9256..c1b7f036 100644
--- a/src/memory_ly/memory_array.hpp
+++ b/src/memory_ly/memory_array.hpp
@@ -98,6 +98,16 @@ class memory_array
 		return ptr;
 	}
 
+	/*! \brief Set from another memory array
+	 *
+	 * \param ref the other memory_array to bind to
+	 *
+	 */
+	void bind_ref(const memory_array<T> & ref)
+	{
+		ptr = ref.ptr;
+	}
+
 	/*! \brief Access element an element of the array
 	 *
 	 * \param i element to access
diff --git a/src/memory_ly/memory_c.hpp b/src/memory_ly/memory_c.hpp
index c4c7bb16..92cc4c08 100644
--- a/src/memory_ly/memory_c.hpp
+++ b/src/memory_ly/memory_c.hpp
@@ -76,6 +76,23 @@ class memory_c<T,MEMORY_C_STANDARD,D>
 		this->mem = &mem;
 	}
 
+	/*! \brief This function binds this memory_c to another memory_c as a reference
+	 *
+	 * Binding as reference means that this object does not create new memory,
+	 * but uses the one from ref instead.
+	 *
+	 */
+	bool bind_ref(const memory_c<T,MEMORY_C_STANDARD,D> & ref)
+	{
+		mem = ref.mem;
+		mem->incRef();
+
+		//! we create the representation for the memory buffer
+		mem_r = ref.mem_r;
+
+		return true;
+	}
+
 	/*! \brief This function get the object that allocate memory
 	 *
 	 * \return memory object to allocate memory
@@ -87,6 +104,14 @@ class memory_c<T,MEMORY_C_STANDARD,D>
 		return *this->mem;
 	}
 
+	/*! \brief Switch the pointer to device pointer
+	 *
+	 */
+	void switchToDevicePtr()
+	{
+		mem_r.set_pointer(mem->getDevicePointer());
+	}
+
 	/*! \brief This function get the object that allocate memory
 	 *
 	 * \return memory object to allocate memory
@@ -276,6 +301,52 @@ static inline std::array<size_type ,dim> zero_dims()
 	return dimensions;
 }
 
+template<unsigned int dim, typename size_type>
+struct array_ord
+{
+};
+
+template<typename size_type>
+struct array_ord<4,size_type>
+{
+	static constexpr size_type data[4] = {0,3,2,1};
+};
+
+template<typename size_type>
+struct array_ord<3,size_type>
+{
+	static constexpr size_type data[3] = {0,2,1};
+};
+
+template<typename size_type>
+struct array_ord<2,size_type>
+{
+	static constexpr size_type data[2] = {0,1};
+};
+
+template<unsigned int dim>
+struct array_asc
+{
+};
+
+template<>
+struct array_asc<4>
+{
+	static constexpr bool data[4] = {true,true,true,true};
+};
+
+template<>
+struct array_asc<3>
+{
+	static constexpr bool data[3] = {true,true,true};
+};
+
+template<>
+struct array_asc<2>
+{
+	static constexpr bool data[2] = {true,true};
+};
+
 
 /*! \brief Specialization of memory_c for multi_array
  *
@@ -331,12 +402,21 @@ class memory_c<multi_array<T>, MEMORY_C_STANDARD, D>
 	//! define size_type
 	typedef typename boost::multi_array_ref<base,size_p::value>::size_type size_type;
 
+//#ifdef __NVCC__
+
+	array_ord<size_p::value,typename boost::multi_array<T,size_p::value>::size_type> ord;
+
+	array_asc<size_p::value> asc;
+
+//#else
+
 	//! we generate the ordering buffer ord::data = {0,N-1 ...... 1 }
-	typedef typename generate_array<typename boost::multi_array<T,size_p::value>::size_type,size_p::value, ordering>::result ord;
+//	typedef typename generate_array<typename boost::multi_array<T,size_p::value>::size_type,size_p::value, ordering>::result ord;
 
 	// we generate the ascending buffer
-	typedef typename generate_array<bool,size_p::value, ascending>::result asc;
+//	typedef typename generate_array<bool,size_p::value, ascending>::result asc;
+
+//#endif
 
 public:
 
@@ -370,6 +450,31 @@ class memory_c<multi_array<T>, MEMORY_C_STANDARD, D>
 		return *this->mem;
 	}
 
+	/*! \brief Switch the pointer to device pointer
+	 *
+	 */
+	void switchToDevicePtr()
+	{
+		mem_r.set_pointer(mem->getDevicePointer());
+	}
+
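+	/* Layout note (illustrative): with the {0,N-1,...,1} ordering used by
+	 * this specialization, dimension 0 (the grid linearization) varies
+	 * fastest, so every tensor component of the property occupies a
+	 * contiguous span of memory; e.g. a vector property v[d] over a grid of
+	 * sz points is stored as d contiguous arrays of sz elements each.
+	 */
+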
+	/*! \brief This function binds this memory_c to another memory_c as a reference
+	 *
+	 * Binding as reference means that this object does not create new memory,
+	 * but uses the one from ref instead.
+	 *
+	 */
+	bool bind_ref(const memory_c<multi_array<T>, MEMORY_C_STANDARD, D> & ref)
+	{
+		mem = ref.mem;
+		mem->incRef();
+
+		//! we create the representation for the memory buffer
+		mem_r = ref.mem_r;
+
+		return true;
+	}
+
 	/*! \brief This function allocate memory and associate the representation to mem_r
 	 *
 	 * This function allocate memory and associate the representation of that chunk of
@@ -394,7 +499,7 @@ class memory_c<multi_array<T>, MEMORY_C_STANDARD, D>
 		for (int i = 0 ; i < size_p::value-1 ; i++)
 		{dimensions[i+1] = dims::data[i];}
 
-		boost::multi_array_ref_openfpm<base,size_p::value> tmp(static_cast<base *>(mem->getPointer()),dimensions,boost::general_storage_order<size_p::value>(ord::data,asc::data));
+		boost::multi_array_ref_openfpm<base,size_p::value> tmp(static_cast<base *>(mem->getPointer()),dimensions,boost::general_storage_order<size_p::value>(boost::ofp_storage_order()));
 
 		//! we create the representation for the memory buffer
 		mem_r.swap(tmp);
@@ -415,7 +520,8 @@ class memory_c<multi_array<T>, MEMORY_C_STANDARD, D>
 	//! constructor
 	memory_c()
 	:mem(NULL),
-	 mem_r(static_cast<base *>(NULL),zero_dims<size_type,size_p::value>(),boost::general_storage_order<size_p::value>(ord::data,asc::data))
+	 mem_r(static_cast<base *>(NULL),zero_dims<size_type,size_p::value>(),
+		   boost::ofp_storage_order())
 	{}
 
 	//! destructor
diff --git a/src/memory_ly/memory_conf.hpp b/src/memory_ly/memory_conf.hpp
index 41bdfdde..1553523d 100644
--- a/src/memory_ly/memory_conf.hpp
+++ b/src/memory_ly/memory_conf.hpp
@@ -94,7 +94,7 @@ struct memory_traits_inte
  *
  *
  */
-template<typename T>
+/*template<typename T>
 struct memory_traits_inte_red
 {
 	//! for each element in the vector interleave memory_c
@@ -102,7 +102,7 @@ struct memory_traits_inte_red
 
 	//! indicate that it change the memory layout from the original
 	typedef int yes_is_inte;
-};
+};*/
 
 /*! \brief small meta-function to get the type of the memory
  *
diff --git a/src/util/boost_multi_array_openfpm.hpp b/src/util/boost_multi_array_openfpm.hpp
index f72bb5df..8aa9b899 100644
--- a/src/util/boost_multi_array_openfpm.hpp
+++ b/src/util/boost_multi_array_openfpm.hpp
@@ -42,6 +42,90 @@
 namespace boost {
 
+  // RG - This is to make things work with VC++. So sad, so sad.
+  class c_storage_order;
+  class fortran_storage_order;
+  class ofp_storage_order;
+
+  template <std::size_t NumDims>
+  class general_storage_order_ofp
+  {
+  public:
+    typedef detail::multi_array::size_type size_type;
+    template <typename OrderingIter, typename AscendingIter>
+    general_storage_order_ofp(OrderingIter ordering,
+                              AscendingIter ascending) {
+      boost::detail::multi_array::copy_n(ordering,NumDims,ordering_.begin());
+      boost::detail::multi_array::copy_n(ascending,NumDims,ascending_.begin());
+    }
+
+    // RG - ideally these would not be necessary, but some compilers
+    // don't like template conversion operators.  I suspect that not
+    // too many folk will feel the need to use customized
+    // storage_order objects, I sacrifice that feature for compiler support.
+    general_storage_order_ofp(const c_storage_order&) {
+      for (size_type i=0; i != NumDims; ++i) {
+        ordering_[i] = NumDims - 1 - i;
+      }
+      ascending_.assign(true);
+    }
+
+    general_storage_order_ofp(const fortran_storage_order&) {
+      for (size_type i=0; i != NumDims; ++i) {
+        ordering_[i] = i;
+      }
+      ascending_.assign(true);
+    }
+
+    general_storage_order_ofp(const ofp_storage_order&) {
+      // ordering {0, NumDims-1, ..., 1}: dimension 0 varies fastest
+      ordering_[0] = 0;
+
+      for (size_type i=1; i != NumDims; ++i) {
+        ordering_[i] = NumDims - i;
+      }
+      ascending_.assign(true);
+    }
+
+    size_type ordering(size_type dim) const { return ordering_[dim]; }
+    bool ascending(size_type dim) const { return ascending_[dim]; }
+
+    bool all_dims_ascending() const {
+      return std::accumulate(ascending_.begin(),ascending_.end(),true,
+                             std::logical_and<bool>());
+    }
+
+    bool operator==(general_storage_order_ofp const& rhs) const {
+      return (ordering_ == rhs.ordering_) &&
+             (ascending_ == rhs.ascending_);
+    }
+
+  protected:
+    boost::array<size_type,NumDims> ordering_;
+    boost::array<bool,NumDims> ascending_;
+  };
+
+  class ofp_storage_order
+  {
+    typedef detail::multi_array::size_type size_type;
+  public:
+    // This is the idiom for creating your own custom storage orders.
+    // Not supported by all compilers though!
+#ifndef __MWERKS__ // Metrowerks screams "ambiguity!"
+    template <std::size_t NumDims>
+    operator general_storage_order<NumDims>() const {
+      boost::array<size_type,NumDims> ordering;
+      boost::array<bool,NumDims> ascending;
+
+      ordering[0] = 0;
+      ascending[0] = true;
+
+      for (size_type i=1; i != NumDims; ++i) {
+        ordering[i] = NumDims - i;
+        ascending[i] = true;
+      }
+      return general_storage_order<NumDims>(ordering.begin(),
+                                            ascending.begin());
+    }
+#endif
+  };
+
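+  /* Example (illustrative): for NumDims = 4 the two paths above both yield
+   * the ordering {0,3,2,1} with every dimension ascending, i.e. dimension 0
+   * is the fastest-varying index. This matches the array_ord tables in
+   * memory_c.hpp and gives the interleaved (SoA-like) grid layout of
+   * memory_traits_inte.
+   */
+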
 
 template <typename T, std::size_t NumDims>
 class multi_array_ref_openfpm :
@@ -102,7 +186,7 @@ public:
 
                           const detail::multi_array::
                           extent_gen<NumDims>& ranges,
-                          const general_storage_order<NumDims>& so) :
+                          const general_storage_order_ofp<NumDims>& so) :
     super_type(base,ranges,so) {
   }
 
@@ -136,9 +220,35 @@ public:
     return *this;
  }
 
+  multi_array_ref_openfpm & bind_ref(const multi_array_ref_openfpm & other) {
+    if (&other != this) {
+
+      this->base_ = other.base_;
+      this->storage_ = other.storage_;
+      this->extent_list_ = other.extent_list_;
+      this->stride_list_ = other.stride_list_;
+      this->index_base_list_ = other.index_base_list_;
+      this->origin_offset_ = other.origin_offset_;
+      this->directional_offset_ = other.directional_offset_;
+      this->num_elements_ = other.num_elements_;
+
+    }
+    return *this;
+  }
+
+  /* \brief Set the internal pointer
+   *
+   * \param base internal pointer
+   *
+   */
+  void set_pointer(void * base)
+  {
+    this->base_ = static_cast<T *>(base);
+  }
+
   multi_array_ref_openfpm & operator=(multi_array_ref_openfpm && other)
   {
-    this->base_ = other.base;
+    this->base_ = other.base_;
     this->storage_ = other.storage_;
     this->extent_list_ = other.extent_list_;
    this->stride_list_ = other.stride_list_;
diff --git a/src/util/ct_array.hpp b/src/util/ct_array.hpp
index 93e21ce1..a66b84b4 100644
--- a/src/util/ct_array.hpp
+++ b/src/util/ct_array.hpp
@@ -18,6 +18,7 @@
 #ifndef CT_ARRAY_HPP_
 #define CT_ARRAY_HPP_
 
+
 #include <boost/fusion/mpl.hpp>
 
 /////////////////////////////////////////////////// Make indexes ///////////////////////
@@ -73,14 +74,14 @@ struct generate_indexes {
 ///////////////////////////////////////////////////
 
 #ifndef COVERTY_SCAN
-#if __CUDACC_VER_MAJOR__ >= 9
 
 //! the array itself
 template<class T, unsigned long... args>
 struct ArrayHolder_constexpr {
 	//! compile-time array
-	static constexpr T data[sizeof...(args)] = { args... };
+	constexpr static T data[sizeof...(args)] = { args... };
 };
 
+
 //! recursive meta-function to generate compile-time array
 template<class T,size_t N, size_t orig_N, template<size_t,size_t> class F, unsigned... args>
 struct generate_array_constexpr_impl {
@@ -121,7 +122,6 @@ struct generate_array_constexpr {
 	typedef typename generate_array_constexpr_impl<T,N-1, N, F>::result result;
 };
 
-#endif
 #endif
 
 //////////////////////////////////////////////////
-- 
GitLab