Commit 5e99cd3b authored by incardon

Bringing up GPU grid computation

parent c98064f1
/*
* cuda_gpu_func.cpp
*
* Created on: Jun 3, 2018
* Author: i-bird
*/
#include "config.h"
#define BOOST_TEST_DYN_LINK
#include <boost/test/unit_test.hpp>
#include "Grid/map_grid.hpp"
#include "Point_test.hpp"
BOOST_AUTO_TEST_SUITE( grid_gpu_func_test )
BOOST_AUTO_TEST_CASE (gpu_computation_func)
{
#ifdef CUDA_GPU
size_t sz[3] = {64,64,64};
grid_gpu<3, Point_test<float> > c3(sz);
// interior of the 64^3 grid: [1,62] on each axis
grid_key_dx<3> k1({1,1,1});
grid_key_dx<3> k2({62,62,62});
c3.setMemory();
// default n_thr = 1024 threads per block, split across x,y,z
auto gcf = c3.getGPUIterator(k1,k2);
BOOST_REQUIRE_EQUAL(gcf.thr.x,16ul);
BOOST_REQUIRE_EQUAL(gcf.thr.y,8ul);
BOOST_REQUIRE_EQUAL(gcf.thr.z,8ul);
BOOST_REQUIRE_EQUAL(gcf.wthr.x,4ul);
BOOST_REQUIRE_EQUAL(gcf.wthr.y,8ul);
BOOST_REQUIRE_EQUAL(gcf.wthr.z,8ul);
grid_key_dx<3> k3({50,50,50});
grid_key_dx<3> k4({62,62,62});
grid_key_dx<3> k5({60,61,62});
auto gcf2 = c3.getGPUIterator(k3,k4);
BOOST_REQUIRE_EQUAL(gcf2.thr.x,16ul);
BOOST_REQUIRE_EQUAL(gcf2.thr.y,8ul);
BOOST_REQUIRE_EQUAL(gcf2.thr.z,8ul);
BOOST_REQUIRE_EQUAL(gcf2.wthr.x,1ul);
BOOST_REQUIRE_EQUAL(gcf2.wthr.y,2ul);
BOOST_REQUIRE_EQUAL(gcf2.wthr.z,2ul);
// n_thr = 511 is not a power of two: it is rounded down to 256 = 8*8*4 threads
gcf2 = c3.getGPUIterator(k3,k4,511);
BOOST_REQUIRE_EQUAL(gcf2.thr.x,8ul);
BOOST_REQUIRE_EQUAL(gcf2.thr.y,8ul);
BOOST_REQUIRE_EQUAL(gcf2.thr.z,4ul);
BOOST_REQUIRE_EQUAL(gcf2.wthr.x,2ul);
BOOST_REQUIRE_EQUAL(gcf2.wthr.y,2ul);
BOOST_REQUIRE_EQUAL(gcf2.wthr.z,4ul);
gcf2 = c3.getGPUIterator(k3,k4,1);
BOOST_REQUIRE_EQUAL(gcf2.thr.x,1ul);
BOOST_REQUIRE_EQUAL(gcf2.thr.y,1ul);
BOOST_REQUIRE_EQUAL(gcf2.thr.z,1ul);
BOOST_REQUIRE_EQUAL(gcf2.wthr.x,13ul);
BOOST_REQUIRE_EQUAL(gcf2.wthr.y,13ul);
BOOST_REQUIRE_EQUAL(gcf2.wthr.z,13ul);
gcf2 = c3.getGPUIterator(k3,k5,32);
BOOST_REQUIRE_EQUAL(gcf2.thr.x,4ul);
BOOST_REQUIRE_EQUAL(gcf2.thr.y,4ul);
BOOST_REQUIRE_EQUAL(gcf2.thr.z,2ul);
BOOST_REQUIRE_EQUAL(gcf2.wthr.x,3ul);
BOOST_REQUIRE_EQUAL(gcf2.wthr.y,3ul);
BOOST_REQUIRE_EQUAL(gcf2.wthr.z,7ul);
gcf2 = c3.getGPUIterator(k3,k5,1);
BOOST_REQUIRE_EQUAL(gcf2.thr.x,1ul);
BOOST_REQUIRE_EQUAL(gcf2.thr.y,1ul);
BOOST_REQUIRE_EQUAL(gcf2.thr.z,1ul);
BOOST_REQUIRE_EQUAL(gcf2.wthr.x,11ul);
BOOST_REQUIRE_EQUAL(gcf2.wthr.y,12ul);
BOOST_REQUIRE_EQUAL(gcf2.wthr.z,13ul);
#endif
}
BOOST_AUTO_TEST_SUITE_END()
#include "config.h"
#include <Grid/map_grid.hpp>
#include "Point_test.hpp"
// Kernel parameters are passed by value: a reference to host memory cannot be dereferenced on the device
__global__ void compute_stencil_grid(grid_gpu_ker<3,Point_test<float>> g1, grid_gpu_ker<3,Point_test<float>> g2, ite_gpu ite_gpu)
{
GRID_ID_3(ite_gpu);
/* g2.template get<0>(key) = g1.template get<0>(key.move(0,1)) + g1.template get<0>(key.move(0,-1)) +
g1.template get<0>(key.move(1,1)) + g1.template get<0>(key.move(1,-1)) +
g1.template get<0>(key.move(2,1)) + g1.template get<0>(key.move(2,-1)) -
6.0*g1.template get<0>(key);*/
}
__global__ void compute_stencil(float * prp_0, float * prp_1, int sz, grid_key_dx<3> start, grid_key_dx<3> stop)
{
GRID_ID_3_TRAW(start,stop);
// 7-point Laplacian: neighbours at +/-1 in x, +/-sz in y, +/-sz*sz in z (the z neighbours must not also shift x)
prp_1[tz*sz*sz + ty*sz + tx] = prp_0[tz*sz*sz + ty*sz + tx + 1] + prp_0[tz*sz*sz + ty*sz + tx - 1] +
prp_0[tz*sz*sz + (ty + 1)*sz + tx] + prp_0[tz*sz*sz + (ty - 1)*sz + tx] +
prp_0[(tz + 1)*sz*sz + ty*sz + tx] + prp_0[(tz - 1)*sz*sz + ty*sz + tx] -
6.0f*prp_0[tz*sz*sz + ty*sz + tx];
}
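// Note: the stencil reads all six neighbours unguarded, so [start,stop] must
// lie strictly inside the domain (e.g. [1,62] on a 64^3 grid)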
__global__ void fill_one(float * prp_0,int sz)
{
// Global thread index; the launch in gpu_grid_3D_one covers exactly sz^3 threads, so no bound check is needed
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int ty = threadIdx.y + blockIdx.y * blockDim.y;
int tz = threadIdx.z + blockIdx.z * blockDim.z;
prp_0[tz*sz*sz + ty*sz + tx] = 1.0f;
}
__global__ void fill_count(float * prp_0,int sz)
{
// Thread index
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int ty = threadIdx.y + blockIdx.y * blockDim.y;
int tz = threadIdx.z + blockIdx.z * blockDim.z;
prp_0[tz*sz*sz + ty*sz + tx] = tz*sz*sz + ty*sz + tx;
}
// Fill property 0 of the grid with ones (device side)
void gpu_grid_3D_one(grid_gpu<3,Point_test<float>> & g)
{
// Setup execution parameters
dim3 threads(8,8,8);
dim3 grid(8,8,8);
float * prp_0 = (float *)g.getDeviceBuffer<0>();
fill_one<<< grid, threads >>>(prp_0,64);
}
// Fill property 0 of the grid with the linearized global thread index
void gpu_grid_3D_compute(grid_gpu<3,Point_test<float>> & g)
{
// Setup execution parameters
dim3 threads(8,8,8);
dim3 grid(8,8,8);
float * prp_0 = (float *)g.getDeviceBuffer<0>();
fill_count<<< grid, threads >>>(prp_0,64);
}
void gpu_grid_3D_compute_stencil(grid_gpu<3,Point_test<float>> & g1, grid_gpu<3,Point_test<float>> & g2,
grid_key_dx<3> & start, grid_key_dx<3> & stop)
{
// Setup execution parameters
float * prp_0 = (float *)g1.getDeviceBuffer<0>();
float * prp_1 = (float *)g2.getDeviceBuffer<0>();
auto gpu_it = g2.getGPUIterator(start,stop);
// launch configuration is <<< blocks, threads-per-block >>>
compute_stencil<<< gpu_it.wthr, gpu_it.thr >>>(prp_0,prp_1,64,start,stop);
}
void gpu_grid_3D_compute_grid_stencil(grid_gpu<3,Point_test<float>> & g1, grid_gpu<3,Point_test<float>> & g2,
grid_key_dx<3> & start, grid_key_dx<3> & stop)
{
auto gpu_it = g2.getGPUIterator(start,stop);
auto g1k = g1.toGPU();
auto g2k = g2.toGPU();
compute_stencil_grid<<< gpu_it.wthr, gpu_it.thr >>>(g1k,g2k,gpu_it);
}
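// A minimal host-side driver sketch, illustration only: example_driver is a
// hypothetical name, and the calls follow the grid_gpu interface used above
// (setMemory, deviceToHost<0>, the standard OpenFPM grid iterator)
void example_driver()
{
size_t sz[3] = {64,64,64};
grid_gpu<3,Point_test<float>> g(sz);
g.setMemory();
// fill property 0 with ones on the device
gpu_grid_3D_one(g);
// copy property 0 back so the host can read it
g.deviceToHost<0>();
auto it = g.getIterator();
while (it.isNext())
{
auto key = it.get();
// every element of property 0 is expected to be 1.0f here
float v = g.get<0>(key);
(void)v;
++it;
}
}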
/*
* cuda_gpu_compute.cuh
*
* Created on: Sep 29, 2017
* Author: i-bird
*/
#ifndef OPENFPM_DATA_SRC_GRID_CUDA_GPU_COMPUTE_CUH_
#define OPENFPM_DATA_SRC_GRID_CUDA_GPU_COMPUTE_CUH_
void gpu_grid_3D_compute(grid_gpu<3,Point_test<float>> & g);
void gpu_grid_3D_compute_stencil(grid_gpu<3,Point_test<float>> & g1, grid_gpu<3,Point_test<float>> & g2,
grid_key_dx<3> & start, grid_key_dx<3> & stop);
void gpu_grid_3D_one(grid_gpu<3,Point_test<float>> & g);
void gpu_grid_3D_compute_grid_stencil(grid_gpu<3,Point_test<float>> & g1, grid_gpu<3,Point_test<float>> & g2,
grid_key_dx<3> & start, grid_key_dx<3> & stop);
#endif /* OPENFPM_DATA_SRC_GRID_CUDA_GPU_COMPUTE_CUH_ */
@@ -68,11 +68,6 @@ struct frswap
}
};
//! Case memory_traits_lin
template<unsigned int p, typename layout, typename data_type, typename g1_type, typename key_type, unsigned int sel = 2*is_layout_mlin<layout>::value + is_layout_inte<layout>::value >
......
@@ -10,7 +10,45 @@
#include "grid_base_impl_layout.hpp"
#ifdef CUDA_GPU
#ifndef __NVCC__
#undef __host__
#undef __device__
#include <vector_types.h>
#endif
// Compute the global 3D thread coordinate as int x[3], shifted by start; threads past stop return immediately
#define GRID_ID_3_RAW(start,stop) int x[3] = {threadIdx.x + blockIdx.x * blockDim.x + start.get(0),\
threadIdx.y + blockIdx.y * blockDim.y + start.get(1),\
threadIdx.z + blockIdx.z * blockDim.z + start.get(2)};\
\
if (x[0] > stop.get(0) || x[1] > stop.get(1) || x[2] > stop.get(2))\
{return;}
// Same as GRID_ID_3_RAW, but with separate tx,ty,tz coordinates
#define GRID_ID_3_TRAW(start,stop) int tx = threadIdx.x + blockIdx.x * blockDim.x + start.get(0);\
int ty = threadIdx.y + blockIdx.y * blockDim.y + start.get(1);\
int tz = threadIdx.z + blockIdx.z * blockDim.z + start.get(2);\
\
if (tx > stop.get(0) || ty > stop.get(1) || tz > stop.get(2))\
{return;}
// Build a grid_key_dx<3> named "key" from the thread coordinates and the iterator start; out-of-range threads return
#define GRID_ID_3(ite_gpu) grid_key_dx<3> key;\
key.set_d(0,threadIdx.x + blockIdx.x * blockDim.x + ite_gpu.start.get(0));\
key.set_d(1,threadIdx.y + blockIdx.y * blockDim.y + ite_gpu.start.get(1));\
key.set_d(2,threadIdx.z + blockIdx.z * blockDim.z + ite_gpu.start.get(2));\
\
if (key.get(0) > ite_gpu.stop.get(0) || key.get(1) > ite_gpu.stop.get(1) || key.get(2) > ite_gpu.stop.get(2))\
{return;}
//! Kernel launch configuration plus the sub-grid region it covers
struct ite_gpu
{
//! threads per block
dim3 thr;
//! number of blocks
dim3 wthr;
//! first and last grid point to process
grid_key_dx<3> start;
grid_key_dx<3> stop;
};
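// Illustration only: how GRID_ID_3 and ite_gpu fit together inside a kernel.
// set_to_one and grid_view are hypothetical names; the pattern follows
// compute_stencil_grid in this commit.
//
// template<typename grid_view>
// __global__ void set_to_one(grid_view g, ite_gpu ite_gpu)
// {
//     GRID_ID_3(ite_gpu);
//     g.template get<0>(key) = 1.0f;
// }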
#else
#define __host__
#define __device__
@@ -328,6 +366,45 @@ public:
return grid_new;
}
#ifdef CUDA_GPU
ite_gpu getGPUIterator(const grid_key_dx<dim> & key1, const grid_key_dx<dim> & key2, size_t n_thr = 1024)
{
size_t n = n_thr;
// Distribute the factors of two of n_thr across the block dimensions x,y,z in round-robin order
ite_gpu ig;
ig.thr.x = 1;
ig.thr.y = 1;
ig.thr.z = 1;
int dir = 0;
while (n != 1)
{
if (dir % 3 == 0)
{ig.thr.x = ig.thr.x << 1;}
else if (dir % 3 == 1)
{ig.thr.y = ig.thr.y << 1;}
else if (dir % 3 == 2)
{ig.thr.z = ig.thr.z << 1;}
n = n >> 1;
dir++;
}
// Blocks per direction: number of points in [key1,key2] divided by the block size, rounded up
ig.wthr.x = (key2.get(0) - key1.get(0) + 1) / ig.thr.x + (((key2.get(0) - key1.get(0) + 1)%ig.thr.x != 0)?1:0);
ig.wthr.y = (key2.get(1) - key1.get(1) + 1) / ig.thr.y + (((key2.get(1) - key1.get(1) + 1)%ig.thr.y != 0)?1:0);
ig.wthr.z = (key2.get(2) - key1.get(2) + 1) / ig.thr.z + (((key2.get(2) - key1.get(2) + 1)%ig.thr.z != 0)?1:0);
ig.start = key1;
ig.stop = key2;
return ig;
}
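// Worked example (matching the unit test): n_thr = 1024 = 2^10 gives thr = (16,8,8);
// for the range [1,62] on each axis (62 points) wthr = (ceil(62/16), ceil(62/8), ceil(62/8)) = (4,8,8)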
#endif
/*! \brief Return the internal grid information
*
* Return the internal grid information
......
/*
* grid_gpu.hpp
*
* Created on: Oct 31, 2015
* Author: i-bird
*/
#ifndef OPENFPM_DATA_SRC_GRID_GRID_GPU_HPP_
#define OPENFPM_DATA_SRC_GRID_GRID_GPU_HPP_
/*! \brief this class is a functor for the "for_each" algorithm
 *
 * This class is a functor for the "for_each" algorithm. For each
 * element of the boost::vector the operator() is called.
 * It is mainly used to copy one encap into another encap object.
 *
 * \tparam T_type type of the object to copy
 *
 */
template<typename T_type>
struct copy_memory_c
{
//! encapsulated source object
const typename memory_traits_inte<T_type>::type & src;
//! encapsulated destination object
typename memory_traits_inte_red<T_type>::type & dst;
/*! \brief constructor
*
* \param src source encapsulated object
 * \param dst destination encapsulated object
*
*/
inline copy_memory_c(const typename memory_traits_inte<T_type>::type & src,
typename memory_traits_inte_red<T_type>::type & dst)
:src(src),dst(dst)
{
};
//! It calls the copy function for each property
template<typename T>
inline void operator()(T& t) const
{
boost::fusion::at_c<T::value>(dst).mem_r = boost::fusion::at_c<T::value>(src).mem_r;
}
};
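/*! \brief Host-side counterpart of CUDA's dim3 (block/grid sizes in x,y,z)
 *
 * Presumably a stand-in that lets launch sizes be stored without including
 * the CUDA headers
 *
 */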
struct dim3_
{
//! size in x dimension
unsigned int x;
//! size in y dimension
unsigned int y;
//! size in z dimension
unsigned int z;
};
template<unsigned int dim>
struct device_grid
{
//! number of threads in each block
dim3_ threads;
//! grid size (number of blocks) for the kernel execution
dim3_ grids;
};
/*! \brief This is an N-dimensional grid or an N-dimensional array with memory_traits_inte layout
*
* it is basically an N-dimensional Cartesian grid
*
 * \tparam dim Dimensionality of the grid
 * \tparam T type of object the grid stores
 * \tparam S type of memory used (e.g. CudaMemory for grid_gpu)
*
* ### Definition and allocation of a 3D grid on GPU memory
* \snippet grid_unit_tests.hpp Definition and allocation of a 3D grid on GPU memory
* ### Access a grid c3 of size sz on each direction
* \snippet grid_unit_tests.hpp Access a grid c3 of size sz on each direction
* ### Access to an N-dimensional grid with an iterator
* \snippet grid_unit_tests.hpp Access to an N-dimensional grid with an iterator
*
*/
template<unsigned int dim, typename T, typename S>
class grid_cpu<dim,T,S,typename memory_traits_inte<T>::type> : public grid_base_impl<dim,T,S,typename memory_traits_inte<T>::type, memory_traits_inte>
{
//! grid layout
typedef typename memory_traits_inte<T>::type layout;
public:
//! Object container for T: the return type of get_o, an object through which
//! you can access all the properties of T
typedef typename grid_base_impl<dim,T,S,typename memory_traits_inte<T>::type, memory_traits_inte>::container container;
//! Default constructor
inline grid_cpu() THROW
:grid_base_impl<dim,T,S,layout,memory_traits_inte>()
{
}
/*! \brief create a grid from another grid
*
* \param g the grid to copy
*
*/
inline grid_cpu(const grid_cpu & g) THROW
:grid_base_impl<dim,T,S,layout,memory_traits_inte>(g)
{
}
/*! \brief create a grid of size sz on each direction
*
* \param sz grid size in each direction
*
*/
inline grid_cpu(const size_t & sz) THROW
:grid_base_impl<dim,T,S,layout,memory_traits_inte>(sz)
{
}
//! Constructor: allocate memory and give it a representation
inline grid_cpu(const size_t (& sz)[dim]) THROW
:grid_base_impl<dim,T,S,layout,memory_traits_inte>(sz)
{
}
/*! \brief It returns the property arrays.
 *
 * In case of CUDA memory it returns the device pointers to pass to the kernels
*
*
*/
template<unsigned int id> void * getDeviceBuffer()
{
return boost::fusion::at_c<id>(this->data_).mem->getDevicePointer();
}
/*! \brief Synchronize the host memory with the memory buffer in the device (copy device to host)
*
*
*/
template<unsigned int id> void deviceToHost()
{
boost::fusion::at_c<id>(this->data_).mem->deviceToHost();
}
/*! \brief Convert the grid into a data-structure suitable for computation on the GPU
 *
 * The returned object can be considered a reference to the original
*
*/
grid_cpu<dim,T,S,typename memory_traits_inte<T>::type> toGPU()
{
grid_cpu<dim,T,S,typename memory_traits_inte<T>::type> g;
copy_memory_c<T> cp_mc(this->data_,g.data_);
boost::mpl::for_each_ref< boost::mpl::range_c<int,0,T::max_prop> >(cp_mc);
return g;
}
/*! \brief When we switch to GPU mode the data-structure can be safely given to
* a kernel for computation
*
*
*/
void switchToGPU()
{
}
void switchToCPU()
{
}
};
//! shorthand for a grid allocated in GPU (CUDA) memory
template <unsigned int dim, typename T> using grid_gpu = grid_cpu<dim,T,CudaMemory,typename memory_traits_inte<T>::type>;
#endif /* OPENFPM_DATA_SRC_GRID_GRID_GPU_HPP_ */
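// Illustration only, a sketch of the intended call pattern (my_kernel is a
// hypothetical kernel; toGPU() produces a shallow copy whose property storage
// aliases the original, so it can be passed to a kernel by value):
//
// grid_gpu<3,Point_test<float>> g(sz);
// g.setMemory();
// auto g_ker = g.toGPU();
// auto ite = g.getGPUIterator(start,stop);
// my_kernel<<< ite.wthr, ite.thr >>>(g_ker,ite);
// g.deviceToHost<0>();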
@@ -31,7 +31,7 @@ public:
}
//! Constructor
__device__ __host__ inline grid_key_dx()
{}
/*! \brief Constructor from initializer list
@@ -56,7 +56,7 @@ public:
* \param key copy constructor
*
*/
__device__ __host__ inline grid_key_dx(const grid_key_dx<dim> & key)
:grid_key_dx(key.k)
{
}
@@ -66,7 +66,7 @@ public:
* \param k reference buffer
*
*/
__device__ __host__ inline grid_key_dx(const size_t (&k)[dim])
{
for (size_t i = 0 ; i < dim ; i++)
this->k[i] = k[i];
@@ -77,7 +77,7 @@ public:
* \param k reference buffer
*
*/
__device__ __host__ inline grid_key_dx(const long int (&k)[dim])
{
for (size_t i = 0 ; i < dim ; i++)
this->k[i] = k[i];
@@ -110,6 +110,15 @@ public:
invert_assign(t...);
}
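/*! \brief Return a key shifted by m along dimension i
 *
 * \param i dimension
 * \param m shift, can be negative
 *
 * \return the moved key
 *
 */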
__device__ __host__ inline grid_key_dx<dim> move(int i, int m) const
{
grid_key_dx<dim> tmp = *this;
tmp.k[i] += m;
return tmp;
}
/*! \brief Set to zero the key
*
*/
@@ -380,7 +389,7 @@ public:
* \return the index value
*
*/
__device__ __host__ mem_id get(size_t i) const
{
return k[i];
}
@@ -393,9 +402,9 @@ public:
* \param id value to set
*
*/
__device__ __host__ void set_d(size_t i, mem_id id)
{
#ifdef SE_CLASS1
if (i >= dim)
std::cerr << "grid_key_dx error: " << __FILE__ << " " << __LINE__ << " try to access dimension " << i << " on a grid_key_dx of size " << dim << "\n";
......
@@ -295,7 +295,8 @@ public:
* \return The linearization of the gk key shifted by c, or -1 if the check fail
*/
template<typename check=NoCheck>
inline mem_id LinId(const grid_key_dx<N> & gk, const char sum_id[N]) const
{
mem_id lid;
@@ -330,7 +331,8 @@ public:
* \return The linearization of the gk key shifted by c, or -1 if the check fail
*/
template<typename check=NoCheck>
inline mem_id LinId(const grid_key_dx<N> & gk, const char sum_id[N], const size_t (&bc)[N]) const
{
mem_id lid;
@@ -397,7 +399,7 @@ public:
*
*/
__device__ __host__ inline mem_id LinId(const size_t (& k)[N]) const