Commit 5dbbc3f5 authored by incardon

Fixing CUDA compilation + AMR distributed GPU working

parent 95bc0f71
openfpm_data @ b7d41928
Subproject commit 5056c72513114e2491a8fc1b4ddd4899ecd3e242
Subproject commit b7d4192813d4aaccd703df1472b9bc9da6c9a7b6
@@ -466,6 +466,20 @@ public:
return gd_array.get(lvl).getGridIterator();
}
#ifdef __NVCC__
/*! \brief Get a GPU grid iterator for the grid at level lvl
 *
 * \param lvl AMR level
 *
 * \return a GPU iterator to the grid at that level
 *
 */
auto getGridIteratorGPU(size_t lvl) -> decltype(gd_array.get(lvl).getGridIteratorGPU())
{
return gd_array.get(lvl).getGridIteratorGPU();
}
#endif
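As a usage sketch (following the AMR GPU test further below; lvl and the sparse-insert functor are the ones used there):

	auto git = amr_g.getGridIteratorGPU(lvl);               // GPU grid iterator of level lvl
	git.setGPUInsertBuffer(1);                              // one insert slot per block

	Point<3,float> center({0.5,0.5,0.5});
	git.launch(amr_launch_sparse(),git.getSpacing(0),center);

	amr_g.getDistGrid(lvl).template flush<smax_<0>>(FLUSH_ON_DEVICE);  // consolidate device-side inserts
	amr_g.getDistGrid(lvl).template deviceToHost<0>();                 // copy property 0 back to the host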
/*! \brief Get an iterator to the grid
*
* \return an iterator to the grid
@@ -912,7 +926,7 @@ using sgrid_dist_amr = grid_dist_amr<dim,St,T,AMR_IMPL_TRIVIAL,CartDecomposition
#ifdef __NVCC__
template<unsigned int dim, typename St, typename T, unsigned int blockEdgeSize = 8>
using sgrid_dist_amr_gpu = grid_dist_amr<dim,St,T,AMR_IMPL_TRIVIAL,CartDecomposition<dim,St,CudaMemory,memory_traits_inte>,CudaMemory,SparseGridGpu<dim,T,blockEdgeSize>>;
using sgrid_dist_amr_gpu = grid_dist_amr<dim,St,T,AMR_IMPL_TRIVIAL,CartDecomposition<dim,St,CudaMemory,memory_traits_inte>,CudaMemory,SparseGridGpu<dim,T,blockEdgeSize,IntPow<blockEdgeSize,dim>::value >>;
#endif
@@ -18,6 +18,58 @@
#include "Point_test.hpp"
#include "Grid/tests/grid_dist_id_util_tests.hpp"
struct amr_launch_sparse
{
	template<typename grid_type, typename ite_type>
	__device__ void operator()(grid_type & grid, ite_type itg, float spacing, Point<3,float> center)
	{
		// defines key (index local to the sub-grid) and keyg (global index)
		GRID_ID_3_GLOBAL(itg);

		__shared__ bool is_block_empty;

		if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
		{is_block_empty = true;}

		grid.init();

		// block index and in-block offset for this thread's insertion
		int offset = 0;
		grid_key_dx<3,int> blk;
		bool out = grid.getInsertBlockOffset(itg,key,blk,offset);

		auto blockId = grid.getBlockLinId(blk);

		// distance of this grid point from the sphere center
		const float x = keyg.get(0)*spacing - center.get(0);
		const float y = keyg.get(1)*spacing - center.get(1);
		const float z = keyg.get(2)*spacing - center.get(2);

		float radius = sqrt((float) (x*x + y*y + z*z));

		// activate only the points lying inside a spherical shell
		bool is_active = radius < 0.4 && radius > 0.3;

		if (is_active == true)
		{is_block_empty = false;}

		__syncthreads();

		if (is_block_empty == false)
		{
			auto ec = grid.insertBlock(blockId);

			if ( is_active == true)
			{
				ec.template get<0>()[offset] = x+y+z;
				ec.template get<grid_type::pMask>()[offset] = 1;
			}
		}

		__syncthreads();

		grid.flush_block_insert();
	}
};
BOOST_AUTO_TEST_SUITE( amr_grid_dist_id_test )
@@ -27,39 +79,73 @@ BOOST_AUTO_TEST_CASE( grid_dist_id_amr_gpu )
Box<3,float> domain3({0.0,0.0,0.0},{1.0,1.0,1.0});
Ghost<3,float> g(0.05);
Ghost<3,long int> g(1);
sgrid_dist_amr_gpu<3,float,aggregate<float>> amr_g(domain3,g);
size_t g_sz[3] = {4,4,4};
size_t n_lvl = 10;
size_t n_lvl = 6;
// amr_g.initLevels(n_lvl,g_sz);
amr_g.initLevels(n_lvl,g_sz);
/* for (size_t i = 0 ; i < amr_g.getNLvl() ; i++)
for (size_t i = 0 ; i < amr_g.getNLvl() ; i++)
{
// Fill the AMR with something
size_t count = 0;
auto it = amr_g.getGridIterator(i);
auto it = amr_g.getGridIteratorGPU(i);
it.setGPUInsertBuffer(1);
while (it.isNext())
{
auto key = it.get_dist();
auto akey = amr_g.getAMRKey(i,key);
Point<3,float> center({0.5,0.5,0.5});
it.launch(amr_launch_sparse(),it.getSpacing(0),center);
amr_g.getDistGrid(i).template flush<smax_<0>>(FLUSH_ON_DEVICE);
amr_g.getDistGrid(i).template deviceToHost<0>();
amr_g.template insert<0>(akey) = 3.0;
auto it2 = amr_g.getDistGrid(i).getDomainIterator();
while (it2.isNext())
{
auto key = it2.get();
auto keyg = it2.getGKey(key);
count++;
++it;
++it2;
}
auto & v_cl = create_vcluster();
v_cl.sum(count);
v_cl.execute();
switch(i)
{
case 0:
BOOST_REQUIRE_EQUAL(count,0);
break;
case 1:
BOOST_REQUIRE_EQUAL(count,30);
break;
case 2:
BOOST_REQUIRE_EQUAL(count,282);
break;
case 3:
BOOST_REQUIRE_EQUAL(count,2192);
break;
case 4:
BOOST_REQUIRE_EQUAL(count,16890);
break;
case 5:
BOOST_REQUIRE_EQUAL(count,136992);
break;
}
}
// Iterate across all the levels initialized
auto it = amr_g.getDomainIterator();
/* auto it = amr_g.getDomainIterator();
size_t count = 0;
@@ -93,24 +93,6 @@ class grid_dist_iterator
// get the next grid iterator
if (g_c < gList.size())
{
// Sub iterator are used
/* if (impl == FREE)
{
if (gdb_ext.get(g_c).Dbox.isValid() == false)
{g_c++;}
else
{
a_it.reinitialize(gList.get(g_c).getIterator(gdb_ext.get(g_c).Dbox.getKP1(),gdb_ext.get(g_c).Dbox.getKP2()));
if (a_it.isNext() == false) {g_c++;}
}
}
else
{
// Full iterator (no subset)
a_it.reinitialize(gList.get(g_c).getIterator());
if (a_it.isNext() == false) {g_c++;}
}*/
selvg<impl == FREE>::call(a_it,gdb_ext,gList,g_c);
}
} while (g_c < gList.size() && a_it.isNext() == false);
@@ -276,12 +258,12 @@ class grid_dist_iterator
// Get the sub-domain id
size_t sub_id = k.getSub();
grid_key_dx<dim> k_glob = k.getKey();
auto k_glob = k.getKey();
// shift
k_glob = k_glob + gdb_ext.get(sub_id).origin;
auto k_glob2 = k_glob + gdb_ext.get(sub_id).origin;
return k_glob;
return k_glob2;
}
/*! \brief Return the stencil point offset
@@ -8,9 +8,54 @@
#ifndef GRID_DIST_ID_KERNELS_CUH_
#define GRID_DIST_ID_KERNELS_CUH_
#ifdef CUDA_GPU
template<typename grid_type, typename func_t,typename ... args_t>
__global__ void grid_apply_functor(grid_type g, ite_gpu<grid_type::d> ite, func_t f, args_t ... args)
template<unsigned int dim>
struct ite_gpu_dist
{
	// CUDA thread-block size and number of blocks
	dim3 thr;
	dim3 wthr;

	// iteration range local to the sub-grid
	grid_key_dx<dim,int> start;
	grid_key_dx<dim,int> stop;

	grid_key_dx<dim,int> start_base;

	// origin of the sub-grid, used to reconstruct global coordinates
	grid_key_dx<dim,int> origin;

	ite_gpu_dist(ite_gpu<dim> & ite)
	{
		thr = ite.thr;
		wthr = ite.wthr;

		start = ite.start;
		stop = ite.stop;
	}

	size_t nblocks()
	{
		return wthr.x * wthr.y * wthr.z;
	}
};
#define GRID_ID_3_GLOBAL(ite_gpu) grid_key_dx<3,int> key;\
grid_key_dx<3,int> keyg;\
key.set_d(0,threadIdx.x + blockIdx.x * blockDim.x + ite_gpu.start.get(0));\
key.set_d(1,threadIdx.y + blockIdx.y * blockDim.y + ite_gpu.start.get(1));\
key.set_d(2,threadIdx.z + blockIdx.z * blockDim.z + ite_gpu.start.get(2));\
\
keyg.set_d(0,key.get(0) + ite_gpu.origin.get(0));\
keyg.set_d(1,key.get(1) + ite_gpu.origin.get(1));\
keyg.set_d(2,key.get(2) + ite_gpu.origin.get(2));\
\
if (key.get(0) > ite_gpu.stop.get(0) || key.get(1) > ite_gpu.stop.get(1) || key.get(2) > ite_gpu.stop.get(2))\
{return;}
#endif
template<typename grid_type, typename ite_gpu_type,typename func_t,typename ... args_t>
__global__ void grid_apply_functor(grid_type g, ite_gpu_type ite, func_t f, args_t ... args)
{
f(g,ite,args...);
}
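For illustration, a device functor launched through grid_apply_functor can use the macro above to obtain the local key and the global keyg. The functor below is a hypothetical minimal example (its name and the property index are illustrative), assuming a dense GPU grid whose kernel view exposes get<0>:

	struct fill_with_global_sum
	{
		template<typename grid_type, typename ite_type>
		__device__ void operator()(grid_type & g, ite_type ite)
		{
			// defines key (local index) and keyg (global index), returns if out of range
			GRID_ID_3_GLOBAL(ite);

			// store the sum of the global coordinates into property 0
			g.template get<0>(key) = keyg.get(0) + keyg.get(1) + keyg.get(2);
		}
	};

It would be launched, for example, as grid_apply_functor<<<ite.wthr,ite.thr>>>(g.toKernel(),itd,fill_with_global_sum());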
@@ -26,6 +26,7 @@
#ifdef __NVCC__
#include "SparseGridGpu/SparseGridGpu.hpp"
#include "cuda/grid_dist_id_kernels.cuh"
#include "Grid/cuda/grid_dist_id_iterator_gpu.cuh"
#endif
@@ -1134,8 +1135,8 @@ public:
*/
void setBackgroundValue(T & bv)
{
for (size_t i = 0 ; i < loc_grid.size() ; i++)
{meta_copy<T>::meta_copy_(bv,loc_grid.get(i).getBackgroundValue());}
setBackground_impl<T,decltype(loc_grid)> func(bv,loc_grid);
boost::mpl::for_each_ref< boost::mpl::range_c<int,0,T::max_prop>>(func);
}
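The new implementation applies the functor once per property index of T through boost::mpl::for_each_ref over a compile-time range. The standard-Boost analogue of that per-property loop, shown purely as an illustration (the functor here just prints the index and is not the actual setBackground_impl), is:

	#include <iostream>
	#include <boost/mpl/for_each.hpp>
	#include <boost/mpl/range_c.hpp>

	// illustrative functor: called once for every compile-time index in the range
	struct print_prop_index
	{
		template<typename prop_t>
		void operator()(prop_t)
		{
			std::cout << "property " << prop_t::value << std::endl;
		}
	};

	int main()
	{
		// mirrors boost::mpl::for_each_ref< boost::mpl::range_c<int,0,T::max_prop> >(func)
		boost::mpl::for_each< boost::mpl::range_c<int,0,3> >(print_prop_index());
	}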
/*! \brief set the background value
@@ -1715,7 +1716,7 @@ public:
/*! \brief Get a grid iterator
 *
 * In the case of a dense grid getGridIterator is equivalent to getDomainIterator.
 * In the case of a sparse grid getDomainIterator goes only across the inserted
 * points, while the grid iterator runs across all grid points independently of
 * whether a point has been inserted or not.
 *
@@ -1728,6 +1729,47 @@ public:
return it_dec;
}
#ifdef __NVCC__
/*! \brief Get a grid iterator on GPU
 *
 * In the case of a dense grid getGridIteratorGPU is equivalent to getDomainIteratorGPU.
 * In the case of a sparse distributed grid getDomainIterator goes only across the
 * inserted points, while getGridIteratorGPU runs across all grid points independently
 * of whether a point has been inserted or not.
 *
 * \param start start point
 * \param stop stop point
 *
 * \return a grid iterator
 *
 */
inline grid_dist_id_iterator_gpu<Decomposition,openfpm::vector<device_grid>>
getGridIteratorGPU(const grid_key_dx<dim> & start, const grid_key_dx<dim> & stop)
{
grid_dist_id_iterator_gpu<Decomposition,openfpm::vector<device_grid>> it_dec(loc_grid,getDecomposition(), g_sz, start, stop);
return it_dec;
}
/*! \brief Get a grid iterator on GPU over the whole domain
 *
 * In the case of a dense grid getGridIteratorGPU is equivalent to getDomainIteratorGPU.
 * In the case of a sparse distributed grid getDomainIterator goes only across the
 * inserted points, while getGridIteratorGPU runs across all grid points independently
 * of whether a point has been inserted or not.
 *
 * \return a grid iterator
 *
 */
inline grid_dist_id_iterator_gpu<Decomposition,openfpm::vector<device_grid>>
getGridIteratorGPU()
{
grid_dist_id_iterator_gpu<Decomposition,openfpm::vector<device_grid>> it_dec(loc_grid,getDecomposition(), g_sz);
return it_dec;
}
#endif
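A minimal usage sketch for these getters, assuming a sparse GPU grid gdist and a device functor fill_functor (both names are illustrative; the pattern follows the AMR GPU test in this commit):

	Box<3,size_t> box({1,1,1},{15,15,15});

	auto git = gdist.getGridIteratorGPU(box.getKP1(),box.getKP2());
	git.setGPUInsertBuffer(1);                  // insert slots per GPU block
	git.launch(fill_functor(),5.0f);            // extra arguments are forwarded to the functor

	gdist.template flush<smax_<0>>(flush_type::FLUSH_ON_DEVICE);
	gdist.template deviceToHost<0>();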
/*! \brief Get a grid iterator running also on the ghost area
*
* In case of dense grid getGridIterator is equivalent to getDomainIterator
@@ -2641,6 +2683,8 @@ public:
#ifdef __NVCC__
/*! \brief Set the size of the GPU insert buffer pool
 *
 * Indicates the maximum number of insertions each GPU block can perform
 *
 * \param size size of the insert pool
*
@@ -2664,7 +2708,12 @@ public:
loc_grid.get(i).setGPUInsertBuffer(ite.nblocks(),gpu_insert_pool_size);
loc_grid.get(i).initializeGPUInsertBuffer();
grid_apply_functor<<<ite.wthr,ite.thr>>>(loc_grid.get(i).toKernel(),ite,func_t(),args...);
ite_gpu_dist<dim> itd = ite;
for (int j = 0 ; j < dim ; j++)
{itd.origin.set_d(j,gdb_ext.get(i).origin.get(j));}
grid_apply_functor<<<ite.wthr,ite.thr>>>(loc_grid.get(i).toKernel(),itd,func_t(),args...);
it.nextGrid();
}
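Put together with the insert-pool setting above, a typical GPU insert cycle on a sparse distributed grid looks roughly like the test below (insert_kernel and the property index are the ones from the test; the rest is a sketch):

	gdist.setInsertBuffer(128);                                   // inserts allowed per GPU block

	auto it = gdist.getGridIterator(box.getKP1(),box.getKP2());
	gdist.template iterateGridGPU<insert_kernel<0>>(it,c);        // run the functor on the device

	gdist.template flush<smax_<0>>(flush_type::FLUSH_ON_DEVICE);  // consolidate the inserts
	gdist.template deviceToHost<0>();                             // copy property 0 back for checking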
@@ -8,8 +8,8 @@ BOOST_AUTO_TEST_SUITE( sgrid_gpu_test_suite )
template<unsigned int p>
struct insert_kernel
{
template<typename SparseGridGpu_type>
__device__ void operator()(SparseGridGpu_type & sg, ite_gpu<SparseGridGpu_type::d> & ite, float c)
template<typename SparseGridGpu_type, typename ite_type>
__device__ void operator()(SparseGridGpu_type & sg, ite_type & ite, float c)
{
sg.init();
@@ -30,22 +30,34 @@ struct insert_kernel
{return;}
if (SparseGridGpu_type::d >= 2 && y+ite.start.get(1) > ite.stop.get(1))
{return;}
if (SparseGridGpu_type::d >= 3 && z+ite.start.get(1) > ite.stop.get(2))
if (SparseGridGpu_type::d >= 3 && z+ite.start.get(2) > ite.stop.get(2))
{return;}
grid_key_dx<SparseGridGpu_type::d, size_t> coord({x+ite.start.get(0), y+ite.start.get(1), z+ite.start.get(2)});
// size_t pos = sg.getLinId(coord);
// printf("insertValues: bDim=(%d,%d), bId=(%d,%d), tId=(%d,%d) : "
// "pos=%ld, coord={%d,%d}, value=%d\n",
// bDimX, bDimY,
// bIdX, bIdY,
// tIdX, tIdY,
// pos,
// x, y,
// x); //debug
sg.template insert<p>(coord) = c;
	grid_key_dx<SparseGridGpu_type::d, size_t> coord;
	grid_key_dx<SparseGridGpu_type::d, size_t> coord_glob;

	// coord is the index local to the sub-grid, coord_glob the global index
	// obtained by adding the sub-grid origin carried by the iterator
	if (SparseGridGpu_type::d == 3)
	{
		coord.set_d(0,x+ite.start.get(0));
		coord_glob.set_d(0,x+ite.start.get(0)+ite.origin.get(0));
		coord.set_d(1,y+ite.start.get(1));
		coord_glob.set_d(1,y+ite.start.get(1)+ite.origin.get(1));
		coord.set_d(2,z+ite.start.get(2));
		coord_glob.set_d(2,z+ite.start.get(2)+ite.origin.get(2));
	}
	else if (SparseGridGpu_type::d == 2)
	{
		coord.set_d(0,x+ite.start.get(0));
		coord_glob.set_d(0,x+ite.start.get(0)+ite.origin.get(0));
		coord.set_d(1,y+ite.start.get(1));
		coord_glob.set_d(1,y+ite.start.get(1)+ite.origin.get(1));
	}

	// the inserted value encodes the global coordinates so the test can check them
	if (SparseGridGpu_type::d == 2)
	{sg.template insert<p>(coord) = c + coord_glob.get(0) + coord_glob.get(1);}
	else
	{sg.template insert<p>(coord) = c + coord_glob.get(0) + coord_glob.get(1) + coord_glob.get(2);}
__syncthreads();
@@ -80,35 +92,6 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_base )
gdist.template setBackgroundValue<0>(666);
/////// CPU insert
/* auto it = gdist.getGridIterator(box.getKP1(),box.getKP2());
while (it.isNext())
{
auto p = it.get_dist();
gdist.template insert<0>(p) = 1.0;
++it;
}
gdist.template flush<>();
Box<2,size_t> box2({0,0},{15,15});
auto it2 = gdist.getGridIterator(box2.getKP1(),box2.getKP2());
while (it2.isNext())
{
auto p = it2.get_dist();
std::cout << gdist.template get<0>(p) << std::endl;
++it2;
}*/
/////// host to device
/////// GPU insert + flush
Box<2,size_t> box({1,1},{1,1});
@@ -116,7 +99,7 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_base )
/////// GPU Run kernel
gdist.setInsertBuffer(128);
gdist.setInsertBuffer(1);
float c = 5.0;
@@ -137,10 +120,16 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_base )
if (p2.get(0) == box.getLow(0) && p2.get(1) == box.getLow(1))
{
BOOST_REQUIRE_EQUAL(gdist.template get<0>(p), 5.0);
BOOST_REQUIRE_EQUAL(gdist.template get<0>(p), 7.0);
}
else
{
if (gdist.template get<0>(p) != 666.0)
{
			float f = gdist.template get<0>(p);
			std::cout << "ERROR: " << f << std::endl;
}
BOOST_REQUIRE_EQUAL(gdist.template get<0>(p), 666.0);
}
@@ -148,6 +137,8 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_base )
}
}
return;
//
c = 3.0;
@@ -189,14 +180,50 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_base )
++it;
}
}
}
////////////////////////////////////
BOOST_AUTO_TEST_CASE( sgrid_gpu_test_output )
{
auto & v_cl = create_vcluster();
if (v_cl.size() > 3){return;}
size_t sz[2] = {17,17};
periodicity<2> bc = {PERIODIC,PERIODIC};
Ghost<2,long int> g(1);
Box<2,float> domain({0.0,0.0},{1.0,1.0});
sgrid_dist_id_gpu<2,float,aggregate<float>> gdist(sz,domain,g,bc);
gdist.template setBackgroundValue<0>(666);
/////// GPU insert + flush
Box<2,size_t> box({1,1},{15,15});
auto it = gdist.getGridIterator(box.getKP1(),box.getKP2());
/////// GPU Run kernel
gdist.setInsertBuffer(128);
gdist.template iterateGPU<stencil_kernel<0>>();
float c = 5.0;
}
gdist.template iterateGridGPU<insert_kernel<0>>(it,c);
gdist.template flush<smax_<0>>(flush_type::FLUSH_ON_DEVICE);
gdist.template deviceToHost<0>();
gdist.write("sgrid_gpu_output");
std::string file_test("sgrid_gpu_output_" + std::to_string(v_cl.size()) + "_" + std::to_string(v_cl.rank()) + ".vtk");
std::string file("sgrid_gpu_output_" + std::to_string(v_cl.rank()) + ".vtk");
bool test = compare(file,"test_data/" + file_test);
BOOST_REQUIRE_EQUAL(true,test);
}
BOOST_AUTO_TEST_SUITE_END()
# vtk DataFile Version 3.0
grids
ASCII
DATASET POLYDATA
POINTS 225 float
0.058824 0.058824 0.0
0.117647 0.058824 0.0
(remaining point coordinates of the reference VTK test file; listing truncated in this view)