Commits (6)
......@@ -328,6 +328,29 @@ private:
gridSize.setDimensions(res);
}
	// This is for testing stencil application on the host; remember to call deviceToHost() on the sparse grid before doing this!
template <typename stencil, typename... Args>
void applyStencilInPlaceHost(Args... args)
{
// Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
auto & dataBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
const unsigned int dataChunkSize = BlockTypeOf<AggregateBlockT, 0>::size;
unsigned int numScalars = indexBuffer.size() * dataChunkSize;
if (numScalars == 0) return;
SparseGridGpuKernels::applyStencilInPlaceHost
<dim,
BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
stencil>(
indexBuffer,
dataBuffer,
*this,
args...);
}
template <typename stencil, typename... Args>
void applyStencilInPlace(Args... args)
{
......@@ -347,6 +370,8 @@ private:
? numScalars / localThreadBlockSize
: 1 + numScalars / localThreadBlockSize;
constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value - IntPow<blockEdgeSize, dim>::value), (blockSize * chunksPerBlock)>::value; // todo: This works only for stencilSupportSize==1
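			// Illustrative arithmetic: in 2D with blockEdgeSize = 8 and chunksPerBlock = 1,
			// the ghost layer holds 10^2 - 8^2 = 36 cells against blockSize = 64 threads,
			// so nLoop = UIntDivCeil<36, 64>::value = 1.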
CUDA_LAUNCH_DIM3((SparseGridGpuKernels::applyStencilInPlace
<dim,
BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
......@@ -354,7 +379,7 @@ private:
threadGridSize, localThreadBlockSize,
indexBuffer.toKernel(),
dataBuffer.toKernel(),
-				this->template toKernelNN<stencil::stencil_type::nNN>(),
+				this->template toKernelNN<stencil::stencil_type::nNN, nLoop>(),
args...);
}
......@@ -378,6 +403,8 @@ private:
? numScalars / localThreadBlockSize
: 1 + numScalars / localThreadBlockSize;
constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value - IntPow<blockEdgeSize, dim>::value), (blockSize * chunksPerBlock)>::value; // todo: This works only for stencilSupportSize==1
setGPUInsertBuffer(threadGridSize, chunksPerBlock);
// setGPUInsertBuffer(threadGridSize, localThreadBlockSize);
......@@ -388,7 +415,7 @@ private:
threadGridSize, localThreadBlockSize,
indexBuffer.toKernel(),
dataBuffer.toKernel(),
-				this->template toKernelNN<stencil::stencil_type::nNN>(),
+				this->template toKernelNN<stencil::stencil_type::nNN, nLoop>(),
args...);
// cudaDeviceSynchronize();
......@@ -456,13 +483,13 @@ public:
return toKer;
}
-	template<unsigned int nNN>
+	template<unsigned int nNN, unsigned int nLoop>
SparseGridGpu_ker
<
dim,
blockEdgeSize,
typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
-			ct_par<nNN,1>,
+			ct_par<nNN,nLoop>,
indexT,
layout_base,
decltype(extendedBlockGeometry),
......@@ -474,7 +501,7 @@ public:
dim,
blockEdgeSize,
typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
-			ct_par<nNN,1>,
+			ct_par<nNN,nLoop>,
indexT,
layout_base,
decltype(extendedBlockGeometry),
......@@ -490,6 +517,18 @@ public:
return toKer;
}
//
constexpr static unsigned int getBlockEdgeSize()
{
return blockEdgeSize;
}
constexpr unsigned int getBlockSize() const
{
return blockSize;
}
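	// Note: blockSize is the chunk volume, i.e. IntPow<blockEdgeSize, dim>::value
	// (e.g. 8^2 = 64 elements per chunk in the 2D tests).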
// Geometry
template<typename CoordT>
inline size_t getLinId(CoordT &coord)
......@@ -549,13 +588,17 @@ public:
unsigned int threadGridSize = numScalars % dataChunkSize == 0
? numScalars / dataChunkSize
: 1 + numScalars / dataChunkSize;
constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value - IntPow<blockEdgeSize, dim>::value), (blockSize * 1)>::value; // todo: This works only for stencilSupportSize==1
// constexpr unsigned int nLoop = IntPow<blockEdgeSize + 2, dim>::value; // todo: This works only for stencilSupportSize==1
if (stencilSupportRadius == 1)
{
CUDA_LAUNCH_DIM3((SparseGridGpuKernels::tagBoundaries<
dim,
1,
BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>),
-					threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), dataBuffer.toKernel(), this->template toKernelNN<stencil_type::nNN>(), nn_blocks.toKernel());
+					threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), dataBuffer.toKernel(), this->template toKernelNN<stencil_type::nNN, nLoop>(), nn_blocks.toKernel());
}
else if (stencilSupportRadius == 2)
{
......@@ -563,7 +606,7 @@ public:
dim,
2,
BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>),
-					threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), dataBuffer.toKernel(), this->template toKernelNN<stencil_type::nNN>(), nn_blocks.toKernel());
+					threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), dataBuffer.toKernel(), this->template toKernelNN<stencil_type::nNN, nLoop>(), nn_blocks.toKernel());
}
else if (stencilSupportRadius == 0)
{
......@@ -571,7 +614,7 @@ public:
dim,
0,
BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>),
-					threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), dataBuffer.toKernel(), this->template toKernelNN<stencil_type::nNN>(), nn_blocks.toKernel());
+					threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), dataBuffer.toKernel(), this->template toKernelNN<stencil_type::nNN, nLoop>(), nn_blocks.toKernel());
}
else
{
......@@ -604,6 +647,26 @@ public:
threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), this->toKernel(),nn_blocks.toKernel());
}
template<typename stencil, typename... Args>
void applyStencilsHost(StencilMode mode = STENCIL_MODE_INPLACE, Args... args)
{
		// Apply the given stencil on all elements which are not boundary-tagged.
		// This is the host counterpart of applyStencils: it loops over all existing blocks,
		// checks on each element that it exists && !padding, and on those calls the
		// user-provided stencilHost functor. The stencil mode chooses how the block passed
		// to the functor as storeBlock is obtained: with Insert we would get it through an
		// insert (plus the necessary aux functions); with In-place we just take the block
		// from the data buffer. Only In-place is currently supported on the host.
switch (mode)
{
case STENCIL_MODE_INPLACE:
applyStencilInPlaceHost<stencil>(args...);
break;
case STENCIL_MODE_INSERT:
// applyStencilInsertHost<stencil>(args...);
std::cout << "No insert mode stencil application available on Host!" << std::endl;
break;
}
}
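	// Minimal host-side usage sketch (mirroring testStencilHeatHost_perf in the
	// performance tests; HeatStencil and dt = 0.1 are just the values used there):
	//
	//   sparseGrid.template deviceToHost<0>();   // host stencils read host-side data
	//   sparseGrid.template applyStencilsHost<HeatStencil<dim, 0, 1>>(
	//           STENCIL_MODE_INPLACE, 0.1);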
//todo: Move implems into a functor for compile time choice of stencil mode
template<typename stencil, typename... Args>
......@@ -636,19 +699,22 @@ public:
template<typename BitMaskT>
inline static bool isPadding(BitMaskT &bitMask)
{
-		return getBit(bitMask, PADDING_BIT);
+		return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>
+				::getBit(bitMask, PADDING_BIT);
}
template<typename BitMaskT>
inline static void setPadding(BitMaskT &bitMask)
{
-		setBit(bitMask, PADDING_BIT);
+		BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>
+				::setBit(bitMask, PADDING_BIT);
}
template<typename BitMaskT>
inline static void unsetPadding(BitMaskT &bitMask)
{
-		unsetBit(bitMask, PADDING_BIT);
+		BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>
+				::unsetBit(bitMask, PADDING_BIT);
}
template<unsigned int p>
......
......@@ -146,5 +146,95 @@ struct loadGhostBlock_impl<1,dim,AggregateBlockT,p,ct_params,blockEdgeSize>
}
};
template<unsigned int dim, typename AggregateBlockT, unsigned int p, typename ct_params, unsigned int blockEdgeSize>
struct loadGhostBlock_impl<3,dim,AggregateBlockT,p,ct_params,blockEdgeSize>
{
template<typename AggrWrapperT, typename SharedPtrT, typename vector_type, typename vector_type2, typename blockMapType>
__device__ static inline void load(const AggrWrapperT &block,
SharedPtrT * sharedRegionPtr,
const vector_type & ghostLayerToThreadsMapping,
const vector_type2 & nn_blocks,
const blockMapType & blockMap,
unsigned int stencilSupportRadius,
unsigned int ghostLayerSize,
const unsigned int blockIdPos)
{
typedef ScalarTypeOf<AggregateBlockT, p> ScalarT;
const int pos = threadIdx.x % ghostLayerSize;
__shared__ int neighboursPos[ct_params::nNN];
const unsigned int edge = blockEdgeSize + 2*stencilSupportRadius;
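		// With nLoop == 3 each thread covers three ghost-layer cells: pos, pos + blockDim.x
		// and pos + 2*blockDim.x. E.g. in 3D with blockEdgeSize = 4 and radius 1 the ghost
		// layer has 6^3 - 4^3 = 152 cells over 64 threads, hence ceil(152/64) = 3 loads.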
short int neighbourNum = ghostLayerToThreadsMapping.template get<nt>(pos);
short int neighbourNum2 = ghostLayerToThreadsMapping.template get<nt>(pos + blockDim.x);
short int neighbourNum3 = ghostLayerToThreadsMapping.template get<nt>(pos + 2*blockDim.x);
// Convert pos into a linear id accounting for the inner domain offsets
const unsigned int linId = ghostLayerToThreadsMapping.template get<gt>(pos);
const unsigned int linId2 = ghostLayerToThreadsMapping.template get<gt>(pos + blockDim.x);
const unsigned int linId3 = ghostLayerToThreadsMapping.template get<gt>(pos + 2*blockDim.x);
// Now get linear offset wrt the first element of the block
int ctr = linId;
int ctr2 = linId2;
int ctr3 = linId3;
unsigned int acc = 1;
unsigned int offset = 0;
unsigned int offset2 = 0;
unsigned int offset3 = 0;
for (int i = 0; i < dim; ++i)
{
int v = (ctr % edge) - stencilSupportRadius;
int v2 = (ctr2 % edge) - stencilSupportRadius;
int v3 = (ctr3 % edge) - stencilSupportRadius;
v = (v < 0)?(v + blockEdgeSize):v;
v2 = (v2 < 0)?(v2 + blockEdgeSize):v2;
v3 = (v3 < 0)?(v3 + blockEdgeSize):v3;
v = (v >= blockEdgeSize)?v-blockEdgeSize:v;
v2 = (v2 >= blockEdgeSize)?v2-blockEdgeSize:v2;
v3 = (v3 >= blockEdgeSize)?v3-blockEdgeSize:v3;
offset += v*acc;
offset2 += v2*acc;
offset3 += v3*acc;
ctr /= edge;
ctr2 /= edge;
ctr3 /= edge;
acc *= blockEdgeSize;
}
// Convert pos into a linear id accounting for the ghost offsets
unsigned int coord[dim];
linToCoordWithOffset<blockEdgeSize>(threadIdx.x, stencilSupportRadius, coord);
const int linId_b = coordToLin<blockEdgeSize>(coord, stencilSupportRadius);
unsigned int nnb = nn_blocks.template get<0>(blockIdPos*ct_params::nNN + (threadIdx.x % ct_params::nNN));
if (threadIdx.x < ct_params::nNN)
{
neighboursPos[threadIdx.x] = nnb;
}
__syncthreads();
		// Fetch the ghost data from the neighbouring blocks
auto nPos = neighboursPos[neighbourNum];
auto nPos2 = neighboursPos[neighbourNum2];
auto nPos3 = neighboursPos[neighbourNum3];
auto gdata = blockMap.template get_ele<p>(nPos)[offset];
auto gdata2 = blockMap.template get_ele<p>(nPos2)[offset2];
auto gdata3 = blockMap.template get_ele<p>(nPos3)[offset3];
// Actually load the data into the shared region
//ScalarT *basePtr = (ScalarT *)sharedRegionPtr;
auto bdata = block.template get<p>()[threadIdx.x];
sharedRegionPtr[linId] = gdata;
sharedRegionPtr[linId2] = gdata2;
sharedRegionPtr[linId3] = gdata3;
sharedRegionPtr[linId_b] = bdata;
}
};
#endif /* SPARSEGRIDGPU_KER_UTIL_HPP_ */
......@@ -48,12 +48,12 @@ namespace SparseGridGpuKernels
return;
}
-		const auto dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
+		const long long dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
auto dataBlock = dataBuffer.get(dataBlockPos); // Avoid binary searches as much as possible
if (dataBlockId < 0)
{
printf("Negative Datablock \n");
printf("Negative Datablock: dataBlockId=%ld, dataBlockPos=%u \n", dataBlockId, dataBlockPos);
return;
}
......@@ -132,6 +132,63 @@ namespace SparseGridGpuKernels
nn_blocks.template get<0>(dataBlockPos*nNN + offset) = neighbourPos;
}
template <unsigned int dim,
unsigned int pMask,
typename stencil,
typename IndexBufT,
typename DataBufT,
typename SparseGridT,
typename... Args>
__host__ void applyStencilInPlaceHost(
IndexBufT &indexBuffer,
DataBufT &dataBuffer,
SparseGridT & sparseGrid,
Args... args)
{
constexpr unsigned int pIndex = 0;
typedef typename IndexBufT::value_type IndexAggregateT;
typedef BlockTypeOf<IndexAggregateT , pIndex> IndexT;
typedef typename DataBufT::value_type AggregateT;
typedef BlockTypeOf<AggregateT, pMask> MaskBlockT;
typedef ScalarTypeOf<AggregateT, pMask> MaskT;
constexpr unsigned int blockSize = MaskBlockT::size;
const auto bufferSize = indexBuffer.size();
for (size_t blockId=0; blockId<bufferSize; ++blockId)
{
for (size_t elementId=0; elementId<blockSize; ++elementId)
{
const unsigned int dataBlockPos = blockId;
const unsigned int offset = elementId;
auto dataBlockLoad = dataBuffer.get(dataBlockPos); // Avoid binary searches as much as possible
// todo: Add management of RED-BLACK stencil application! :)
const unsigned int dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
// Read local mask to register
const auto curMask = dataBlockLoad.template get<pMask>()[offset];
grid_key_dx<dim, int> pointCoord = sparseGrid.getCoord(dataBlockId * blockSize + offset);
bool applyStencilHere = true;
				if ((!sparseGrid.exist(curMask)) || sparseGrid.isPadding(curMask) || offset >= blockSize)
{
// return; // We want to apply only on existing AND non-padding elements
applyStencilHere = false;
}
openfpm::sparse_index<unsigned int> sdataBlockPos;
sdataBlockPos.id = dataBlockPos;
stencil::stencilHost(
sparseGrid, dataBlockId, sdataBlockPos, offset, pointCoord, dataBlockLoad, dataBlockLoad,
applyStencilHere, args...);
}
}
}
template <unsigned int dim,
unsigned int pMask,
typename stencil,
......
......@@ -20,4 +20,10 @@ struct IntPow<base, 0>
constexpr static size_t value = 1;
};
template <unsigned int numerator, unsigned int denominator>
struct UIntDivCeil
{
	constexpr static unsigned int value = numerator / denominator + (numerator % denominator != 0);
};
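// Illustrative sanity checks; the values match the nLoop computations in SparseGridGpu:
static_assert(UIntDivCeil<36, 64>::value == 1, "2D, blockEdgeSize=8: ceil(36/64) == 1");
static_assert(UIntDivCeil<152, 64>::value == 3, "3D, blockEdgeSize=4: ceil(152/64) == 3");
static_assert(UIntDivCeil<64, 64>::value == 1, "exact division needs no extra step");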
#endif //OPENFPM_PDATA_MATHUTILS_HPP
......@@ -193,9 +193,6 @@ __global__ void copyBlocksToOutput(SparseGridType sparseGrid, VectorOutType outp
z++;
}
template<unsigned int dim, unsigned int p_src, unsigned int p_dst>
struct HeatStencil
{
......@@ -224,7 +221,7 @@ struct HeatStencil
SparseGridT & sparseGrid,
const unsigned int dataBlockId,
const openfpm::sparse_index<unsigned int> dataBlockIdPos,
-			unsigned int offset,
+			const unsigned int offset,
const grid_key_dx<dim, int> & pointCoord,
const DataBlockWrapperT & dataBlockLoad,
DataBlockWrapperT & dataBlockStore,
......@@ -239,7 +236,7 @@ struct HeatStencil
__shared__ ScalarT enlargedBlock[enlargedBlockSize];
-		sparseGrid.loadGhostBlock<p_src>(dataBlockLoad,dataBlockIdPos, enlargedBlock);
+		sparseGrid.loadGhostBlock<p_src>(dataBlockLoad, dataBlockIdPos, enlargedBlock);
__syncthreads();
......@@ -260,9 +257,72 @@ struct HeatStencil
}
enlargedBlock[linId] = cur + dt * laplacian;
}
}
__syncthreads();
sparseGrid.storeBlock<p_dst>(dataBlockStore, enlargedBlock);
/*! \brief Stencil Host function
*
* \param sparseGrid This is the sparse grid data-structure
* \param dataBlockId The id of the block
* \param offset index in local coordinate of the point where we are working
* \param dataBlockLoad dataBlock from where we read
* \param dataBlockStore dataBlock from where we write
* \param isActive the point is active if exist and is not padding
* \param dt delta t
*
*
*/
template<typename SparseGridT, typename DataBlockWrapperT>
static inline __host__ void stencilHost(
SparseGridT & sparseGrid,
const unsigned int dataBlockId,
const openfpm::sparse_index<unsigned int> dataBlockIdPos,
const unsigned int offset,
const grid_key_dx<dim, int> & pointCoord,
const DataBlockWrapperT & dataBlockLoad,
DataBlockWrapperT & dataBlockStore,
bool isActive,
float dt)
{
constexpr unsigned int blockEdgeSize = SparseGridT::getBlockEdgeSize();
if (isActive)
{
auto cur = dataBlockLoad.template get<p_src>()[offset];
auto laplacian = -2.0 * dim * cur; // The central part of the stencil
auto neighbourCoord = pointCoord;
auto counter = offset;
unsigned int dimStride = 1;
for (int d = 0; d < dim; ++d)
{
const auto localOffset = counter % blockEdgeSize;
if (localOffset == 0) // This means we are at the lower boundary for this dimension
{
neighbourCoord.set_d(d, neighbourCoord.get(d) - 1);
laplacian += sparseGrid.template get<p_src>(neighbourCoord);
neighbourCoord.set_d(d, neighbourCoord.get(d) + 1);
}
else
{
laplacian += dataBlockLoad.template get<p_src>()[offset - dimStride];
}
				if (localOffset == blockEdgeSize - 1) // This means we are at the upper boundary for this dimension
{
neighbourCoord.set_d(d, neighbourCoord.get(d) + 1);
laplacian += sparseGrid.template get<p_src>(neighbourCoord);
neighbourCoord.set_d(d, neighbourCoord.get(d) - 1);
}
else
{
laplacian += dataBlockLoad.template get<p_src>()[offset + dimStride];
}
//
counter /= blockEdgeSize;
dimStride *= blockEdgeSize;
}
dataBlockStore.template get<p_dst>()[offset] = cur + dt * laplacian;
}
}
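	// In effect stencilHost performs the explicit Euler update
	//   u_new(x) = u(x) + dt * sum_d ( u(x + e_d) + u(x - e_d) - 2 * u(x) ),
	// resolving across-block neighbours through sparseGrid.get<p_src>() and
	// in-block neighbours directly from dataBlockLoad.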
template <typename SparseGridT, typename CtxT>
......@@ -356,6 +416,98 @@ void testStencilHeat_perf(unsigned int i)
report_sparsegrid_funcs.graphs.put(base +".time.dev",deviation_tm);
}
template<typename SparseGridZ>
void testStencilHeatHost_perf(unsigned int i)
{
	// todo: Reimplement the host stencil application to pre-load both the block content and its
	// ghost layer into a contiguous chunk of memory; this way we can avoid binary searches...
auto testName = "In-place stencil HOST";
unsigned int gridEdgeSize = 1024;
typedef HeatStencil<SparseGridZ::dims,0,1> StencilT;
std::string base("performance.SparseGridGpu(" + std::to_string(i) + ").stencil");
report_sparsegrid_funcs.graphs.put(base + ".dim",2);
report_sparsegrid_funcs.graphs.put(base + ".blockSize",8);
report_sparsegrid_funcs.graphs.put(base + ".gridSize.x",gridEdgeSize*SparseGridZ::blockEdgeSize_);
report_sparsegrid_funcs.graphs.put(base + ".gridSize.y",gridEdgeSize*SparseGridZ::blockEdgeSize_);
// unsigned int iterations = 100;
unsigned int iterations = 10;
// unsigned int iterations = 2;
// unsigned int iterations = 1; // Debug
openfpm::vector<double> measures_gf;
openfpm::vector<double> measures_tm;
dim3 gridSize(gridEdgeSize, gridEdgeSize);
dim3 blockSize(SparseGridZ::blockEdgeSize_,SparseGridZ::blockEdgeSize_);
typename SparseGridZ::grid_info blockGeometry(gridSize);
SparseGridZ sparseGrid(blockGeometry);
mgpu::ofp_context_t ctx;
sparseGrid.template setBackgroundValue<0>(0);
unsigned long long numElements = gridEdgeSize*SparseGridZ::blockEdgeSize_*gridEdgeSize*SparseGridZ::blockEdgeSize_;
// Initialize the grid
sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
CUDA_LAUNCH_DIM3((insertConstantValue<0>),gridSize, blockSize,sparseGrid.toKernel(), 0);
sparseGrid.template flush < sRight_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
dim3 sourcePt(gridSize.x * SparseGridZ::blockEdgeSize_ / 2, gridSize.y * SparseGridZ::blockEdgeSize_ / 2, 0);
	CUDA_LAUNCH_DIM3((insertOneValue<0>), gridSize, blockSize, sparseGrid.toKernel(), sourcePt, 100);
sparseGrid.template flush < sRight_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
sparseGrid.findNeighbours(); // Pre-compute the neighbours pos for each block!
cudaDeviceSynchronize();
sparseGrid.template deviceToHost<0>();
for (unsigned int iter=0; iter<iterations; ++iter)
{
cudaDeviceSynchronize();
timer ts;
ts.start();
sparseGrid.template applyStencilsHost<StencilT>(STENCIL_MODE_INPLACE, 0.1);
cudaDeviceSynchronize();
ts.stop();
measures_tm.add(ts.getwct());
float gElemS = numElements / (1e9 * ts.getwct());
float gFlopsS = gElemS * StencilT::flops;
measures_gf.add(gFlopsS);
}
double mean_tm = 0;
double deviation_tm = 0;
standard_deviation(measures_tm,mean_tm,deviation_tm);
double mean_gf = 0;
double deviation_gf = 0;
standard_deviation(measures_gf,mean_gf,deviation_gf);
	// All times above are in seconds (timer::getwct() returns wall-clock time in seconds)
float gElemS = numElements / (1e9 * mean_tm);
float gFlopsS = gElemS * StencilT::flops;
std::cout << "Test: " << testName << std::endl;
std::cout << "Grid: " << gridEdgeSize*SparseGridZ::blockEdgeSize_ << "x" << gridEdgeSize*SparseGridZ::blockEdgeSize_ << std::endl;
std::cout << "Iterations: " << iterations << std::endl;
std::cout << "\tStencil: " << mean_gf << " dev:" << deviation_gf << " s" << std::endl;
std::cout << "Throughput: " << std::endl << "\t " << gElemS << " GElem/s " << std::endl << "\t " << gFlopsS << " GFlops/s" << std::endl;
report_sparsegrid_funcs.graphs.put(base + ".Gflops.mean",mean_gf);
report_sparsegrid_funcs.graphs.put(base +".Gflops.dev",deviation_gf);
report_sparsegrid_funcs.graphs.put(base + ".time.mean",mean_tm);
report_sparsegrid_funcs.graphs.put(base +".time.dev",deviation_tm);
}
BOOST_AUTO_TEST_SUITE(performance)
BOOST_AUTO_TEST_SUITE(SparseGridGpu_test)
......@@ -386,6 +538,19 @@ BOOST_AUTO_TEST_CASE(testStencilHeatZ)
testStencilHeat_perf<SparseGridGpu_z<dim, AggregateT, blockEdgeSize, chunkSize>>(1);
}
BOOST_AUTO_TEST_CASE(testStencilHeat_Host)
{
constexpr unsigned int dim = 2;
constexpr unsigned int blockEdgeSize = 8;
typedef aggregate<float,float> AggregateT;
constexpr unsigned int chunkSize = IntPow<blockEdgeSize,dim>::value;
report_sparsegrid_funcs.graphs.put("performance.SparseGridGpu(0).stencil.test.name","StencilN");
testStencilHeatHost_perf<SparseGridGpu<dim, AggregateT, blockEdgeSize, chunkSize>>(0);
}
void testInsertStencil(unsigned int gridEdgeSize, unsigned int i)
{
auto testName = "Insert stencil";
......
......@@ -1001,6 +1001,63 @@ __global__ void insertSphere(SparseGridType sparseGrid, grid_key_dx<2,int> start
sparseGrid.flush_block_insert();
}
template<unsigned int p, typename SparseGridType>
__global__ void insertSphere3D(SparseGridType sparseGrid, grid_key_dx<3,int> start, float r1, float r2)
{
constexpr unsigned int pMask = SparseGridType::pMask;
typedef BlockTypeOf<typename SparseGridType::AggregateType, p> BlockT;
typedef BlockTypeOf<typename SparseGridType::AggregateType, pMask> MaskBlockT;
grid_key_dx<3,int> blk({
blockIdx.x + start.get(0) / sparseGrid.getBlockEdgeSize(),
blockIdx.y + start.get(1) / sparseGrid.getBlockEdgeSize(),
blockIdx.z + start.get(2) / sparseGrid.getBlockEdgeSize()});
unsigned int offset = threadIdx.x;
__shared__ bool is_block_empty;
if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
{is_block_empty = true;}
sparseGrid.init();
auto blockId = sparseGrid.getBlockLinId(blk);
grid_key_dx<3,int> keyg;
keyg = sparseGrid.getGlobalCoord(blk,offset);
const mem_id x = keyg.get(0) - (start.get(0) + gridDim.x / 2 * SparseGridType::blockEdgeSize_);
const mem_id y = keyg.get(1) - (start.get(1) + gridDim.y / 2 * SparseGridType::blockEdgeSize_);
const mem_id z = keyg.get(2) - (start.get(2) + gridDim.z / 2 * SparseGridType::blockEdgeSize_);
float radius = sqrt((float) (x*x + y*y + z*z));
bool is_active = radius < r1 && radius > r2;
if (is_active == true)
{
is_block_empty = false;
}
__syncthreads();
if (is_block_empty == false)
{
auto ec = sparseGrid.insertBlockNew(blockId);
if ( is_active == true)
{
// ec.template get<p>()[offset] = 1;
ec.template get<p>()[offset] = x;
BlockMapGpu_ker<>::setExist(ec.template get<pMask>()[offset]);
}
}
__syncthreads();
sparseGrid.flush_block_insert();
}
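// The predicate r2 < radius < r1 selects a hollow spherical shell of thickness
// r1 - r2, centred on the middle of the launched grid (the 3D output test below
// uses r1 = 64, r2 = 56, i.e. a shell 8 voxels thick).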
BOOST_AUTO_TEST_CASE(testSparseGridGpuOutput)
{
constexpr unsigned int dim = 2;
......@@ -1030,6 +1087,38 @@ BOOST_AUTO_TEST_CASE(testSparseGridGpuOutput)
sparseGrid.write("SparseGridGPU_output.vtk");
}
BOOST_AUTO_TEST_CASE(testSparseGridGpuOutput3D)
{
constexpr unsigned int dim = 3;
constexpr unsigned int blockEdgeSize = 4;
typedef aggregate<float> AggregateT;
size_t sz[3] = {512,512,512};
// dim3 gridSize(128,128,128);
dim3 gridSize(32,32,32);
grid_smb<dim, blockEdgeSize> blockGeometry(sz);
SparseGridGpu<dim, AggregateT, blockEdgeSize, 64, long int> sparseGrid(blockGeometry);
mgpu::ofp_context_t ctx;
sparseGrid.template setBackgroundValue<0>(0);
grid_key_dx<3,int> start({256,256,256});
// Insert values on the grid
sparseGrid.setGPUInsertBuffer(gridSize,dim3(1));
CUDA_LAUNCH_DIM3((insertSphere3D<0>),
gridSize, dim3(blockEdgeSize*blockEdgeSize*blockEdgeSize,1,1),
sparseGrid.toKernel(), start, 64, 56);
sparseGrid.flush < smax_< 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
sparseGrid.findNeighbours(); // Pre-compute the neighbours pos for each block!
sparseGrid.tagBoundaries();
sparseGrid.template deviceToHost<0>();
sparseGrid.write("SparseGridGPU_output3D.vtk");
}
#endif
BOOST_AUTO_TEST_SUITE_END()