Commit 0f8d618b authored by incardon

Refactoring SparseGrid

parent 8420a227
......@@ -54,7 +54,7 @@ if (CUDA_FOUND)
# target_compile_options(mem_map PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} >)
# The below is for better performance on CUDA
# "-rdc=true" is for Dynamic Parallelism (cooperative groups)
target_compile_options(mem_map PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} -use_fast_math -arch=sm_61 -g -lineinfo >)
target_compile_options(mem_map PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} -use_fast_math -gencode arch=compute_50,code=sm_50 -g -lineinfo >)
if (TEST_COVERAGE)
target_compile_options(mem_map PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: -Xcompiler "-fprofile-arcs -ftest-coverage" >)
endif ()
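The switch from -arch=sm_61 to -gencode arch=compute_50,code=sm_50 pins the build to a single compute capability. A minimal sketch, assuming the same mem_map target, of how several -gencode pairs could be stacked instead to build a fat binary that also carries PTX for forward compatibility (the architecture list here is illustrative, not part of the commit):

# Hypothetical multi-architecture variant: one -gencode pair per real target,
# plus a compute_61/compute_61 pair so newer GPUs can JIT the embedded PTX.
target_compile_options(mem_map PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:
        ${WARNING_SUPPRESSION_AND_OPTION_NVCC} -use_fast_math
        -gencode arch=compute_50,code=sm_50
        -gencode arch=compute_61,code=sm_61
        -gencode arch=compute_61,code=compute_61
        -g -lineinfo >)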
......
......@@ -664,6 +664,32 @@ public:
return ret;
}
/*! \brief Get the point p1 as grid_key_dx
*
* \return the key
*
*/
grid_key_dx<dim,int> getKP1int() const
{
// grid key to return
grid_key_dx<dim,int> ret(boost::fusion::at_c<p1>(data));
return ret;
}
/*! \brief Get the point p2 as grid_key_dx
*
* \return the key
*
*/
grid_key_dx<dim,int> getKP2int() const
{
// grid key to return
grid_key_dx<dim,int> ret(boost::fusion::at_c<p2>(data));
return ret;
}
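A short usage sketch, assuming a box-like object box that exposes the two accessors above; the integer keys give the inclusive corners of the box on the grid:

// Hypothetical illustration: fetch the inclusive corner keys of `box`.
grid_key_dx<dim, int> start = box.getKP1int();   // lower corner p1
grid_key_dx<dim, int> stop  = box.getKP2int();   // upper corner p2
for (int d = 0; d < dim; ++d)
{
    // per-dimension extent, e.g. to size an iteration over the box
    int extent_d = stop.get(d) - start.get(d) + 1;
}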
/*! \brief Get the point p1
*
* \return the point p1
......
......@@ -76,6 +76,13 @@ struct NNStar
static const int nNN = 8;
};
template<unsigned int nNN_, unsigned int nLoop_>
struct ct_par
{
static const unsigned int nNN = nNN_;
static const unsigned int nLoop = nLoop_;
};
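ct_par bundles into a single type the two compile-time constants that were previously passed as a bare nNN value, so kernels can additionally select a loop variant. A sketch of a consumer, with illustrative names:

// Hypothetical consumer of the compile-time parameter pack.
template<typename ct_params>
struct uses_ct_par
{
    // nNN: number of neighbour blocks; nLoop: compile-time loop-variant selector.
    static const unsigned int n_neigh = ct_params::nNN;
    static const unsigned int n_loop  = ct_params::nLoop;
};
// e.g. uses_ct_par<ct_par<8, 1>> reads nNN = 8 and selects the nLoop = 1 code path.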
template<unsigned int dim,
typename AggregateT,
unsigned int blockEdgeSize = default_edge<dim>::type::value,
......@@ -376,7 +383,7 @@ public:
dim,
blockEdgeSize,
typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
0,
ct_par<0,1>,
indexT,
layout_base,
decltype(extendedBlockGeometry),
......@@ -388,7 +395,7 @@ public:
dim,
blockEdgeSize,
typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
0,
ct_par<0,1>,
indexT,
layout_base,
decltype(extendedBlockGeometry),
......@@ -410,7 +417,7 @@ public:
dim,
blockEdgeSize,
typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
nNN,
ct_par<nNN,1>,
indexT,
layout_base,
decltype(extendedBlockGeometry),
......@@ -422,7 +429,7 @@ public:
dim,
blockEdgeSize,
typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
nNN,
ct_par<nNN,1>,
indexT,
layout_base,
decltype(extendedBlockGeometry),
......
......@@ -7,15 +7,13 @@
#include <SparseGridGpu/Geometry/grid_smb.hpp>
#include "BlockMapGpu.hpp"
constexpr int gt = 0;
constexpr int nt = 1;
#include "SparseGridGpu_ker_util.hpp"
//todo Remove template param GridSmT and just use BlockGeometry
template<unsigned int dim,
unsigned int blockEdgeSize,
typename AggregateBlockT,
unsigned int nNN,
typename ct_params,
typename indexT,
template<typename> class layout_base,
typename GridSmT,
......@@ -263,33 +261,6 @@ public:
__loadBlock<props ...>(block, sharedRegionPtr);
}
/**
* Load the ghost layer of a data block into the boundary part of a shared memory region.
* The given shared memory region should be shaped as a dim-dimensional array and sized so that it
* can contain the block plus the ghost layer around it.
*
* @tparam p The property to retrieve from global memory.
* @tparam CoordT The coordinate type.
* @tparam Shared The type of the shared memory region.
* @param coord The coordinate of the block.
* @param sharedRegionPtr The pointer to the shared memory region.
*/
/* template<unsigned int p, typename CoordT>
inline __device__ void
loadGhost(const grid_key_dx<dim, CoordT> & coord, ScalarTypeOf<AggregateBlockT, p> *sharedRegion)
{
//todo: Make this work well with multiple chunks per block, or check that we never get several chunks (dragons ahoy!)
auto blockLinId = getBlockId(coord);
__loadGhost<p>(blockLinId, sharedRegion);
}
template<unsigned int p>
inline __device__ void
loadGhost(const unsigned int blockLinId, ScalarTypeOf<AggregateBlockT, p> *sharedRegion)
{
//todo: Make this work well with multiple chunks per block, or check that we never get several chunks (dragons ahoy!)
__loadGhost<p>(blockLinId, sharedRegion);
}*/
/**
* Load the ghost layer of a data block into the boundary part of a shared memory region.
......@@ -370,9 +341,6 @@ public:
* @param coord The coordinate of the block.
* @param sharedRegionPtr The array of pointers to the shared memory regions, one for each property.
*/
// template<unsigned int ... props, typename CoordT>
// inline __device__ void storeBlock(const grid_key_dx<dim, CoordT> coord, void *sharedRegionPtr[sizeof...(props)]);
template<unsigned int ... props, typename AggrWrapperT>
inline __device__ void storeBlock(AggrWrapperT &block, void *sharedRegionPtr[sizeof...(props)])
{
......@@ -468,55 +436,8 @@ public:
}
private:
template<unsigned int edgeSize>
inline __device__ void linToCoordWithOffset(const unsigned int linId, const unsigned int offset, unsigned int (&coord)[dim]) const
{
unsigned int linIdTmp = linId;
for (unsigned int d = 0; d < dim; ++d)
{
coord[d] = linIdTmp % edgeSize;
coord[d] += offset;
linIdTmp /= edgeSize;
}
}
template<unsigned int edgeSize>
inline __device__ unsigned int coordToLin(const unsigned int (&coord)[dim], const unsigned int paddingSize = 0) const
{
unsigned int linId = coord[dim - 1];
for (int d = dim - 2; d >= 0; --d)
{
linId *= edgeSize + 2 * paddingSize;
linId += coord[d];
}
return linId;
}
template<unsigned int edgeSize, typename CoordT>
inline __device__ unsigned int coordToLin(const grid_key_dx<dim, CoordT> &coord, const unsigned int paddingSize = 0) const
{
unsigned int linId = coord.get(dim - 1);
for (int d = dim - 2; d >= 0; --d)
{
linId *= edgeSize + 2 * paddingSize;
linId += coord.get(d);
}
return linId;
}
template <typename CoordT>
inline __device__ unsigned int coordToLin(const grid_key_dx<dim, CoordT> & coord, grid_key_dx<dim, int> & blockDimensions) const
{
unsigned int linId = coord.get(dim - 1);
for (int d = dim - 2; d >= 0; --d)
{
linId *= blockDimensions.get(d);
linId += coord.get(d);
}
return linId;
}
template<unsigned int p, typename AggrWrapperT, typename SharedPtrT>
inline __device__ void
__loadBlock(const AggrWrapperT &block, SharedPtrT sharedRegionPtr)
......@@ -594,11 +515,20 @@ private:
inline __device__ void
__loadGhostBlock(const AggrWrapperT &block, const unsigned int blockId, SharedPtrT * sharedRegionPtr)
{
typedef ScalarTypeOf<AggregateBlockT, p> ScalarT;
loadGhostBlock_impl<ct_params::nLoop,dim,AggrWrapperT,p,ct_params,blockEdgeSize>::load(block,
sharedRegionPtr,
ghostLayerToThreadsMapping,
nn_blocks,
this->blockMap,
stencilSupportRadius,
ghostLayerSize,
blockId);
/* typedef ScalarTypeOf<AggregateBlockT, p> ScalarT;
const int pos = threadIdx.x % ghostLayerSize;
__shared__ int neighboursPos[nNN];
__shared__ int neighboursPos[ct_params::nNN];
const unsigned int edge = blockEdgeSize + 2*stencilSupportRadius;
short int neighbourNum = ghostLayerToThreadsMapping.template get<nt>(pos);
......@@ -624,9 +554,9 @@ private:
unsigned int coord[dim];
linToCoordWithOffset<blockEdgeSize>(threadIdx.x, stencilSupportRadius, coord);
const int linId2 = coordToLin<blockEdgeSize>(coord, stencilSupportRadius);
unsigned int nnb = nn_blocks.template get<0>(blockId*nNN + (threadIdx.x % nNN));
unsigned int nnb = nn_blocks.template get<0>(blockId*ct_params::nNN + (threadIdx.x % ct_params::nNN));
if (threadIdx.x < nNN)
if (threadIdx.x < ct_params::nNN)
{
neighboursPos[threadIdx.x] = nnb;
}
......@@ -643,7 +573,7 @@ private:
auto bdata = block.template get<p>()[threadIdx.x];
sharedRegionPtr[linId] = gdata;
sharedRegionPtr[linId2] = bdata;
sharedRegionPtr[linId2] = bdata;*/
}
template<unsigned int p, unsigned int ... props>
......
/*
* SparseGridGpu_ker_util.hpp
*
* Created on: Aug 7, 2019
* Author: i-bird
*/
#ifndef SPARSEGRIDGPU_KER_UTIL_HPP_
#define SPARSEGRIDGPU_KER_UTIL_HPP_
template<unsigned int edgeSize, unsigned int dim>
inline __device__ unsigned int coordToLin(const unsigned int (&coord)[dim], const unsigned int paddingSize = 0)
{
unsigned int linId = coord[dim - 1];
for (int d = dim - 2; d >= 0; --d)
{
linId *= edgeSize + 2 * paddingSize;
linId += coord[d];
}
return linId;
}
template<unsigned int edgeSize, typename CoordT, unsigned int dim>
inline __device__ unsigned int coordToLin(const grid_key_dx<dim, CoordT> &coord, const unsigned int paddingSize = 0)
{
unsigned int linId = coord.get(dim - 1);
for (int d = dim - 2; d >= 0; --d)
{
linId *= edgeSize + 2 * paddingSize;
linId += coord.get(d);
}
return linId;
}
template <typename CoordT,unsigned int dim>
inline __device__ unsigned int coordToLin(const grid_key_dx<dim, CoordT> & coord, grid_key_dx<dim, int> & blockDimensions)
{
unsigned int linId = coord.get(dim - 1);
for (int d = dim - 2; d >= 0; --d)
{
linId *= blockDimensions.get(d);
linId += coord.get(d);
}
return linId;
}
template<unsigned int edgeSize, unsigned int dim>
inline __device__ void linToCoordWithOffset(const unsigned int linId, const unsigned int offset, unsigned int (&coord)[dim])
{
unsigned int linIdTmp = linId;
for (unsigned int d = 0; d < dim; ++d)
{
coord[d] = linIdTmp % edgeSize;
coord[d] += offset;
linIdTmp /= edgeSize;
}
}
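A worked round trip through the two helpers above, with illustrative values dim = 2, blockEdgeSize = 8, and padding 1: linToCoordWithOffset decomposes a thread-linear id inside the 8x8 block and shifts it by the padding offset, and coordToLin re-linearizes against the padded edge 8 + 2*1 = 10.

// Inside a __device__ context, assuming the illustrative sizes above:
unsigned int coord2[2];
linToCoordWithOffset<8>(11u, 1u, coord2);       // 11 -> {11 % 8 + 1, 11 / 8 + 1} = {4, 2}
unsigned int lin = coordToLin<8>(coord2, 1u);   // 2 * 10 + 4 = 24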
constexpr int gt = 0;
constexpr int nt = 1;
template<unsigned int nLoop, unsigned int dim, typename AggregateBlockT, unsigned int p, typename ct_params, unsigned int blockEdgeSize>
struct loadGhostBlock_impl
{
template<typename AggrWrapperT, typename SharedPtrT, typename vector_type, typename vector_type2, typename blockMapType>
__device__ static inline void load(const AggrWrapperT &block,
SharedPtrT * sharedRegionPtr,
const vector_type & ghostLayerToThreadsMapping,
const vector_type2 & nn_blocks,
const blockMapType & blockMap,
unsigned int stencilSupportRadius,
unsigned int ghostLayerSize,
unsigned int blockId)
{
printf("Error to implement loadGhostBlock_impl with nLoop=%d \n",nLoop);
}
};
template<unsigned int dim, typename AggregateBlockT, unsigned int p, typename ct_params, unsigned int blockEdgeSize>
struct loadGhostBlock_impl<1,dim,AggregateBlockT,p,ct_params,blockEdgeSize>
{
template<typename AggrWrapperT, typename SharedPtrT, typename vector_type, typename vector_type2, typename blockMapType>
__device__ static inline void load(const AggrWrapperT &block,
SharedPtrT * sharedRegionPtr,
const vector_type & ghostLayerToThreadsMapping,
const vector_type2 & nn_blocks,
const blockMapType & blockMap,
unsigned int stencilSupportRadius,
unsigned int ghostLayerSize,
unsigned int blockId)
{
typedef ScalarTypeOf<AggregateBlockT, p> ScalarT;
const int pos = threadIdx.x % ghostLayerSize;
__shared__ int neighboursPos[ct_params::nNN];
const unsigned int edge = blockEdgeSize + 2*stencilSupportRadius;
short int neighbourNum = ghostLayerToThreadsMapping.template get<nt>(pos);
// Convert pos into a linear id accounting for the inner domain offsets
const unsigned int linId = ghostLayerToThreadsMapping.template get<gt>(pos);
// Now get linear offset wrt the first element of the block
int ctr = linId;
unsigned int acc = 1;
unsigned int offset = 0;
for (int i = 0; i < dim; ++i)
{
int v = (ctr % edge) - stencilSupportRadius;
v = (v < 0)?(v + blockEdgeSize):v;
v = (v >= blockEdgeSize)?v-blockEdgeSize:v;
offset += v*acc;
ctr /= edge;
acc *= blockEdgeSize;
}
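// Worked example of the wrap-around above (blockEdgeSize = 8, r = 1, edge = 10):
// a ghost cell at the low boundary gives v = -1, which wraps to v = 7, the last
// interior cell of the neighbour block on that side; linId = 0 (the ghost corner)
// therefore maps to offset 7 + 7*8 = 63, the opposite corner of the neighbour's
// 8x8 interior.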
// Convert pos into a linear id accounting for the ghost offsets
unsigned int coord[dim];
linToCoordWithOffset<blockEdgeSize>(threadIdx.x, stencilSupportRadius, coord);
const int linId2 = coordToLin<blockEdgeSize>(coord, stencilSupportRadius);
unsigned int nnb = nn_blocks.template get<0>(blockId*ct_params::nNN + (threadIdx.x % ct_params::nNN));
if (threadIdx.x < ct_params::nNN)
{
neighboursPos[threadIdx.x] = nnb;
}
__syncthreads();
// Actually load the data into the shared region
auto nPos = neighboursPos[neighbourNum];
auto gdata = blockMap.template get_ele<p>(nPos)[offset];
// Actually load the data into the shared region
//ScalarT *basePtr = (ScalarT *)sharedRegionPtr;
auto bdata = block.template get<p>()[threadIdx.x];
sharedRegionPtr[linId] = gdata;
sharedRegionPtr[linId2] = bdata;
}
};
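The dispatch is resolved at compile time: ct_params::nLoop picks a partial specialization (here the nLoop = 1 variant above), while the unspecialized primary template only reports an unsupported configuration at run time. A sketch of the call, mirroring the delegation in SparseGridGpu_ker.hpp:

// Illustrative call site: the first template argument selects the specialization.
loadGhostBlock_impl<ct_params::nLoop, dim, AggrWrapperT, p, ct_params, blockEdgeSize>
    ::load(block, sharedRegionPtr, ghostLayerToThreadsMapping, nn_blocks,
           blockMap, stencilSupportRadius, ghostLayerSize, blockId);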
#endif /* SPARSEGRIDGPU_KER_UTIL_HPP_ */
......@@ -516,18 +516,6 @@ BOOST_AUTO_TEST_CASE(testStencilHeatZ)
mgpu::ofp_context_t ctx;
sparseGrid.template setBackgroundValue<0>(0);
// Initialize the grid
// sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
// insertConstantValue<0> << < gridSize, blockSize >> > (sparseGrid.toKernel(), 0);
// sparseGrid.flush < sRight_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
//
// sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
// dim3 sourcePt(gridSize.x * blockEdgeSize / 2, gridSize.y * blockEdgeSize / 2, 0);
// insertOneValue<0> << < gridSize, blockSize >> > (sparseGrid.toKernel(), sourcePt, 100);
// sparseGrid.flush < sRight_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
//
// sparseGrid.findNeighbours(); // Pre-compute the neighbours pos for each block!
if (prePopulateGrid)
{
// Pre-populate grid
......@@ -597,26 +585,12 @@ BOOST_AUTO_TEST_CASE(testStencilHeatZ)
unsigned int iterations = 10;
unsigned int repetitions = 5;
bool prePopulateGrid = true;
// bool prePopulateGrid = true;
float timeInitAvg;
float timeStencilAvg;
float timeTotalAvg;
float timeInsertAvg = 0.0;
for (int rep=0; rep<repetitions; ++rep)
{
cudaEvent_t start, afterInit, stop;
float timeInit;
float timeStencil;
float timeTotal;
CUDA_SAFE_CALL(cudaEventCreate(&start));
CUDA_SAFE_CALL(cudaEventCreate(&afterInit));
CUDA_SAFE_CALL(cudaEventCreate(&stop));
CUDA_SAFE_CALL(cudaEventRecord(start, 0));
dim3 gridSize(gridEdgeSize, gridEdgeSize);
dim3 blockSize(blockEdgeSize, blockEdgeSize);
dim3 blockSizeBlockedInsert(1, 1);
......@@ -625,69 +599,48 @@ BOOST_AUTO_TEST_CASE(testStencilHeatZ)
mgpu::ofp_context_t ctx;
sparseGrid.template setBackgroundValue<0>(0);
// Initialize the grid
// sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
// insertConstantValue<0> << < gridSize, blockSize >> > (sparseGrid.toKernel(), 0);
// sparseGrid.flush < sRight_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
//
// sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
// dim3 sourcePt(gridSize.x * blockEdgeSize / 2, gridSize.y * blockEdgeSize / 2, 0);
// insertOneValue<0> << < gridSize, blockSize >> > (sparseGrid.toKernel(), sourcePt, 100);
// sparseGrid.flush < sRight_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
//
// sparseGrid.findNeighbours(); // Pre-compute the neighbours pos for each block!
if (prePopulateGrid)
for (unsigned int iter=0; iter<5; ++iter)
{
// Pre-populate grid
sparseGrid.setGPUInsertBuffer(gridSize, blockSize);
insertValues2D<0> << < gridSize, blockSize >> > (sparseGrid.toKernel(), 0, 0);
auto offset = 0;
sparseGrid.setGPUInsertBuffer(gridSize, blockSizeBlockedInsert);
insertValues2DBlocked<0, 1, blockEdgeSize> << < gridSize, blockSize >> >
(sparseGrid.toKernel(), offset, offset);
sparseGrid.flush < smax_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
cudaDeviceSynchronize();
///
}
CUDA_SAFE_CALL(cudaEventRecord(afterInit, 0));
CUDA_SAFE_CALL(cudaEventSynchronize(afterInit));
cudaDeviceSynchronize();
timer ts;
ts.start();
for (unsigned int iter=0; iter<iterations; ++iter)
{
// auto offset = iter * 99999 % 32003;
auto offset = 0;
sparseGrid.setGPUInsertBuffer(gridSize, blockSizeBlockedInsert);
insertValues2DBlocked<0, 1, blockEdgeSize> << < gridSize, blockSize >> >
(sparseGrid.toKernel(), offset, offset);
sparseGrid.flush < smax_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
cudaDeviceSynchronize();
}
CUDA_SAFE_CALL(cudaEventRecord(stop, 0));
CUDA_SAFE_CALL(cudaEventSynchronize(stop));
CUDA_SAFE_CALL(cudaEventElapsedTime(&timeInit, start, afterInit));
CUDA_SAFE_CALL(cudaEventElapsedTime(&timeStencil, afterInit, stop));
CUDA_SAFE_CALL(cudaEventElapsedTime(&timeTotal, start, stop));
cudaDeviceSynchronize();
timeInitAvg += timeInit;
timeStencilAvg += timeStencil;
timeTotalAvg += timeTotal;
ts.stop();
timeInsertAvg += ts.getwct();
}
timeInitAvg /= repetitions;
timeStencilAvg /= repetitions;
timeTotalAvg /= repetitions;
timeInsertAvg /= repetitions;
// cudaEvent timings above are in ms; timer::getwct() returns wall-clock seconds
unsigned long long numElements = gridEdgeSize*blockEdgeSize*gridEdgeSize*blockEdgeSize;
float mElemS = numElements * iterations / (1e6 * timeStencilAvg/1000);
// float gFlopsS = gElemS * StencilT::flops;
float stencilSingleTimingMillis = timeStencilAvg/iterations;
float mElemS = numElements * iterations / (1e6 * timeInsertAvg);
float stencilInsertTimingMillis = timeInsertAvg/iterations;
printf("Test: %s\n", testName);
printf("Grid: %ux%u\n", gridEdgeSize*blockEdgeSize, gridEdgeSize*blockEdgeSize);
printf("Iterations: %u\n", iterations);
printf("Timing (avg on %u repetitions):\n\tInit: %f ms\n\tStencil: %f ms\n\tTotal: %f ms\n",
repetitions, timeInitAvg, timeStencilAvg, timeTotalAvg);
printf("Stencil details:\n\tSingle application timing: %f ms\n", stencilSingleTimingMillis);
std::cout << "Timing (avg on " << repetitions << " repetitions):" << std::endl;
std::cout << "\tInsert: " << stencilInsertTimingMillis << " ms" << std::endl;
// printf("Throughput:\n\t%f GElem/s\n\t%f GFlops/s\n", gElemS, gFlopsS);
printf("Throughput:\n\t%f MElem/s\n", mElemS);
......