Commit e9ac5c0d authored by incardon

Refactoring for TLS

parent dbc3439f
Pipeline #4114 failed with stages in 9 minutes and 48 seconds
@@ -366,6 +366,7 @@ install(FILES util/check_no_pointers.hpp
 	util/math_util_complex.hpp
 	util/mul_array_extents.hpp
 	util/hostDevice_util_funcs.hpp
+	util/sparsegrid_util_common.hpp
 	DESTINATION openfpm_data/include/util
 	COMPONENT OpenFPM)
......
@@ -498,8 +498,8 @@ struct device_grid
  * \snippet grid_unit_tests.hpp Access to an N-dimensional grid with an iterator
  *
  */
-template<unsigned int dim, typename T, typename S>
-class grid_base<dim,T,S,typename memory_traits_inte<T>::type> : public grid_base_impl<dim,T,S, memory_traits_inte>
+template<unsigned int dim, typename T, typename S, typename linearizer>
+class grid_base<dim,T,S,typename memory_traits_inte<T>::type,linearizer> : public grid_base_impl<dim,T,S, memory_traits_inte,linearizer>
 {
 	typedef typename apply_transform<memory_traits_inte,T>::type T_;
@@ -512,14 +512,14 @@ public:
 	//! Object container for T, it is the return type of get_o it return a object type trough
 	// you can access all the properties of T
-	typedef typename grid_base_impl<dim,T,S, memory_traits_inte>::container container;
+	typedef typename grid_base_impl<dim,T,S, memory_traits_inte,linearizer>::container container;

 	//! linearizer type Z-morton Hilbert curve , normal striding
-	typedef typename grid_base_impl<dim,T,S, memory_traits_inte>::linearizer_type linearizer_type;
+	typedef typename grid_base_impl<dim,T,S, memory_traits_inte,linearizer>::linearizer_type linearizer_type;

 	//! Default constructor
 	inline grid_base() THROW
-	:grid_base_impl<dim,T,S,memory_traits_inte>()
+	:grid_base_impl<dim,T,S,memory_traits_inte,linearizer>()
 	{
 	}
@@ -529,7 +529,7 @@ public:
 	 *
 	 */
 	inline grid_base(const grid_base & g) THROW
-	:grid_base_impl<dim,T,S,memory_traits_inte>(g)
+	:grid_base_impl<dim,T,S,memory_traits_inte,linearizer>(g)
 	{
 	}
@@ -539,7 +539,7 @@ public:
 	 *
 	 */
 	inline grid_base(grid_base && g) THROW
-	:grid_base_impl<dim,T,S,memory_traits_inte>(g)
+	:grid_base_impl<dim,T,S,memory_traits_inte,linearizer>(g)
 	{
 	}
@@ -549,13 +549,13 @@ public:
 	 *
 	 */
 	inline grid_base(const size_t & sz) THROW
-	:grid_base_impl<dim,T,S,memory_traits_inte>(sz)
+	:grid_base_impl<dim,T,S,memory_traits_inte,linearizer>(sz)
 	{
 	}

 	//! Constructor allocate memory and give them a representation
 	inline grid_base(const size_t (& sz)[dim]) THROW
-	:grid_base_impl<dim,T,S,memory_traits_inte>(sz)
+	:grid_base_impl<dim,T,S,memory_traits_inte,linearizer>(sz)
 	{
 	}
@@ -657,9 +657,9 @@ public:
 	 * \return itself
 	 *
 	 */
-	grid_base<dim,T,S,typename memory_traits_inte<T>::type> & operator=(const grid_base_impl<dim,T,S, memory_traits_inte> & base)
+	grid_base<dim,T,S,typename memory_traits_inte<T>::type,linearizer> & operator=(const grid_base_impl<dim,T,S, memory_traits_inte,linearizer> & base)
 	{
-		grid_base_impl<dim,T,S, memory_traits_inte>::operator=(base);
+		grid_base_impl<dim,T,S, memory_traits_inte,linearizer>::operator=(base);

 		return *this;
 	}
@@ -669,9 +669,9 @@ public:
 	 * \return itself
 	 *
 	 */
-	grid_base<dim,T,S,typename memory_traits_inte<T>::type> & operator=(grid_base_impl<dim,T,S, memory_traits_inte> && base)
+	grid_base<dim,T,S,typename memory_traits_inte<T>::type,linearizer> & operator=(grid_base_impl<dim,T,S, memory_traits_inte,linearizer> && base)
 	{
-		grid_base_impl<dim,T,S, memory_traits_inte>::operator=(base);
+		grid_base_impl<dim,T,S, memory_traits_inte,linearizer>::operator=(base);

 		return *this;
 	}
@@ -685,7 +685,7 @@ public:
 };

 //! short formula for a grid on gpu
-template <unsigned int dim, typename T, typename linearizer = grid_sm<dim,void> > using grid_gpu = grid_base<dim,T,CudaMemory,typename memory_traits_inte<T>::type>;
+template <unsigned int dim, typename T, typename linearizer = grid_sm<dim,void> > using grid_gpu = grid_base<dim,T,CudaMemory,typename memory_traits_inte<T>::type,linearizer>;

 //! short formula for a grid on gpu
 template <unsigned int dim, typename T, typename linearizer = grid_sm<dim,void> > using grid_cpu = grid_base<dim,T,HeapMemory,typename memory_traits_lin<T>::type,linearizer>;
......
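Note on the grid_base changes above: they thread a new linearizer template parameter through the memory_traits_inte specialization and all its base-class references, so the index-computation scheme (normal striding, Z-morton, Hilbert curve) becomes a compile-time choice; the grid_gpu and grid_cpu aliases default it to grid_sm<dim,void>. A minimal usage sketch (not part of the commit), assuming the usual OpenFPM aggregate layout and the setMemory/get API, with illustrative sizes:

    // A 3D CPU grid with the default striding linearizer spelled out explicitly;
    // a space-filling-curve linearizer type could be substituted here instead.
    size_t sz[3] = {64, 64, 64};
    grid_cpu<3, aggregate<float>, grid_sm<3, void>> g(sz);
    g.setMemory();                  // allocate the HeapMemory backing store
    grid_key_dx<3> k(1, 2, 3);
    g.get<0>(k) = 1.0f;             // element access is routed through the linearizer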
@@ -169,7 +169,7 @@ template<unsigned int dim ,typename T> class Point
 	 * \return the i-coordinate of the point
 	 *
 	 */
-	__device__ __host__ inline const T & get(int i) const
+	__device__ __host__ inline const T & get(unsigned int i) const
 	{
 		return boost::fusion::at_c<x>(data)[i];
 	}
@@ -192,7 +192,7 @@ template<unsigned int dim ,typename T> class Point
 	 * \return the i-coordinate of the point
 	 *
 	 */
-	__device__ __host__ inline T& get(int i)
+	__device__ __host__ inline T& get(unsigned int i)
 	{
 		return boost::fusion::at_c<x>(data)[i];
 	}
......
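The Point<dim,T>::get accessors above change the index type from int to unsigned int: a coordinate index is never negative, and the unsigned signature avoids sign-conversion warnings when the caller iterates with an unsigned counter, the dominant pattern in this codebase. A small sketch (not part of the commit), assuming Point's array constructor:

    // Unsigned loop counters now match the accessor signature exactly.
    Point<3, float> p({1.0f, 2.0f, 3.0f});
    float sum = 0.0f;
    for (unsigned int i = 0; i < 3; i++)
    {sum += p.get(i);}              // no int/unsigned mismatch at the call site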
@@ -953,11 +953,11 @@ private:
 	void applyStencilInPlace(const Box<dim,int> & box, StencilMode & mode,Args... args)
 	{
 		// Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
-		auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
-		auto & dataBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
+		auto & indexBuffer_ = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
+		auto & dataBuffer_ = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
 		const unsigned int dataChunkSize = BlockTypeOf<AggregateBlockT, 0>::size;
-		unsigned int numScalars = indexBuffer.size() * dataChunkSize;
+		unsigned int numScalars = indexBuffer_.size() * dataChunkSize;
 		if (numScalars == 0) return;
@@ -970,16 +970,79 @@ private:
 		constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value - IntPow<blockEdgeSize, dim>::value), (blockSize * chunksPerBlock)>::value; // todo: This works only for stencilSupportSize==1

+#ifdef CUDIFY_USE_CUDA
 		CUDA_LAUNCH_DIM3((SparseGridGpuKernels::applyStencilInPlace
 				<dim,
 				BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
 				stencil>),
 				threadGridSize, localThreadBlockSize,
 				box,
-				indexBuffer.toKernel(),
-				dataBuffer.toKernel(),
+				indexBuffer_.toKernel(),
+				dataBuffer_.toKernel(),
 				this->template toKernelNN<stencil::stencil_type::nNN, nLoop>(),
 				args...);
+#else
+		auto bx = box;
+		auto indexBuffer = indexBuffer_.toKernel();
+		auto dataBuffer = dataBuffer_.toKernel();
+		auto sparseGrid = this->template toKernelNN<stencil::stencil_type::nNN, nLoop>();
+		constexpr int pMask = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask;
+
+		auto lamb = [=] __device__ () mutable
+		{
+			constexpr unsigned int pIndex = 0;
+
+			typedef typename decltype(indexBuffer)::value_type IndexAggregateT;
+			typedef BlockTypeOf<IndexAggregateT , pIndex> IndexT;
+
+			typedef typename decltype(dataBuffer)::value_type AggregateT_;
+			typedef BlockTypeOf<AggregateT_, pMask> MaskBlockT;
+			typedef ScalarTypeOf<AggregateT_, pMask> MaskT;
+			constexpr unsigned int blockSize = MaskBlockT::size;
+
+			// NOTE: here we do 1 chunk per block! (we want to be sure to fit local memory constraints
+			// since we will be loading also neighbouring elements!) (beware curse of dimensionality...)
+			const unsigned int dataBlockPos = blockIdx.x;
+			const unsigned int offset = threadIdx.x;
+
+			if (dataBlockPos >= indexBuffer.size())
+			{
+				return;
+			}
+
+			auto dataBlockLoad = dataBuffer.get(dataBlockPos); // Avoid binary searches as much as possible
+
+			// todo: Add management of RED-BLACK stencil application! :)
+			const unsigned int dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
+			grid_key_dx<dim, int> pointCoord = sparseGrid.getCoord(dataBlockId * blockSize + offset);
+
+			unsigned char curMask;
+
+			if (offset < blockSize)
+			{
+				// Read local mask to register
+				curMask = dataBlockLoad.template get<pMask>()[offset];
+				for (int i = 0 ; i < dim ; i++)
+				{curMask &= (pointCoord.get(i) < bx.getLow(i) || pointCoord.get(i) > bx.getHigh(i))?0:0xFF;}
+			}

+			openfpm::sparse_index<unsigned int> sdataBlockPos;
+			sdataBlockPos.id = dataBlockPos;
+
+			stencil::stencil(
+					sparseGrid, dataBlockId, sdataBlockPos , offset, pointCoord, dataBlockLoad, dataBlockLoad,
+					curMask, args...);
+		};
+
+		CUDA_LAUNCH_LAMBDA_DIM3_TLS(threadGridSize, localThreadBlockSize,lamb);
+#endif
 	}

 	template <typename stencil, typename... Args>
......
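This hunk is the heart of the TLS refactoring. Under CUDIFY_USE_CUDA the stencil still launches the __global__ kernel, while the new #else branch copies every piece of kernel state by value (the toKernel() views, the box, pMask) into a mutable __device__ lambda and runs it through CUDA_LAUNCH_LAMBDA_DIM3_TLS, so backends without native CUDA kernels can execute the same per-thread body. A reduced sketch of the capture-by-value pattern; someBuffer and process are hypothetical stand-ins:

    // Everything the lambda body touches is captured BY VALUE: toKernel() views
    // and plain PODs are safe to copy, whereas captured references to host-side
    // objects would dangle once the functor runs on a device backend.
    auto view = someBuffer.toKernel();       // hypothetical buffer exposing a view
    auto bx = box;                           // copy the POD launch parameter
    auto lamb = [=] __device__ () mutable
    {
        const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (tid >= view.size()) {return;}    // guard threads past the end
        process(view, bx, tid);              // hypothetical per-element work
    };
    CUDA_LAUNCH_LAMBDA_DIM3_TLS(threadGridSize, localThreadBlockSize, lamb);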
@@ -928,7 +928,6 @@ namespace SparseGridGpuKernels
 		nn_blocks.template get<0>(dataBlockPos*nNN_type::nNN + offset) = neighbourPos;
 	}

 	template <unsigned int dim,
 			unsigned int pMask,
 			typename stencil,
@@ -937,7 +936,7 @@ namespace SparseGridGpuKernels
 			typename SparseGridT,
 			typename... Args>
 	__global__ void
-	applyStencilInPlaceNoShared(
+	applyStencilInPlace(
 			Box<dim,int> bx,
 			IndexBufT indexBuffer,
 			DataBufT dataBuffer,
@@ -954,22 +953,19 @@ namespace SparseGridGpuKernels
 		typedef ScalarTypeOf<AggregateT, pMask> MaskT;
 		constexpr unsigned int blockSize = MaskBlockT::size;

-		int p = blockIdx.x * blockDim.x + threadIdx.x;
-
-		auto & pntBuff = sparseGrid.getPointBuffer();
+		// NOTE: here we do 1 chunk per block! (we want to be sure to fit local memory constraints
+		// since we will be loading also neighbouring elements!) (beware curse of dimensionality...)
+		const unsigned int dataBlockPos = blockIdx.x;
+		const unsigned int offset = threadIdx.x;

-		if (p >= pntBuff.size())
+		if (dataBlockPos >= indexBuffer.size())
 		{
 			return;
 		}

-		auto id = pntBuff.template get<0>(p);
-
-		const unsigned int dataBlockPos = id / blockSize;
-		const unsigned int offset = id % blockSize;
-
-		auto dataBlockLoad = dataBuffer.get(dataBlockPos);
+		auto dataBlockLoad = dataBuffer.get(dataBlockPos); // Avoid binary searches as much as possible

 		// todo: Add management of RED-BLACK stencil application! :)
 		const unsigned int dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
 		grid_key_dx<dim, int> pointCoord = sparseGrid.getCoord(dataBlockId * blockSize + offset);
@@ -979,7 +975,8 @@ namespace SparseGridGpuKernels
 		{
 			// Read local mask to register
 			curMask = dataBlockLoad.template get<pMask>()[offset];
-			if (bx.isInsideKey(pointCoord) == false) {curMask = 0;}
+			for (int i = 0 ; i < dim ; i++)
+			{curMask &= (pointCoord.get(i) < bx.getLow(i) || pointCoord.get(i) > bx.getHigh(i))?0:0xFF;}
 		}

 		openfpm::sparse_index<unsigned int> sdataBlockPos;
@@ -998,7 +995,7 @@ namespace SparseGridGpuKernels
 			typename SparseGridT,
 			typename... Args>
 	__global__ void
-	applyStencilInPlace(
+	applyStencilInPlaceNoShared(
 			Box<dim,int> bx,
 			IndexBufT indexBuffer,
 			DataBufT dataBuffer,
@@ -1015,19 +1012,22 @@ namespace SparseGridGpuKernels
 		typedef ScalarTypeOf<AggregateT, pMask> MaskT;
 		constexpr unsigned int blockSize = MaskBlockT::size;

-		// NOTE: here we do 1 chunk per block! (we want to be sure to fit local memory constraints
-		// since we will be loading also neighbouring elements!) (beware curse of dimensionality...)
-		const unsigned int dataBlockPos = blockIdx.x;
-		const unsigned int offset = threadIdx.x;
+		int p = blockIdx.x * blockDim.x + threadIdx.x;

-		if (dataBlockPos >= indexBuffer.size())
+		auto & pntBuff = sparseGrid.getPointBuffer();
+
+		if (p >= pntBuff.size())
 		{
 			return;
 		}

-		auto dataBlockLoad = dataBuffer.get(dataBlockPos); // Avoid binary searches as much as possible
+		auto id = pntBuff.template get<0>(p);
+
+		const unsigned int dataBlockPos = id / blockSize;
+		const unsigned int offset = id % blockSize;
+
+		auto dataBlockLoad = dataBuffer.get(dataBlockPos);

 		// todo: Add management of RED-BLACK stencil application! :)
 		const unsigned int dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
 		grid_key_dx<dim, int> pointCoord = sparseGrid.getCoord(dataBlockId * blockSize + offset);
@@ -1037,8 +1037,7 @@ namespace SparseGridGpuKernels
 		{
 			// Read local mask to register
 			curMask = dataBlockLoad.template get<pMask>()[offset];
-			for (int i = 0 ; i < dim ; i++)
-			{curMask &= (pointCoord.get(i) < bx.getLow(i) || pointCoord.get(i) > bx.getHigh(i))?0:0xFF;}
+			if (bx.isInsideKey(pointCoord) == false) {curMask = 0;}
 		}

 		openfpm::sparse_index<unsigned int> sdataBlockPos;
......
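The four hunks above swap the two kernels' bodies: applyStencilInPlace now names the one-chunk-per-block kernel with the branchless per-axis mask, and applyStencilInPlaceNoShared takes over the point-buffer indexing with the isInsideKey test. The two masking styles agree on the box test; a self-contained sketch (insideMask is a hypothetical helper, not part of the commit):

    // Returns 0xFF when p lies inside bx on every axis, 0x00 otherwise.
    // Equivalent to bx.isInsideKey(p), but the ?: form compiles to predicated
    // AND operations instead of a data-dependent branch, avoiding warp divergence.
    template<unsigned int dim>
    __device__ __host__ inline unsigned char insideMask(const grid_key_dx<dim,int> & p,
                                                        const Box<dim,int> & bx)
    {
        unsigned char m = 0xFF;
        for (int i = 0; i < dim; i++)
        {m &= (p.get(i) < bx.getLow(i) || p.get(i) > bx.getHigh(i)) ? 0 : 0xFF;}
        return m;
    }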