Commit 6ae61146 authored by incardon

Fixing SparseGridGPU NN block index

parent f4f307e3
......@@ -415,7 +415,8 @@ __global__ void count_nn_cells(cl_sparse_type cl_sparse, vector_type output, vec
typedef typename cl_sparse_type::index_type index_type;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
openfpm::sparse_index<index_type> id(tid);
openfpm::sparse_index<index_type> id;
id.id = tid;
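// (the __device__ constructor of sparse_index is removed later in this commit,
// so the index is now default-constructed and its id field assigned explicitly)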
if (tid >= cl_sparse.size()) {return;}
......@@ -441,7 +442,8 @@ __global__ void fill_nn_cells(cl_sparse_type cl_sparse, vector_type starts, vect
typedef typename cl_sparse_type::index_type index_type;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
openfpm::sparse_index<index_type> id(tid);
openfpm::sparse_index<index_type> id;
id.id = tid;
if (tid >= cl_sparse.size()) {return;}
......@@ -463,7 +465,11 @@ __global__ void fill_nn_cells(cl_sparse_type cl_sparse, vector_type starts, vect
if (sid.id == cl_sparse.size() - 1)
{output.template get<1>(starts.template get<0>(tid) + cnt) = max_stop;}
else
{output.template get<1>(starts.template get<0>(tid) + cnt) = cl_sparse.template get<0>(decltype(sid)(sid.id+1));}
{
decltype(sid) sid_t;
sid_t.id = sid.id+1;
output.template get<1>(starts.template get<0>(tid) + cnt) = cl_sparse.template get<0>(sid_t);
}
++cnt;
}
}
......
......@@ -188,20 +188,20 @@ namespace BlockMapGpuKernels
// GridSize = number of segments
// BlockSize = chunksPerBlock * chunkSize
//
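// A possible launch sketch (illustrative, mirroring the unit test added below:
// one CUDA block per segment, chunksPerBlock * chunkSize threads per block):
//   CUDA_LAUNCH_DIM3((BlockMapGpuKernels::segreduce_total<BLOCK, 0, BITMASK, 2, mgpu::plus_t<ScalarT>>),
//                    segments.size() - 1, 2 * BlockT::size,
//                    data_new.toKernel(), data_old.toKernel(), segments.toKernel(),
//                    segment_dataMap.toKernel(), segments_oldData.toKernel(),
//                    outputMap.toKernel(), outputData.toKernel());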
/* template<unsigned int p,
template<unsigned int p,
unsigned int pSegment,
unsigned int pMask,
unsigned int chunksPerBlock,
typename op,
typename IndexVectorT, typename DataVectorT, typename MaskVectorT>
typename IndexVectorT, typename DataVectorT>
__global__ void
segreduce_total(
DataVectorT data,
DataVectorT data_new,
DataVectorT data_old,
IndexVectorT segments_data,
IndexVectorT segments_dataMap,
IndexVectorT segments_dataOld,
IndexVectorT outputMap,
MaskVectorT masks,
DataVectorT output
)
{
......@@ -213,10 +213,10 @@ namespace BlockMapGpuKernels
constexpr unsigned int chunkSize = BaseBlockType::size;
unsigned int segmentId = blockIdx.x;
int segmentSize = segments.template get<pSegment>(segmentId + 1)
- segments.template get<pSegment>(segmentId);
int segmentSize = segments_data.template get<pSegment>(segmentId + 1)
- segments_data.template get<pSegment>(segmentId);
unsigned int start = segments.template get<pSegment>(segmentId);
unsigned int start = segments_data.template get<pSegment>(segmentId);
unsigned int chunkId = threadIdx.x / chunkSize;
unsigned int offset = threadIdx.x % chunkSize;
......@@ -230,41 +230,56 @@ namespace BlockMapGpuKernels
if (chunkId < segmentSize)
{
unsigned int m_chunkId = segments_dataMap.template get<0>(start + chunkId);
A[chunkId][offset] = RhsBlockWrapper<DataType>(data.template get<p>(m_chunkId), offset).value;
aMask = masks.template get<pMask>(m_chunkId)[offset];
A[chunkId][offset] = RhsBlockWrapper<DataType>(data_new.template get<p>(m_chunkId), offset).value;
aMask = data_new.template get<pMask>(m_chunkId)[offset];
}
//////////////////////////// Horizontal reduction (work on data) //////////////////////////////////////////////////
//////////////////////////// We reduce chunksPerBlock chunks of the segment per loop iteration
int i = chunksPerBlock;
for (; i < segmentSize - (int) (chunksPerBlock); i += chunksPerBlock)
{
unsigned int m_chunkId = segments_dataMap.template get<0>(start + i + chunkId);
// bReg = data_new.template get<p>(m_chunkId)
generalDimensionFunctor<decltype(bReg)>::assignWithOffsetRHS(bReg,
data.template get<p>(m_chunkId),
data_new.template get<p>(m_chunkId),
offset);
bMask = masks.template get<pMask>(m_chunkId)[offset];
bMask = data_new.template get<pMask>(m_chunkId)[offset];
// reduce A[chunkId][offset] with bReg
generalDimensionFunctor<DataType>::template applyOp<op>(A[chunkId][offset],
bReg,
BlockMapGpu_ker<>::exist(aMask),
BlockMapGpu_ker<>::exist(bMask));
// reduce aMask with bMask
aMask = aMask | bMask;
}
if (i + chunkId < segmentSize)
{
unsigned int m_chunkId = segments_dataMap.template get<0>(start + i + chunkId);
// bReg = data_new.template get<p>(m_chunkId)
generalDimensionFunctor<decltype(bReg)>::assignWithOffsetRHS(bReg,
data.template get<p>(m_chunkId),
data_new.template get<p>(m_chunkId),
offset);
bMask = masks.template get<pMask>(m_chunkId)[offset];
bMask = data_new.template get<pMask>(m_chunkId)[offset];
// reduce A[chunkId][offset] with bReg
generalDimensionFunctor<DataType>::template applyOp<op>(A[chunkId][offset],
bReg,
BlockMapGpu_ker<>::exist(aMask),
BlockMapGpu_ker<>::exist(bMask));
// reduce aMask with bMask
aMask = aMask | bMask;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////
AMask[chunkId][offset] = aMask;
__syncthreads();
......@@ -292,6 +307,19 @@ namespace BlockMapGpuKernels
//////////////////////////////////////// Reduce now with the old data, if a link to it is present
int seg_old = segments_dataOld.template get<0>(segmentId);
if (seg_old != -1)
{
aMask = AMask[0][offset];
bMask = data_old.template get<pMask>(seg_old)[offset];
generalDimensionFunctor<DataType>::template applyOp<op>(A[0][offset],
data_old.template get<p>(seg_old)[offset],
BlockMapGpu_ker<>::exist(aMask),
BlockMapGpu_ker<>::exist(bMask));
AMask[0][offset] = aMask | bMask;
}
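// Sketch of the link semantics (taken from the unit test added below): segments_dataOld
// holds, per segment, either -1 (no pre-existing chunk) or the position of the matching
// chunk in data_old, e.g.
//   segments_oldData.template get<0>(0) = -1;  // segment 0 has no old counterpart
//   segments_oldData.template get<0>(1) = 2;   // segment 1 is merged with data_old chunk 2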
///////////////////////////////////////////////////////////////////////////////////
// Write output
......@@ -304,7 +332,7 @@ namespace BlockMapGpuKernels
#else // __NVCC__
std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
#endif // __NVCC__
}*/
}
......
......@@ -293,7 +293,7 @@ public:
template<unsigned int p, typename AggrWrapperT>
inline __device__ void
loadGhostBlock(const AggrWrapperT & dataBlockLoad, const unsigned int blockLinId, ScalarTypeOf<AggregateBlockT, p> *sharedRegion)
loadGhostBlock(const AggrWrapperT & dataBlockLoad, const openfpm::sparse_index<unsigned int> blockLinId, ScalarTypeOf<AggregateBlockT, p> *sharedRegion)
{
__loadGhostBlock<p>(dataBlockLoad,blockLinId, sharedRegion);
}
......@@ -554,7 +554,7 @@ private:
*/
template<unsigned int p, typename AggrWrapperT ,typename SharedPtrT>
inline __device__ void
__loadGhostBlock(const AggrWrapperT &block, const unsigned int blockId, SharedPtrT * sharedRegionPtr)
__loadGhostBlock(const AggrWrapperT &block, const openfpm::sparse_index<unsigned int> blockId, SharedPtrT * sharedRegionPtr)
{
loadGhostBlock_impl<ct_params::nLoop,dim,AggrWrapperT,p,ct_params,blockEdgeSize>::load(block,
sharedRegionPtr,
......@@ -563,7 +563,7 @@ private:
this->blockMap,
stencilSupportRadius,
ghostLayerSize,
blockId);
blockId.id);
}
template<unsigned int p, unsigned int ... props>
......
......@@ -73,7 +73,7 @@ struct loadGhostBlock_impl
const blockMapType & blockMap,
unsigned int stencilSupportRadius,
unsigned int ghostLayerSize,
unsigned int blockId)
const unsigned int blockId)
{
printf("Error to implement loadGhostBlock_impl with nLoop=%d \n",nLoop);
}
......@@ -90,7 +90,7 @@ struct loadGhostBlock_impl<1,dim,AggregateBlockT,p,ct_params,blockEdgeSize>
const blockMapType & blockMap,
unsigned int stencilSupportRadius,
unsigned int ghostLayerSize,
unsigned int blockId)
const unsigned int blockIdPos)
{
typedef ScalarTypeOf<AggregateBlockT, p> ScalarT;
......@@ -122,7 +122,8 @@ struct loadGhostBlock_impl<1,dim,AggregateBlockT,p,ct_params,blockEdgeSize>
unsigned int coord[dim];
linToCoordWithOffset<blockEdgeSize>(threadIdx.x, stencilSupportRadius, coord);
const int linId2 = coordToLin<blockEdgeSize>(coord, stencilSupportRadius);
unsigned int nnb = nn_blocks.template get<0>(blockId*ct_params::nNN + (threadIdx.x % ct_params::nNN));
unsigned int nnb = nn_blocks.template get<0>(blockIdPos*ct_params::nNN + (threadIdx.x % ct_params::nNN));
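// nn_blocks is laid out per block position in the block map, so it must be indexed
// with blockIdPos rather than the raw block id (this is the NN block index fix this
// commit refers to)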
if (threadIdx.x < ct_params::nNN)
{
......
......@@ -51,7 +51,15 @@ namespace SparseGridGpuKernels
const auto dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
auto dataBlock = dataBuffer.get(dataBlockPos); // Avoid binary searches as much as possible
sparseGrid.loadGhostBlock<pMask>(dataBlock,dataBlockId,enlargedBlock);
if (dataBlockId < 0)
{
printf("Negative Datablock \n");
return;
}
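// loadGhostBlock now takes the block position in the data buffer as an
// openfpm::sparse_index<unsigned int> (see the signature change above) instead of a plain id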
openfpm::sparse_index<unsigned int> sdataBlockPos;
sdataBlockPos.id = dataBlockPos;
sparseGrid.loadGhostBlock<pMask>(dataBlock,sdataBlockPos,enlargedBlock);
__syncthreads();
......@@ -173,8 +181,11 @@ namespace SparseGridGpuKernels
applyStencilHere = false;
}
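// the stencil functor now also receives the block position in the data buffer,
// so it can hand it to loadGhostBlock and index the pre-computed neighbour blocks directly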
openfpm::sparse_index<unsigned int> sdataBlockPos;
sdataBlockPos.id = dataBlockPos;
stencil::stencil(
sparseGrid, dataBlockId, offset, pointCoord, dataBlockLoad, dataBlockLoad,
sparseGrid, dataBlockId, sdataBlockPos , offset, pointCoord, dataBlockLoad, dataBlockLoad,
applyStencilHere, args...);
}
......@@ -238,8 +249,11 @@ namespace SparseGridGpuKernels
applyStencilHere = false;
}
openfpm::sparse_index<unsigned int> sdataBlockId;
sdataBlockId.id = dataBlockPos;
stencil::stencil(
sparseGrid, dataBlockId, offset, pointCoord, dataBlockLoad, dataBlockStore,
sparseGrid, dataBlockId, sdataBlockId, offset, pointCoord, dataBlockLoad, dataBlockStore,
applyStencilHere, args...);
sparseGrid.setExist(dataBlockStore.template get<pMask>()[offset]);
......
......@@ -223,6 +223,7 @@ struct HeatStencil
static inline __device__ void stencil(
SparseGridT & sparseGrid,
const unsigned int dataBlockId,
const openfpm::sparse_index<unsigned int> dataBlockIdPos,
unsigned int offset,
const grid_key_dx<dim, int> & pointCoord,
const DataBlockWrapperT & dataBlockLoad,
......@@ -238,7 +239,7 @@ struct HeatStencil
__shared__ ScalarT enlargedBlock[enlargedBlockSize];
sparseGrid.loadGhostBlock<p_src>(dataBlockLoad,dataBlockId, enlargedBlock);
sparseGrid.loadGhostBlock<p_src>(dataBlockLoad,dataBlockIdPos, enlargedBlock);
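// the ghost block is now loaded through the block position (dataBlockIdPos) passed in
// by the caller, matching the new loadGhostBlock signature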
__syncthreads();
......
......@@ -89,6 +89,157 @@ BOOST_AUTO_TEST_CASE (testSegreduce)
}
}
BOOST_AUTO_TEST_CASE (testSegreduce_total)
{
typedef float ScalarT;
typedef DataBlock<unsigned char, 64> MaskBlockT;
typedef DataBlock<ScalarT, 64> BlockT;
openfpm::vector_gpu<aggregate<int>> segments;
openfpm::vector_gpu<aggregate<int>> segment_dataMap;
openfpm::vector_gpu<aggregate<int>> outputMap;
openfpm::vector_gpu<aggregate<int>> segments_oldData;
// segment reduction for the newly added data
segments.resize(8);
segments.template get<0>(0) = 0;
segments.template get<0>(1) = 4;
segments.template get<0>(2) = 5;
segments.template get<0>(3) = 7;
segments.template get<0>(4) = 8;
segments.template get<0>(5) = 11;
segments.template get<0>(6) = 17;
segments.template get<0>(7) = 18; // Id of first non-existent data
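// with these boundaries, segment i covers entries [segments(i), segments(i+1)) of
// segment_dataMap, e.g. segment 0 spans entries 0..3 and segment 5 spans entries 11..16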
// segments_oldData indicates, per segment, whether the new data also has old data
// to be reduced with
segments_oldData.resize(8);
segments_oldData.template get<0>(0) = -1;
segments_oldData.template get<0>(1) = 2;
segments_oldData.template get<0>(2) = -1;
segments_oldData.template get<0>(3) = 5;
segments_oldData.template get<0>(4) = 7;
segments_oldData.template get<0>(5) = -1;
segments_oldData.template get<0>(6) = -1;
segments_oldData.template get<0>(7) = -1; // Id of first non-existent data
// for each added index we have a chunk in the vct_add_data vector;
// unfortunately it is not in segment order, so segment_dataMap gives the mapping
segment_dataMap.resize(20);
segment_dataMap.template get<0>(0) = 10;
segment_dataMap.template get<0>(1) = 1;
segment_dataMap.template get<0>(2) = 50;
segment_dataMap.template get<0>(3) = 11;
segment_dataMap.template get<0>(4) = 13;
segment_dataMap.template get<0>(5) = 87;
segment_dataMap.template get<0>(6) = 54;
segment_dataMap.template get<0>(7) = 33;
segment_dataMap.template get<0>(8) = 22;
segment_dataMap.template get<0>(9) = 17;
segment_dataMap.template get<0>(10) = 40;
segment_dataMap.template get<0>(11) = 32;
segment_dataMap.template get<0>(12) = 80;
segment_dataMap.template get<0>(13) = 52;
segment_dataMap.template get<0>(14) = 21;
segment_dataMap.template get<0>(15) = 76;
segment_dataMap.template get<0>(16) = 65;
segment_dataMap.template get<0>(17) = 54;
segment_dataMap.template get<0>(18) = 3;
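// each entry maps a position inside the concatenated segments to the chunk of data_new
// that actually holds it: the kernel reads
// data_new.template get<p>(segment_dataMap.template get<0>(start + chunkId))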
outputMap.resize(15);
outputMap.template get<0>(0) = 9;
outputMap.template get<0>(1) = 11;
outputMap.template get<0>(2) = 13;
outputMap.template get<0>(3) = 34;
outputMap.template get<0>(4) = 23;
outputMap.template get<0>(5) = 90;
outputMap.template get<0>(6) = 21;
outputMap.template get<0>(7) = 11;
outputMap.template get<0>(8) = 15;
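// outputMap presumably gives, for each segment, the destination slot in the output
// vector (the write-out part of the kernel is outside the hunks above, so this is inferred)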
segments.template hostToDevice<0>();
segment_dataMap.hostToDevice<0>();
segments_oldData.hostToDevice<0>();
outputMap.hostToDevice<0>();
const unsigned int BITMASK = 0, BLOCK = 1;
BlockT block;
MaskBlockT mask;
BlockT block_old;
MaskBlockT mask_old;
for (int i = 0; i < 32; ++i)
{
block[i] = i + 1;
mask[i] = 1;
block_old[i] = 100 + i + 1;
mask_old[i] = 1;
}
for (int i = 32; i < 64; ++i)
{
block[i] = 666;
mask[i] = 0;
block_old[i] = 666;
mask_old[i] = 0;
}
block_old[33] = 665;
mask_old[33] = 1;
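// entry 33 exists only in the old data (its mask is 0 in the new blocks), so it
// exercises the branch that merges pre-existing chunks via segments_oldData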
openfpm::vector_gpu<aggregate<MaskBlockT, BlockT>> data_old;
openfpm::vector_gpu<aggregate<MaskBlockT, BlockT>> data_new;
data_new.resize(100);
data_old.resize(100);
for (int i = 0; i < 100; ++i)
{
data_new.template get<BITMASK>(i) = mask;
data_new.template get<BLOCK>(i) = block;
if (i < data_old.size())
{
data_old.template get<BITMASK>(i) = mask_old;
data_old.template get<BLOCK>(i) = block_old;
}
}
data_new.template hostToDevice<BITMASK, BLOCK>();
data_old.template hostToDevice<BITMASK, BLOCK>();
// Allocate output buffer
openfpm::vector_gpu<aggregate<MaskBlockT, BlockT>> outputData;
outputData.resize(100);
CUDA_LAUNCH_DIM3((BlockMapGpuKernels::segreduce_total<BLOCK, 0, BITMASK, 2, mgpu::plus_t<ScalarT>>),segments.size()-1, 2*BlockT::size,
data_new.toKernel(),
data_old.toKernel(),
segments.toKernel(),
segment_dataMap.toKernel(),
segments_oldData.toKernel(),
outputMap.toKernel(),
outputData.toKernel());
// Segreduce on mask
CUDA_LAUNCH_DIM3((BlockMapGpuKernels::segreduce_total<BITMASK, 0, BITMASK, 2, mgpu::maximum_t<unsigned char>>),segments.size()-1, 2*BlockT::size,
data_new.toKernel(),
data_old.toKernel(),
segments.toKernel(),
segment_dataMap.toKernel(),
segments_oldData.toKernel(),
outputMap.toKernel(),
outputData.toKernel());
/* outputData.template deviceToHost<BITMASK, BLOCK>();
for (int j = 0; j < outputData.size(); ++j)
{
BlockT outBlock = outputData.template get<BLOCK>(j);
MaskBlockT outMask = outputData.template get<BITMASK>(j);
for (int i = 0; i < BlockT::size; ++i)
{
std::cout << outBlock[i] << ":" << (int)outMask[i] << " ";
}
std::cout << std::endl;
}*/
}
BOOST_AUTO_TEST_CASE (testSegreduce_vector)
{
typedef float ScalarT;
......
......@@ -315,6 +315,7 @@ struct HeatStencil
static inline __device__ void stencil(
SparseGridT & sparseGrid,
const unsigned int dataBlockId,
openfpm::sparse_index<unsigned int> dataBlockIdPos,
unsigned int offset,
grid_key_dx<dim, int> & pointCoord,
DataBlockWrapperT & dataBlockLoad,
......@@ -332,7 +333,7 @@ struct HeatStencil
// sparseGrid.loadBlock<p>(dataBlockLoad, enlargedBlock);
// sparseGrid.loadGhost<p>(dataBlockId, enlargedBlock);
sparseGrid.loadGhostBlock<p_src>(dataBlockLoad,dataBlockId,enlargedBlock);
sparseGrid.loadGhostBlock<p_src>(dataBlockLoad,dataBlockIdPos,enlargedBlock);
// sparseGrid.loadGhost<p>(dataBlockId, nullptr, enlargedBlock);
__syncthreads();
......@@ -822,9 +823,6 @@ BOOST_AUTO_TEST_CASE(testStencilHeat)
// // Now tag the boundaries
sparseGrid.tagBoundaries();
sparseGrid.deviceToHost<0>();
sparseGrid.write("test_heat_stencil.vtk");
// Now apply the laplacian operator
const unsigned int maxIter = 1000;
// const unsigned int maxIter = 10;
......@@ -834,6 +832,9 @@ BOOST_AUTO_TEST_CASE(testStencilHeat)
sparseGrid.applyStencils<HeatStencil<dim, 1,0>>(STENCIL_MODE_INPLACE, 0.1);
}
sparseGrid.deviceToHost<0>();
sparseGrid.write("test_heat_stencil.vtk");
// Get output
openfpm::vector_gpu<AggregateT> output;
output.resize(4 * 64);
......@@ -1022,7 +1023,7 @@ BOOST_AUTO_TEST_CASE(testSparseGridGpuOutput)
sparseGrid.flush < smax_< 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
sparseGrid.findNeighbours(); // Pre-compute the neighbours pos for each block!
// sparseGrid.tagBoundaries();
sparseGrid.tagBoundaries();
sparseGrid.template deviceToHost<0>();
......
......@@ -31,10 +31,6 @@ namespace openfpm
struct sparse_index
{
index_type id;
__device__ sparse_index(index_type id)
:id(id)
{}
};
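// With the __device__ constructor removed, sparse_index is an aggregate that is built
// the same way on host and device; a minimal sketch mirroring the call sites updated
// in this commit:
//   openfpm::sparse_index<Ti> sid;
//   sid.id = di;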
static __shared__ int vct_atomic_add;
......@@ -186,7 +182,8 @@ namespace openfpm
{
Ti di;
this->_branchfree_search(id,di);
openfpm::sparse_index<Ti> sid(di);
openfpm::sparse_index<Ti> sid;
sid.id = di;
return sid;
}
......