......@@ -17,8 +17,9 @@ if (CUDA_FOUND)
SparseGridGpu/BlockMapGpu.hpp
SparseGridGpu/BlockMapGpu_ker.cuh
SparseGridGpu/tests/BlockMapGpu_tests.cu
util/test/zmorton_unit_tests.cpp
SparseGridGpu/DataBlock.cuh SparseGridGpu/BlockMapGpu_kernels.cuh
util/cuda/segreduce_block_cuda.cuh util/cuda/test/segreduce_block_cuda_tests.cu SparseGridGpu/BlockMapGpu_dimensionalityWrappers.cuh SparseGridGpu/tests/BlockMapGpu_kernels_tests.cu SparseGridGpu/SparseGridGpu.hpp SparseGridGpu/tests/SparseGridGpu_tests.cu SparseGridGpu/SparseGridGpu_ker.cuh SparseGridGpu/SparseGridGpu_kernels.cuh)
util/cuda/segreduce_block_cuda.cuh util/cuda/test/segreduce_block_cuda_tests.cu SparseGridGpu/BlockMapGpu_dimensionalityWrappers.cuh SparseGridGpu/tests/BlockMapGpu_kernels_tests.cu SparseGridGpu/SparseGridGpu.hpp SparseGridGpu/tests/SparseGridGpu_tests.cu SparseGridGpu/SparseGridGpu_ker.cuh SparseGridGpu/SparseGridGpu_kernels.cuh SparseGridGpu/BlockCache/BlockCache.cuh SparseGridGpu/tests/Stencil_performance_tests.cu )
else ()
set(CUDA_SOURCES)
endif ()
......@@ -34,8 +35,7 @@ add_executable(mem_map ../../openfpm_devices/src/Memleak_check.cpp
Grid/copy_grid_unit_test.cpp NN/Mem_type/Mem_type_unit_tests.cpp
../../openfpm_devices/src/memory/HeapMemory.cpp
../../openfpm_devices/src/memory/PtrMemory.cpp
SparseGridGpu/Geometry/BlockGeometry.hpp
SparseGridGpu/Geometry/tests/BlockGeometry_tests.cpp)
SparseGridGpu/Geometry/tests/grid_smb_tests.cpp)
if (CMAKE_COMPILER_IS_GNUCC)
target_compile_options(mem_map PRIVATE "-Wno-deprecated-declarations")
......@@ -51,7 +51,10 @@ if (TEST_PERFORMANCE)
endif ()
if (CUDA_FOUND)
target_compile_options(mem_map PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} >)
# target_compile_options(mem_map PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} >)
# The options below are for better performance on CUDA
# "-rdc=true" (relocatable device code) would additionally be needed for Dynamic Parallelism and cooperative-group launches
target_compile_options(mem_map PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} -use_fast_math -arch=sm_61 -g -lineinfo >)
if (TEST_COVERAGE)
target_compile_options(mem_map PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: -Xcompiler "-fprofile-arcs -ftest-coverage" >)
endif ()
......
......@@ -702,7 +702,8 @@ public:
* \return The reference of the data
*
*/
template <unsigned int p> __device__ __host__ auto get() -> decltype(boost::fusion::at_c<p>(data).mem_r.operator[](k))
template <unsigned int p>
__device__ __host__ auto get() -> decltype(boost::fusion::at_c<p>(data).mem_r.operator[](k))
{
return boost::fusion::at_c<p>(data).mem_r.operator[](k);
}
......@@ -719,6 +720,13 @@ public:
return boost::fusion::at_c<p>(data).mem_r.operator[](k);
}
// __device__ __host__ encapc(const encapc<dim,T,Mem> & ec) = delete;
__device__ __host__ encapc(const encapc<dim,T,Mem> & ec) : data(ec.data), k(ec.k)
{
// printf("ciao\n");
}
// __device__ __host__ inline encapc<dim,T,Mem> & operator=(const encapc<dim,T,Mem> & ec) = delete; //DEBUG
/*! \brief Assignment
*
* \param ec encapsulator
......@@ -766,6 +774,23 @@ public:
return *this;
}
__device__ __host__ inline void private_set_data_k(Mem & data_c, size_t k)
{
this->data = data_c;
this->k = k;
}
__device__ __host__ inline Mem & private_get_data()
{
return data;
}
__device__ __host__ inline size_t private_get_k()
{
return k;
}
};
#include "util/common.hpp"
......
......@@ -14,8 +14,9 @@
#include "util/create_vmpl_sequence.hpp"
#include "util/cuda/cuda_launch.hpp"
#define DATA_ON_HOST 32
#define DATA_ON_DEVICE 64
constexpr int DATA_ON_HOST = 32;
constexpr int DATA_ON_DEVICE = 64;
constexpr int EXACT_RESIZE = 128;
template<bool np,typename T>
struct skip_init
......
......@@ -454,7 +454,7 @@ __global__ void fill_nn_cells(cl_sparse_type cl_sparse, vector_type starts, vect
auto sid = cl_sparse.get_sparse(cell_n);
if (sid.id != (index_type)-1)
if (sid.id != (index_type)cl_sparse.size())
{
index_type start = cl_sparse.template get<0>(sid);
// Cell exist
......
......@@ -241,7 +241,7 @@ template<unsigned int dim ,typename T> class Point
* \return the distance
*
*/
__device__ __host__ T distance(const Point<dim,T> & q)
__device__ __host__ T distance(const Point<dim,T> & q) const
{
T tot = 0.0;
......@@ -260,7 +260,7 @@ template<unsigned int dim ,typename T> class Point
* \return the square of the distance
*
*/
T distance2(const Point<dim,T> & q)
T distance2(const Point<dim,T> & q) const
{
T tot = 0.0;
......
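The const qualification added here lets distance() and distance2() be called on const Point references. A minimal usage sketch (assuming Point's usual array-initializer constructor; not part of the diff):

Point<3, float> p({0.0f, 0.0f, 0.0f});
Point<3, float> q({1.0f, 2.0f, 2.0f});
float d  = p.distance(q);    // Euclidean distance: sqrt(1 + 4 + 4) = 3
float d2 = p.distance2(q);   // squared distance: 9, avoids the sqrt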
//
// Created by tommaso on 27/06/19.
//
#ifndef OPENFPM_PDATA_BLOCKCACHE_CUH
#define OPENFPM_PDATA_BLOCKCACHE_CUH
#include "util/cuda_util.hpp"
namespace BlockCacheUtils
{
// Compile-time switch: keep the value when the flag is true, force it to zero otherwise.
// (Function templates cannot be partially specialized per flag, so the dispatch is a plain
// branch on the compile-time constant, which the compiler folds away.)
template <bool flag, typename T>
inline __device__ __host__ unsigned int SetToZeroIfFalse(T value)
{
return flag ? static_cast<unsigned int>(value) : 0u;
}
}
/**
* BlockCache is an abstraction built on the concept of loading a block into shared
* memory before using it in a stencil operation.
* The idea is to transparently address shared and global memory via coordinates,
* caching block data into shared memory while still allowing non-cached data to be addressed directly in global memory.
*/
template <typename SparseGridT, unsigned int chunksPerBlock, bool loadGhostInSharedMemory, unsigned int ... props>
struct BlockCache
{
// Interface still to be defined (work in progress)
};
#endif //OPENFPM_PDATA_BLOCKCACHE_CUH
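BlockCache itself is still a stub above, so the following is only an illustration of the pattern it is meant to abstract (a hand-written shared-memory tile with ghost cells); the kernel name laplacian1D and its launch configuration are hypothetical, not part of this MR:

template<typename T, unsigned int blockEdge>
__global__ void laplacian1D(const T * in, T * out, unsigned int n)
{
    __shared__ T cache[blockEdge + 2];                        // block data plus one ghost cell per side
    unsigned int gid = blockIdx.x * blockEdge + threadIdx.x;  // assumes blockDim.x == blockEdge
    if (gid < n)
    {
        cache[threadIdx.x + 1] = in[gid];                     // cache the block into shared memory
        if (threadIdx.x == 0)
            cache[0] = (gid > 0) ? in[gid - 1] : in[gid];     // left ghost
        if (threadIdx.x == blockEdge - 1 || gid == n - 1)
            cache[threadIdx.x + 2] = (gid < n - 1) ? in[gid + 1] : in[gid];  // right ghost
    }
    __syncthreads();
    if (gid < n)
        out[gid] = cache[threadIdx.x] - 2 * cache[threadIdx.x + 1] + cache[threadIdx.x + 2];
}

BlockCache would generalize this pattern: the same coordinate-based access whether an element lives in the shared-memory cache or only in global memory.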
......@@ -75,6 +75,13 @@ public:
template<unsigned int ... prp>
void deviceToHost();
void deviceToHost();
template<unsigned int ... prp>
void hostToDevice();
void hostToDevice();
void setGPUInsertBuffer(int nBlock, int nSlot);
void initializeGPUInsertBuffer();
......@@ -114,6 +121,12 @@ public:
{
setBit(bitMask, EXIST_BIT);
}
template<typename BitMaskT>
inline static void unsetExist(BitMaskT &bitMask)
{
unsetBit(bitMask, EXIST_BIT);
}
};
template<typename AggregateBlockT, unsigned int threadBlockSize, typename indexT, template<typename> class layout_base>
......@@ -127,6 +140,7 @@ BlockMapGpu<AggregateBlockT, threadBlockSize, indexT, layout_base>::get(unsigned
auto aggregate = blockMap.get(blockId);
auto &block = aggregate.template get<p>();
auto &mask = aggregate.template get<pMask>();
// Now check if the element actually exists
if (exist(mask[offset]))
{
return block[offset];
......@@ -159,11 +173,34 @@ void BlockMapGpu<AggregateBlockT, threadBlockSize, indexT, layout_base>::deviceT
blockMap.template deviceToHost<prp..., pMask>();
}
template<typename AggregateBlockT, unsigned int threadBlockSize, typename indexT, template<typename> class layout_base>
void BlockMapGpu<AggregateBlockT, threadBlockSize, indexT, layout_base>::deviceToHost()
{
blockMap.template deviceToHost<pMask>();
/////////////// DEBUG ////////////////////
auto indexBuffer = blockMap.getIndexBuffer();
//////////////////////////////////////////
}
template<typename AggregateBlockT, unsigned int threadBlockSize, typename indexT, template<typename> class layout_base>
template<unsigned int ... prp>
void BlockMapGpu<AggregateBlockT, threadBlockSize, indexT, layout_base>::hostToDevice()
{
blockMap.template hostToDevice<prp..., pMask>();
}
template<typename AggregateBlockT, unsigned int threadBlockSize, typename indexT, template<typename> class layout_base>
void BlockMapGpu<AggregateBlockT, threadBlockSize, indexT, layout_base>::hostToDevice()
{
blockMap.template hostToDevice<pMask>();
}
template<typename AggregateBlockT, unsigned int threadBlockSize, typename indexT, template<typename> class layout_base>
void BlockMapGpu<AggregateBlockT, threadBlockSize, indexT, layout_base>::setGPUInsertBuffer(int nBlock, int nSlot)
{
// Prealloc the insert buffer on the underlying sparse vector
blockMap.setGPUInsertBuffer(nBlock, nSlot);
initializeGPUInsertBuffer();
}
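Since setGPUInsertBuffer() now calls initializeGPUInsertBuffer() itself, the explicit initialization step disappears from the host-side flow. Condensed from the tests in this MR (types and values taken from the testInsert_halfBlock test):

typedef aggregate<DataBlock<float, 64>> AggregateT;
const unsigned int gridSize = 3, bufferPoolSize = 128, blockSizeInsert = 128;

BlockMapGpu<AggregateT, 128> blockMap;
blockMap.template setBackgroundValue<0>(666);
blockMap.setGPUInsertBuffer(gridSize, bufferPoolSize);            // pre-allocates and initializes the insert buffer
insertValues<0> <<< gridSize, blockSizeInsert >>> (blockMap.toKernel());
mgpu::ofp_context_t ctx;
blockMap.flush<smax_<0>>(ctx, flush_type::FLUSH_ON_DEVICE);
blockMap.template deviceToHost<0>();                              // copy property 0 (plus the mask) back to the host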
template<typename AggregateBlockT, unsigned int threadBlockSize, typename indexT, template<typename> class layout_base>
......@@ -172,7 +209,6 @@ void BlockMapGpu<AggregateBlockT, threadBlockSize, indexT, layout_base>::initial
//todo: Test if it's enough to just initialize masks to 0, without any background value
// Initialize the blocks to background
auto & insertBuffer = blockMap.getGPUInsertBuffer();
std::cout << "initializeGPUInsertBuffer :: insertBuffer.size() = " << insertBuffer.size() << std::endl; //debug
typedef BlockTypeOf<AggregateInternalT, pMask> BlockType; // Here assuming that all block types in the aggregate have the same size!
constexpr unsigned int chunksPerBlock = threadBlockSize / BlockType::size; // Floor is good here...
BlockMapGpuKernels::initializeInsertBuffer<pMask, chunksPerBlock> <<< insertBuffer.size()/chunksPerBlock, chunksPerBlock*BlockType::size >>>(
......@@ -183,7 +219,7 @@ template<typename AggregateBlockT, unsigned int threadBlockSize, typename indexT
template<typename ... v_reduce>
void BlockMapGpu<AggregateBlockT, threadBlockSize, indexT, layout_base>::flush(mgpu::ofp_context_t &context, flush_type opt)
{
blockMap.template flush<v_reduce ..., sBitwiseOr_<pMask>>(context, opt); // This is the one to use ideally
blockMap.template flush<v_reduce ..., sBitwiseOr_<pMask>>(context, opt);
}
template<typename AggregateBlockT, unsigned int threadBlockSize, typename indexT, template<typename> class layout_base>
......
......@@ -516,7 +516,7 @@ namespace BlockMapGpuFunctors
// First ensure we have the right sizes on the buffers
tmpData.resize(mergeIndices.size()); //todo: check if we need some other action to actually have the right space on gpu
tmpData.resize(mergeIndices.size(),EXACT_RESIZE); //todo: check if we need some other action to actually have the right space on gpu
// Phase 0 - merge data
BlockMapGpuKernels::mergeData<<< gridSize, blockSize >>>(dataOld.toKernel(),
......
......@@ -35,7 +35,7 @@ struct DataBlock
#endif // __CUDA_ARCH__
}
__device__ __host__ DataBlock operator=(const DataBlock &other)
__device__ __host__ inline DataBlock & operator=(const DataBlock &other)
{
#ifdef __CUDA_ARCH__
#ifdef __NVCC__
......@@ -47,7 +47,7 @@ struct DataBlock
return *this;
}
__device__ __host__ DataBlock operator=(ScalarT v)
__device__ __host__ inline DataBlock & operator=(ScalarT v)
{
#ifdef __CUDA_ARCH__
#ifdef __NVCC__
......
......@@ -8,18 +8,7 @@
#include <boost/mpl/size_t.hpp>
#include <cstring>
#include <Grid/grid_sm.hpp>
template<unsigned int base, unsigned int exponent>
struct IntPow
{
constexpr static size_t value = base * IntPow<base, exponent - 1>::value;
};
template<unsigned int base>
struct IntPow<base, 0>
{
constexpr static size_t value = 1;
};
#include "SparseGridGpu/TemplateUtils/mathUtils.hpp"
/**
* This class provides an interface to linearize coordinates and vice versa when blocks are involved.
......@@ -27,27 +16,40 @@ struct IntPow<base, 0>
* tuned for blocked data.
*/
template<unsigned int dim, unsigned int blockEdgeSize>
class BlockGeometry
class grid_smb
{
private:
constexpr static size_t blockSize = IntPow<blockEdgeSize, dim>::value;
size_t blockSz[dim];
size_t sz[dim];
protected:
constexpr static size_t blockSize = IntPow<blockEdgeSize, dim>::value;
public:
BlockGeometry() {}
grid_smb() {}
__host__ __device__ BlockGeometry(const size_t blockDimensions[dim])
__host__ __device__ grid_smb(const size_t (& sz)[dim])
{
for (int d=0; d<dim; ++d)
{
this->sz[d] = sz[d];
blockSz[d] = sz[d] / blockEdgeSize + ((sz[d] % blockEdgeSize) != 0);
}
}
/* __host__ __device__ grid_smb(const size_t blockDimensions[dim])
{
memcpy(blockSz, blockDimensions, dim * sizeof(size_t));
for (int d=0; d<dim; ++d)
{
sz[d] = blockDimensions[d] * blockEdgeSize;
}
}
}*/
__host__ __device__ BlockGeometry(const size_t domainBlockEdgeSize)
__host__ __device__ grid_smb(const size_t domainBlockEdgeSize)
{
for (int i = 0; i < dim; ++i)
{
......@@ -57,7 +59,7 @@ public:
}
template<typename T>
__host__ __device__ BlockGeometry(const grid_sm<dim, T> blockGrid)
__host__ __device__ grid_smb(const grid_sm<dim, T> blockGrid)
{
for (int i = 0; i < dim; ++i)
{
......@@ -68,7 +70,7 @@ public:
#ifdef __NVCC__
//Constructors from dim3 and uint3 objects
__host__ __device__ BlockGeometry(const dim3 blockDimensions)
__host__ __device__ grid_smb(const dim3 blockDimensions)
{
unsigned int i = 0;
assert(dim <= 3);
......@@ -88,36 +90,22 @@ public:
}
}
/* __host__ __device__ BlockGeometry(const uint3 blockDimensions)
{
assert(dim <= 3);
blockSz[0] = blockDimensions.x;
sz[i] = blockSz[0] * blockEdgeSize;
if (dim > 1)
{
blockSz[1] = blockDimensions.y;
if (dim > 2)
{
blockSz[2] = blockDimensions.z;
}
}
}*/
#endif // __NVCC__
__host__ __device__ BlockGeometry(const BlockGeometry<dim, blockEdgeSize> &other)
__host__ __device__ grid_smb(const grid_smb<dim, blockEdgeSize> &other)
{
memcpy(blockSz, other.blockSz, dim * sizeof(size_t));
memcpy(sz, other.sz, dim * sizeof(size_t));
}
__host__ __device__ BlockGeometry &operator=(const BlockGeometry<dim, blockEdgeSize> &other)
__host__ __device__ grid_smb &operator=(const grid_smb<dim, blockEdgeSize> &other)
{
if (&other != this)
{
memcpy(blockSz, other.blockSz, dim * sizeof(size_t));
memcpy(sz, other.sz, dim * sizeof(size_t));
}
for (size_t i = 0 ; i < dim ; i++)
{
blockSz[i] = other.blockSz[i];
sz[i] = other.sz[i];
}
return *this;
}
......@@ -129,6 +117,7 @@ public:
template<typename indexT>
inline __host__ __device__ mem_id LinId(const grid_key_dx<dim, indexT> coord) const
{
//todo: Check (in debug mode only) that the coordinates passed here are valid and not overflowing dimensions (???)
mem_id blockLinId = coord.get(dim - 1) / blockEdgeSize;
mem_id localLinId = coord.get(dim - 1) % blockEdgeSize;
for (int d = dim - 2; d >= 0; --d)
......@@ -141,16 +130,16 @@ public:
return blockLinId * blockSize + localLinId;
}
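A worked example of the composed linearization, matching the grid_smb<2, 8> geometry used in the unit tests further down (blockSize = 8*8 = 64):

#include "SparseGridGpu/Geometry/grid_smb.hpp"

const size_t sz[2] = {3 * 8, 7 * 8};
grid_smb<2, 8> g(sz);
grid_key_dx<2, int> k({8 + 7, 0});      // block (1,0), local offset (7,0)
// blockLinId = 1, localLinId = 7  ->  LinId = 1 * 64 + 7
assert(g.LinId(k) == 64 + 7);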
inline __host__ __device__ grid_key_dx<dim> InvLinId(const mem_id linId) const
inline __host__ __device__ grid_key_dx<dim, int> InvLinId(const mem_id linId) const
{
mem_id blockLinId = linId / blockSize;
mem_id localLinId = linId % blockSize;
return InvLinId(blockLinId, localLinId);
}
inline __host__ __device__ grid_key_dx<dim> InvLinId(mem_id blockLinId, mem_id localLinId) const
inline __host__ __device__ grid_key_dx<dim, int> InvLinId(mem_id blockLinId, mem_id localLinId) const
{
grid_key_dx<dim> coord;
grid_key_dx<dim, int> coord;
for (int d = 0; d < dim; ++d)
{
auto c = blockLinId % blockSz[d];
......@@ -168,17 +157,26 @@ public:
inline __host__ __device__ mem_id BlockLinId(const grid_key_dx<dim, indexT> blockCoord) const
{
mem_id blockLinId = blockCoord.get(dim - 1);
if (blockLinId >= blockSz[dim-1])
{
return -1;
}
for (int d = dim - 2; d >= 0; --d)
{
blockLinId *= blockSz[d];
blockLinId += blockCoord.get(d);
mem_id cur = blockCoord.get(d);
if (cur >= blockSz[d])
{
return -1;
}
blockLinId += cur;
}
return blockLinId;
}
inline __host__ __device__ grid_key_dx<dim> BlockInvLinId(mem_id blockLinId) const
inline __host__ __device__ grid_key_dx<dim, int> BlockInvLinId(mem_id blockLinId) const
{
grid_key_dx<dim> blockCoord;
grid_key_dx<dim, int> blockCoord;
for (int d = 0; d < dim; ++d)
{
auto c = blockLinId % blockSz[d];
......
/*
* grid_zmb.hpp
*
* Created on: Aug 1, 2019
* Author: i-bird
*/
#ifndef GRID_ZMB_HPP_
#define GRID_ZMB_HPP_
#include <boost/mpl/size_t.hpp>
#include <cstring>
#include <Grid/grid_sm.hpp>
#include "SparseGridGpu/TemplateUtils/mathUtils.hpp"
#include "util/zmorton.hpp"
/**
* This class provides an interface to linearize coordinates and vice versa when blocks are involved.
* It can be seen as a lightweight version of grid_sm, with just the LinId and InvLinId methods, but
* tuned for blocked data and using a Z-order (Morton) linearization of the block grid.
*/
template<unsigned int dim, unsigned int blockEdgeSize>
class grid_zmb : public grid_smb<dim,blockEdgeSize>
{
public:
grid_zmb() {}
__host__ __device__ grid_zmb(const size_t (& sz)[dim])
:grid_smb<dim,blockEdgeSize>(sz)
{}
__host__ __device__ grid_zmb(const size_t domainBlockEdgeSize)
:grid_smb<dim,blockEdgeSize>(domainBlockEdgeSize)
{}
template<typename T>
__host__ __device__ grid_zmb(const grid_sm<dim, T> blockGrid)
:grid_smb<dim,blockEdgeSize>(blockGrid)
{}
#ifdef __NVCC__
//Constructors from dim3 and uint3 objects
__host__ __device__ grid_zmb(const dim3 blockDimensions)
:grid_smb<dim,blockEdgeSize>(blockDimensions)
{}
#endif // __NVCC__
__host__ __device__ grid_zmb(const grid_zmb<dim, blockEdgeSize> &other)
:grid_smb<dim,blockEdgeSize>(other)
{}
__host__ __device__ grid_zmb &operator=(const grid_zmb<dim, blockEdgeSize> &other)
{
((grid_smb<dim,blockEdgeSize> *)this)->operator=(other);
return *this;
}
template<typename indexT>
inline __host__ __device__ mem_id LinId(const grid_key_dx<dim, indexT> coord) const
{
//todo: Check (in debug mode only) that the coordinates passed here are valid and not overflowing dimensions (???)
grid_key_dx<dim> key_b;
int localLinId = 0;
int sr = 1;
for (int d = 0 ; d < dim; d++)
{
key_b.set_d(d,coord.get(d) / blockEdgeSize);
localLinId += coord.get(d) % blockEdgeSize * sr;
sr *= blockEdgeSize;
}
return lin_zid(key_b) * this->blockSize + localLinId;
}
inline __host__ __device__ grid_key_dx<dim, int> InvLinId(const mem_id linId) const
{
mem_id linIdB = linId / this->blockSize;
int localLinId = linId % this->blockSize;
return InvLinId(linIdB,localLinId);
}
inline __host__ __device__ grid_key_dx<dim, int> InvLinId(mem_id blockLinId, mem_id localLinId) const
{
grid_key_dx<dim,int> k;
invlin_zid(blockLinId,k);
for (size_t i = 0 ; i < dim ; i++)
{
k.set_d(i,k.get(i)*blockEdgeSize + localLinId % blockEdgeSize);
localLinId /= blockEdgeSize;
}
return k;
}
// Now methods to handle blockGrid coordinates (e.g. to load neighbouring blocks)
template<typename indexT>
inline __host__ __device__ mem_id BlockLinId(const grid_key_dx<dim, indexT> blockCoord) const
{
return lin_zid(blockCoord);
}
inline __host__ __device__ grid_key_dx<dim, int> BlockInvLinId(mem_id blockLinId) const
{
grid_key_dx<dim,int> k;
invlin_zid(blockLinId,k);
return k;
}
};
#endif /* GRID_ZMB_HPP_ */
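Both geometries compose the final index as blockLinId * blockSize + localLinId; they differ only in how the block grid itself is linearized (row-major in grid_smb, Z-order in grid_zmb). A small sketch of that difference, with values computed from the code above rather than taken from the MR's tests:

const size_t sz[2] = {4 * 8, 4 * 8};            // a 4x4 grid of 8x8 blocks
grid_smb<2, 8> row_major(sz);
grid_zmb<2, 8> z_order(sz);
grid_key_dx<2, int> b({2, 2});
assert(row_major.BlockLinId(b) == 2 * 4 + 2);   // 10: row-major ordering of the block grid
assert(z_order.BlockLinId(b) == 12);            // Morton interleave of (2,2) = 0b1100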
......@@ -5,71 +5,72 @@
#define BOOST_TEST_DYN_LINK
#include <boost/test/unit_test.hpp>
#include "../BlockGeometry.hpp"
#include "SparseGridGpu/Geometry/grid_smb.hpp"
template <unsigned int dim, typename BGT>
void testStandardLinearizations(BGT geometry)
{
grid_key_dx<dim> origin({0,0});
grid_key_dx<dim, int> origin({0,0});
BOOST_REQUIRE_EQUAL(geometry.LinId(origin), 0);
grid_key_dx<dim> block0a({7,0});
grid_key_dx<dim, int> block0a({7,0});
BOOST_REQUIRE_EQUAL(geometry.LinId(block0a), 7);
grid_key_dx<dim> block0b({0,1});
grid_key_dx<dim, int> block0b({0,1});
BOOST_REQUIRE_EQUAL(geometry.LinId(block0b), 8);
grid_key_dx<dim> block0c({7,7});
grid_key_dx<dim, int> block0c({7,7});
BOOST_REQUIRE_EQUAL(geometry.LinId(block0c), 63);
grid_key_dx<dim> block1a({8+7,0});
grid_key_dx<dim, int> block1a({8+7,0});
BOOST_REQUIRE_EQUAL(geometry.LinId(block1a), 64+7);
grid_key_dx<dim> block1b({8+0,1});
grid_key_dx<dim, int> block1b({8+0,1});
BOOST_REQUIRE_EQUAL(geometry.LinId(block1b), 64+8);
grid_key_dx<dim> block1c({8+7,7});
grid_key_dx<dim, int> block1c({8+7,7});
BOOST_REQUIRE_EQUAL(geometry.LinId(block1c), 64+63);
grid_key_dx<dim> block3a({7,8+0});
grid_key_dx<dim, int> block3a({7,8+0});
BOOST_REQUIRE_EQUAL(geometry.LinId(block3a), (64*3)+7);
grid_key_dx<dim> block3b({0,8+1});
grid_key_dx<dim, int> block3b({0,8+1});
BOOST_REQUIRE_EQUAL(geometry.LinId(block3b), (64*3)+8);
grid_key_dx<dim> block3c({7,8+7});
grid_key_dx<dim, int> block3c({7,8+7});
BOOST_REQUIRE_EQUAL(geometry.LinId(block3c), (64*3)+63);
}
BOOST_AUTO_TEST_SUITE(BlockGeometry_tests)
BOOST_AUTO_TEST_CASE(testLinId)
{
constexpr unsigned int dim = 2;
const size_t blockGridDim[dim] = {3,7};
BlockGeometry<dim, 8> geometry(blockGridDim);
testStandardLinearizations<dim>(geometry);
}
BOOST_AUTO_TEST_CASE(testCopyConstructor)
{
constexpr unsigned int dim = 2;
const size_t blockGridDim[dim] = {3,7};
BlockGeometry<dim, 8> geometry0(blockGridDim);
// Here copy-construct
BlockGeometry<dim, 8> geometry(geometry0);
// Then test...
testStandardLinearizations<dim>(geometry);
}
BOOST_AUTO_TEST_CASE(testCopyAssignOp)
{
constexpr unsigned int dim = 2;
const size_t blockGridDim[dim] = {3,7};
BlockGeometry<dim, 8> geometry0(blockGridDim);
// Here copy-assign
const size_t blockGridDimOther[dim] = {3,7};
BlockGeometry<dim, 8> geometry(blockGridDimOther);
geometry = geometry0;
// Then test...
testStandardLinearizations<dim>(geometry);
}
BOOST_AUTO_TEST_CASE(testLinId)
{
constexpr unsigned int dim = 2;
const size_t sz[dim] = {3*8,7*8};
grid_smb<dim, 8> geometry(sz);
testStandardLinearizations<dim>(geometry);
}
BOOST_AUTO_TEST_CASE(testCopyConstructor)
{
constexpr unsigned int dim = 2;
const size_t sz[dim] = {3*8,7*8};
grid_smb<dim, 8> geometry0(sz);
// Here copy-construct
grid_smb<dim, 8> geometry(geometry0);
// Then test...
testStandardLinearizations<dim>(geometry);
}
BOOST_AUTO_TEST_CASE(testCopyAssignOp)
{
constexpr unsigned int dim = 2;
const size_t sz[dim] = {3*8,7*8};
grid_smb<dim, 8> geometry0(sz);
// Here copy-assign
grid_smb<dim, 8> geometry(sz);
geometry = geometry0;
// Then test...
testStandardLinearizations<dim>(geometry);
}
BOOST_AUTO_TEST_SUITE_END()
/*
* memcpy_shmem.hpp
*
* Created on: Aug 3, 2019
* Author: i-bird
*/
#ifndef ENCAP_SHMEM_HPP_
#define ENCAP_SHMEM_HPP_
/*! \brief memcpy that splits the copy across the threads of a block
*
* It assumes that the memory to copy is smaller than blockDim.x * 4 bytes
*
*/
template<unsigned int sz>
struct encap_shmem
{
static const int size = (sz / 4)*4 + (sz % 4 != 0)*4;
static const int nthr = (sz / 4) + (sz % 4 != 0);
static __device__ void copy(int * src, int * dst)
{
if (threadIdx.x < nthr)
{dst[threadIdx.x] = src[threadIdx.x];}
}
};
#endif /* ENCAP_SHMEM_HPP_ */
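A hypothetical usage sketch (the payload type and kernel are illustrative, not from the diff): all threads of the block call copy(), but only the first nthr of them actually move a 4-byte word each.

struct payload { float a; int b; };                       // 8 bytes -> size = 8, nthr = 2

__global__ void copy_one(payload * dst, payload * src)
{
    encap_shmem<sizeof(payload)>::copy((int *)src, (int *)dst);
}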
//
// Created by tommaso on 28/06/19.
//
#ifndef OPENFPM_PDATA_MATHUTILS_HPP
#define OPENFPM_PDATA_MATHUTILS_HPP
#include <cstdlib>
template<unsigned int base, unsigned int exponent>
struct IntPow
{
constexpr static size_t value = base * IntPow<base, exponent - 1>::value;
};
template<unsigned int base>
struct IntPow<base, 0>
{
constexpr static size_t value = 1;
};
#endif //OPENFPM_PDATA_MATHUTILS_HPP
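IntPow is the compile-time power behind blockSize = IntPow<blockEdgeSize, dim>::value in grid_smb; for the 8-wide blocks used throughout this MR:

#include "SparseGridGpu/TemplateUtils/mathUtils.hpp"

static_assert(IntPow<8, 2>::value == 64,  "2D chunk of 8x8, the DataBlock<float, 64> case in the tests");
static_assert(IntPow<8, 3>::value == 512, "3D chunk of 8x8x8");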
......@@ -68,6 +68,24 @@ __global__ void insertValuesBlocked(SparseGridType sparseGrid)
sparseGrid.flush_block_insert();
}
template<unsigned int p, typename SparseGridType>
__global__ void insertValuesHalfBlock(SparseGridType sparseGrid)
{
sparseGrid.init();
int pos = blockIdx.x * blockDim.x + threadIdx.x;
constexpr unsigned int dataChunkSize = BlockTypeOf<typename SparseGridType::AggregateType, p>::size;
if (threadIdx.x % dataChunkSize < dataChunkSize/ 2)
{
sparseGrid.template insert<p>(pos) = pos;
}
__syncthreads();
sparseGrid.flush_block_insert();
}
BOOST_AUTO_TEST_SUITE(BlockMapGpu_tests)
BOOST_AUTO_TEST_CASE(testBitwiseOps)
......@@ -130,9 +148,6 @@ BOOST_AUTO_TEST_SUITE(BlockMapGpu_tests)
// Prealloc insert buffer
blockMap.setGPUInsertBuffer(gridSize, bufferPoolSize);
// Initialize the insert buffer
blockMap.initializeGPUInsertBuffer();
// Insert values
insertValues<0> <<< gridSize, blockSizeInsert >>> (blockMap.toKernel());
......@@ -167,6 +182,62 @@ BOOST_AUTO_TEST_SUITE(BlockMapGpu_tests)
BOOST_REQUIRE_EQUAL(match, true);
}
BOOST_AUTO_TEST_CASE(testInsert_halfBlock) //todo
{
typedef aggregate<DataBlock<float, 64>> AggregateT;
typedef aggregate<float> AggregateOutT;
BlockMapGpu<AggregateT, 128> blockMap;
blockMap.template setBackgroundValue<0>(666);
const unsigned int gridSize = 3;
const unsigned int bufferPoolSize = 128; // Should be multiple of BlockT::size
const unsigned int blockSizeInsert = 128;
const unsigned int gridSizeRead = gridSize + 1;
const unsigned int blockSizeRead = 128;
// Prealloc insert buffer
blockMap.setGPUInsertBuffer(gridSize, bufferPoolSize);
// Insert values
insertValuesHalfBlock<0> <<< gridSize, blockSizeInsert >>> (blockMap.toKernel());
// Flush inserts
mgpu::ofp_context_t ctx;
blockMap.flush<smax_<0>>(ctx, flush_type::FLUSH_ON_DEVICE);
// Get output
openfpm::vector_gpu<AggregateOutT> output;
output.resize(gridSizeRead * blockSizeRead);
copyBlocksToOutput<0> <<< gridSizeRead, blockSizeRead >>> (blockMap.toKernel(), output.toKernel());
output.template deviceToHost<0>();
blockMap.template deviceToHost<0>();
// Compare
bool match = true;
for (size_t i = 0; i < output.size(); i++)
{
auto expectedValue = (i < gridSize * blockSizeInsert) ? i : 666;
constexpr unsigned int dataChunkSize = BlockTypeOf<AggregateT, 0>::size;
int offset = i % dataChunkSize;
if (! (offset < dataChunkSize / 2))
{
expectedValue = 666; // Just the first half of each block was inserted
}
std::cout << "blockMap(" << i << ") = " << blockMap.template get<0>(i)
<< " == "
<< expectedValue
<< " == "
<< output.template get<0>(i) << " = output(" << i << ")"
<< std::endl;
match &= output.template get<0>(i) == blockMap.template get<0>(i);
match &= output.template get<0>(i) == expectedValue;
}
BOOST_REQUIRE_EQUAL(match, true);
}
BOOST_AUTO_TEST_CASE(testInsert_blocked)
{
typedef aggregate<DataBlock<float, 64>> AggregateT;
......@@ -191,9 +262,6 @@ BOOST_AUTO_TEST_SUITE(BlockMapGpu_tests)
// Prealloc insert buffer
sparseGrid.setGPUInsertBuffer(gridSize, bufferPoolSize);
// Initialize the insert buffer
sparseGrid.initializeGPUInsertBuffer();
// Insert values
insertValuesBlocked<0, 2> <<< gridSize, blockSizeInsert >>> (sparseGrid.toKernel());
......
......@@ -179,7 +179,7 @@ namespace openfpm
* \return the element (encapsulated)
*
*/
inline __device__ auto get(unsigned int id) -> decltype(base.get_o(grid_key_dx<1>(id)))
__device__ inline auto get(unsigned int id) -> decltype(base.get_o(grid_key_dx<1>(id)))
{
#ifdef SE_CLASS1
if (check_bound(id) == false)
......
......@@ -61,7 +61,7 @@ namespace openfpm
// the const is forced by the getter, which only returns a const encap and so should not allow modification of bck;
// this should make it possible to avoid defining a const_encap object
mutable vector_gpu_ker<T,layout_base> vct_data_bck;
//mutable vector_gpu_ker<T,layout_base> vct_data_bck;
int nslot_add;
int nslot_rem;
......@@ -72,12 +72,12 @@ namespace openfpm
*
* \param i element i
*/
inline __device__ Ti _branchfree_search(Ti x, Ti & id) const
inline __device__ void _branchfree_search(Ti x, Ti & id) const
{
if (vct_index.size() == 0) {return (Ti)-1;}
if (vct_index.size() == 0) {id = 0; return;}
const Ti *base = &vct_index.template get<0>(0);
const Ti * end = &vct_index.template get<0>(vct_index.size()-1);
Ti n = vct_data.size();
const Ti *end = (const Ti *)vct_index.template getPointer<0>() + vct_index.size();
Ti n = vct_data.size()-1;
while (n > 1)
{
Ti half = n / 2;
......@@ -87,8 +87,8 @@ namespace openfpm
int off = (*base < x);
id = base - &vct_index.template get<0>(0) + off;
off = off && base != end;
return *(base + off);
Ti v = (base + off != end)?*(base + off):(Ti)-1;
id = (x == v)?id:vct_data.size()-1;
}
public:
......@@ -105,13 +105,12 @@ namespace openfpm
vector_gpu_ker<T,layout_base> vct_add_data,
vector_gpu_ker<aggregate<Ti>,layout_base> vct_nadd_index,
vector_gpu_ker<aggregate<Ti>,layout_base> vct_nrem_index,
vector_gpu_ker<T,layout_base> vct_data_bck,
int nslot_add,
int nslot_rem)
:vct_index(vct_index),vct_data(vct_data),
vct_add_index(vct_add_index),vct_rem_index(vct_rem_index),vct_add_data(vct_add_data),
vct_nadd_index(vct_nadd_index),vct_nrem_index(vct_nrem_index),
vct_data_bck(vct_data_bck),nslot_add(nslot_add),nslot_rem(nslot_rem)
nslot_add(nslot_add),nslot_rem(nslot_rem)
{}
/*! \brief Get the number of elements
......@@ -186,9 +185,8 @@ namespace openfpm
__device__ inline openfpm::sparse_index<Ti> get_sparse(Ti id) const
{
Ti di;
Ti v = this->_branchfree_search(id,di);
openfpm::sparse_index<Ti> sid((v == id)?di:(Ti)-1);
this->_branchfree_search(id,di);
openfpm::sparse_index<Ti> sid(di);
return sid;
}
......@@ -196,9 +194,9 @@ namespace openfpm
/*! \brief Get the background value
*/
template <unsigned int p>
__device__ inline auto getBackground() -> decltype(vct_data_bck.template get<p>(0))
__device__ inline auto getBackground() const -> decltype(vct_data.template get<p>(0)) &
{
return vct_data_bck.template get<p>(0);
return vct_data.template get<p>(vct_data.size()-1);
}
/*! \brief Get an element of the vector
......@@ -215,20 +213,15 @@ namespace openfpm
__device__ inline auto get(Ti id) const -> decltype(vct_data.template get<p>(id))
{
Ti di;
Ti v = this->_branchfree_search(id,di);
return (v == id)?vct_data.template get<p>(di):vct_data_bck.template get<p>(0);
this->_branchfree_search(id,di);
return vct_data.template get<p>(di);
}
__device__ inline auto get(Ti id) const -> decltype(vct_data.get(0))
{
Ti di;
Ti v = this->_branchfree_search(id,di);
auto ec = vct_data.get_unsafe(static_cast<size_t>(di));
if (v != id)
{
ec = vct_data_bck.get(0);
}
return ec;
return vct_data.get(static_cast<size_t>(di));
}
/*! \brief Get an element of the vector
......@@ -287,9 +280,8 @@ namespace openfpm
template <unsigned int p>
__device__ inline auto get(Ti id, Ti & di) const -> decltype(vct_data.template get<p>(id))
{
Ti v = this->_branchfree_search(id,di);
di = (v != id)?-1:di;
return (v == id)?vct_data.template get<p>(di):vct_data_bck.template get<p>(0);
this->_branchfree_search(id,di);
return vct_data.template get<p>(di);
}
/*! \brief Get an element of the vector
......@@ -305,7 +297,7 @@ namespace openfpm
template <unsigned int p>
__device__ inline auto get_ele(Ti di) const -> decltype(vct_data.template get<p>(di))
{
return (di != -1)?vct_data.template get<p>(di):vct_data_bck.template get<p>(0);
return vct_data.template get<p>(di);
}
/*! \brief It insert an element in the sparse vector
......@@ -568,14 +560,6 @@ namespace openfpm
return pc;
}
pc = vct_data_bck.check_device_pointer(ptr);
if (pc.match == true)
{
pc.match_str = std::string("Background data vector overflow: ") + "\n" + pc.match_str;
return pc;
}
return pc;
}
......
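With vct_data_bck removed, the background value is assumed to occupy the last slot of vct_data and _branchfree_search now maps a missing key to that slot (id = vct_data.size() - 1), so device-side reads no longer branch on found/not-found. A hedged sketch of the resulting access pattern (kernel name and output buffer are illustrative):

template<typename vector_sparse_ker>
__global__ void read_with_background(vector_sparse_ker vs, float * out, int n)
{
    int key = blockIdx.x * blockDim.x + threadIdx.x;
    if (key >= n) return;
    // existing keys return their stored value; missing keys fall through to the
    // background element kept in the last slot of the data vector
    out[key] = vs.template get<0>(key);
}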
......@@ -249,6 +249,9 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu )
match &= output.template get<0>(i) == 17;
match &= output.template get<1>(i) == 18;
match &= output.template get<2>(i) == 19;
if (match == false){break;}
}
BOOST_REQUIRE_EQUAL(match,true);
......@@ -720,8 +723,6 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu_remove_incremental )
vs.template deviceToHost<0,1,2>();
BOOST_REQUIRE_EQUAL(vs.template get<0>(7022),14934);
BOOST_REQUIRE_EQUAL(vs.template get<0>(7020),14940);
BOOST_REQUIRE_EQUAL(vs.template get<0>(7020),14940);
BOOST_REQUIRE_EQUAL(vs.template get<0>(7018),14946);
BOOST_REQUIRE_EQUAL(vs.template get<0>(7016),14952);
......@@ -733,8 +734,6 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu_remove_incremental )
BOOST_REQUIRE_EQUAL(vs.template get<0>(7004),14988);
BOOST_REQUIRE_EQUAL(vs.template get<0>(7002),14994);
BOOST_REQUIRE_EQUAL(vs.template get<1>(7022),44934);
BOOST_REQUIRE_EQUAL(vs.template get<1>(7020),44940);
BOOST_REQUIRE_EQUAL(vs.template get<1>(7020),44940);
BOOST_REQUIRE_EQUAL(vs.template get<1>(7018),44946);
BOOST_REQUIRE_EQUAL(vs.template get<1>(7016),44952);
......@@ -746,8 +745,6 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu_remove_incremental )
BOOST_REQUIRE_EQUAL(vs.template get<1>(7004),44988);
BOOST_REQUIRE_EQUAL(vs.template get<1>(7002),44994);
BOOST_REQUIRE_EQUAL(vs.template get<2>(7022),74934);
BOOST_REQUIRE_EQUAL(vs.template get<2>(7020),74940);
BOOST_REQUIRE_EQUAL(vs.template get<2>(7020),74940);
BOOST_REQUIRE_EQUAL(vs.template get<2>(7018),74946);
BOOST_REQUIRE_EQUAL(vs.template get<2>(7016),74952);
......
......@@ -16,6 +16,68 @@
#endif
template<typename type_t>
struct rightOperand_t : public std::binary_function<type_t, type_t, type_t> {
__device__ __host__ type_t operator()(type_t a, type_t b) const {
return b;
}
};
template<unsigned int prp>
struct sRight_
{
typedef boost::mpl::int_<prp> prop;
template<typename red_type> using op_red = rightOperand_t<red_type>;
template<typename red_type>
__device__ __host__ static red_type red(red_type & r1, red_type & r2)
{
return r2;
}
static bool is_special()
{
return false;
}
//! not a special reduction, so no segment handling is needed here
template<typename seg_type, typename output_type>
__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
{}
};
template<typename type_t>
struct leftOperand_t : public std::binary_function<type_t, type_t, type_t> {
__device__ __host__ type_t operator()(type_t a, type_t b) const {
return a;
}
};
template<unsigned int prp>
struct sLeft_
{
typedef boost::mpl::int_<prp> prop;
template<typename red_type> using op_red = leftOperand_t<red_type>;
template<typename red_type>
__device__ __host__ static red_type red(red_type & r1, red_type & r2)
{
return r1;
}
static bool is_special()
{
return false;
}
//! not a special reduction, so no segment handling is needed here
template<typename seg_type, typename output_type>
__device__ __host__ static void set(seg_type seg_next, seg_type seg_prev, output_type & output, int i)
{}
};
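sRight_ and sLeft_ are "keep the newest" / "keep the oldest" segmented reductions for the flush step. A hedged usage sketch, mirroring how smax_<0> is used in the tests of this MR (blockMap as in those tests):

mgpu::ofp_context_t ctx;
blockMap.flush<sRight_<0>>(ctx, flush_type::FLUSH_ON_DEVICE);   // duplicate keys resolved by taking the right (newer) operand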
template<unsigned int prp>
struct sadd_
{
......@@ -45,7 +107,7 @@ struct sadd_
template<typename type_t, unsigned int blockLength>
struct plus_block_t : public std::binary_function<type_t, type_t, type_t> {
MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
__device__ __host__ type_t operator()(type_t a, type_t b) const {
type_t res;
for (int i=0; i<blockLength; ++i)
{
......@@ -430,6 +492,7 @@ __global__ void construct_remove_list(vector_index_type vit_block_data,
}
}
template<typename e_type, typename v_reduce>
struct data_merger
{
......
......@@ -380,7 +380,7 @@ namespace openfpm
size_t gr = slot;
// If you increase by one we smartly resize the internal capacity more than 1
// This is to make faster patterns like resize(size()+1)
if (slot - base.size() == 1)
if (slot - base.size() == 1 && opt && (opt & EXACT_RESIZE) == 0)
{
gr = grow_p::grow(base.size(),slot);
}
......
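The new EXACT_RESIZE option (added earlier as constexpr int EXACT_RESIZE = 128) is what the tmpData.resize(mergeIndices.size(), EXACT_RESIZE) call in the flush path relies on. A sketch of the two behaviours, assuming the usual openfpm vector types and the defaulted opt argument:

openfpm::vector_gpu<aggregate<float>> v;
v.resize(v.size() + 1);                  // may over-allocate capacity to amortize repeated size()+1 growth
v.resize(v.size() + 1, EXACT_RESIZE);    // requests exactly the asked-for size, no smart growth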
......@@ -40,6 +40,31 @@
}\
}
#define CUDA_LAUNCH_DIM3(cuda_call,wthr,thr, ...) \
{\
cudaDeviceSynchronize(); \
{\
cudaError_t e = cudaGetLastError();\
if (e != cudaSuccess)\
{\
std::string error = cudaGetErrorString(e);\
std::cout << "Cuda an error has occurred before this CUDA_LAUNCH, detected in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
}\
}\
CHECK_SE_CLASS1_PRE\
cuda_call<<<wthr,thr>>>(__VA_ARGS__); \
cudaDeviceSynchronize(); \
{\
cudaError_t e = cudaGetLastError();\
if (e != cudaSuccess)\
{\
std::string error = cudaGetErrorString(e);\
std::cout << "Cuda Error in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
}\
CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
}\
}
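CUDA_LAUNCH_DIM3 mirrors CUDA_LAUNCH but takes an explicit grid/block pair instead of an iterator object, wrapping the launch in the same synchronize-and-report error checks. A hedged usage sketch (kernel and arguments borrowed from the tests elsewhere in this MR):

dim3 wthr(gridSize);
dim3 thr(blockSizeInsert);
CUDA_LAUNCH_DIM3(insertValues<0>, wthr, thr, blockMap.toKernel());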
#define CUDA_CHECK() \
{\
cudaDeviceSynchronize(); \
......@@ -69,6 +94,8 @@
#define CUDA_LAUNCH(cuda_call,ite, ...) \
cuda_call<<<ite.wthr,ite.thr>>>(__VA_ARGS__);
#define CUDA_LAUNCH_DIM3(cuda_call,wthr,thr, ...) \
cuda_call<<<wthr,thr>>>(__VA_ARGS__);
#define CUDA_CHECK()
......
/*
* zmorton.hpp
*
* Created on: Jul 31, 2019
* Author: i-bird
*/
#ifndef ZMORTON_HPP_
#define ZMORTON_HPP_
#include "Grid/grid_key.hpp"
template<typename T>
inline __device__ __host__ size_t lin_zid(const grid_key_dx<1,T> & key)
{
return key.get(0);
}
template<typename T>
inline __device__ __host__ void invlin_zid(size_t lin, grid_key_dx<1,T> & key)
{
key.set_d(0,lin);
}
template<typename T>
inline __device__ __host__ size_t lin_zid(const grid_key_dx<2,T> & key)
{
size_t x = key.get(0);
size_t y = key.get(1);
x = (x | (x << 16)) & 0x0000FFFF0000FFFF;
x = (x | (x << 8)) & 0x00FF00FF00FF00FF;
x = (x | (x << 4)) & 0x0F0F0F0F0F0F0F0F;
x = (x | (x << 2)) & 0x3333333333333333;
x = (x | (x << 1)) & 0x5555555555555555;
y = (y | (y << 16)) & 0x0000FFFF0000FFFF;
y = (y | (y << 8)) & 0x00FF00FF00FF00FF;
y = (y | (y << 4)) & 0x0F0F0F0F0F0F0F0F;
y = (y | (y << 2)) & 0x3333333333333333;
y = (y | (y << 1)) & 0x5555555555555555;
return x | (y << 1);
}
template<typename T>
inline __device__ __host__ void invlin_zid(size_t lin, grid_key_dx<2,T> & key)
{
size_t x = lin & 0x5555555555555555;
size_t y = (lin & 0xAAAAAAAAAAAAAAAA) >> 1;
x = (x | (x >> 1)) & 0x3333333333333333;
x = (x | (x >> 2)) & 0x0F0F0F0F0F0F0F0F;
x = (x | (x >> 4)) & 0x00FF00FF00FF00FF;
x = (x | (x >> 8)) & 0x0000FFFF0000FFFF;
x = (x | (x >> 16)) & 0x00000000FFFFFFFF;
y = (y | (y >> 1)) & 0x3333333333333333;
y = (y | (y >> 2)) & 0x0F0F0F0F0F0F0F0F;
y = (y | (y >> 4)) & 0x00FF00FF00FF00FF;
y = (y | (y >> 8)) & 0x0000FFFF0000FFFF;
y = (y | (y >> 16)) & 0x00000000FFFFFFFF;
key.set_d(0,x);
key.set_d(1,y);
}
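A small sanity check for the 2D interleave (not part of the diff): for (x, y) = (2, 3) the bits interleave as y1 x1 y0 x0 = 0b1110, so the code is 14 and decoding recovers the pair.

#include "util/zmorton.hpp"

grid_key_dx<2, size_t> k;
k.set_d(0, 2);                           // x = 0b10
k.set_d(1, 3);                           // y = 0b11
size_t lin = lin_zid(k);
assert(lin == 14);
grid_key_dx<2, size_t> back;
invlin_zid(lin, back);
assert(back.get(0) == 2 && back.get(1) == 3);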
static const size_t B3[] = {0x9249249249249249, 0x30C30C30C30C30C3, 0xF00F00F00F00F00F, 0x0FFF000FFF000FFF,0xFFFF0000FFFFFFFF};
static const size_t S3[] = {2, 4, 8, 16, 32};
template<typename T>
inline __device__ __host__ size_t lin_zid(const grid_key_dx<3,T> & key)
{
size_t x = key.get(0);
size_t z = key.get(2);
size_t y = key.get(1);
x = (x | (x << S3[4])) & B3[4];
x = (x | (x << S3[3])) & B3[3];
x = (x | (x << S3[2])) & B3[2];
x = (x | (x << S3[1])) & B3[1];
x = (x | (x << S3[0])) & B3[0];
y = (y | (y << S3[4])) & B3[4];
y = (y | (y << S3[3])) & B3[3];
y = (y | (y << S3[2])) & B3[2];
y = (y | (y << S3[1])) & B3[1];
y = (y | (y << S3[0])) & B3[0];
z = (z | (z << S3[4])) & B3[4];
z = (z | (z << S3[3])) & B3[3];
z = (z | (z << S3[2])) & B3[2];
z = (z | (z << S3[1])) & B3[1];
z = (z | (z << S3[0])) & B3[0];
return x | (y << 1) | (z << 2);
}
template<typename T>
inline __device__ __host__ void invlin_zid(size_t lin, grid_key_dx<3,T> & key)
{
size_t x = lin & 0x9249249249249249;
size_t y = (lin >> 1) & 0x9249249249249249;
size_t z = (lin >> 2) & 0x9249249249249249;
x = (x | (x >> 2)) & 0x30C30C30C30C30C3;
x = (x | (x >> 4)) & 0xF00F00F00F00F00F;
x = (x | (x >> 8)) & 0x00FF0000FF0000FF;
x = (x | (x >> 16)) & 0x00000FF0000FFFF;
x = (x | x >> 16) & 0xFFFFFF;
y = (y | (y >> 2)) & 0x30C30C30C30C30C3;
y = (y | (y >> 4)) & 0xF00F00F00F00F00F;
y = (y | (y >> 8)) & 0x00FF0000FF0000FF;
y = (y | (y >> 16)) & 0x00000FF0000FFFF;
y = (y | y >> 16) & 0xFFFFFF;
z = (z | (z >> 2)) & 0x30C30C30C30C30C3;
z = (z | (z >> 4)) & 0xF00F00F00F00F00F;
z = (z | (z >> 8)) & 0x00FF0000FF0000FF;
z = (z | (z >> 16)) & 0x00000FF0000FFFF;
z = (z | z >> 16) & 0xFFFFFF;
key.set_d(0,x);
key.set_d(1,y);
key.set_d(2,z);
}
#endif /* ZMORTON_HPP_ */