Commit f4f307e3 authored by incardon's avatar incardon

Tests are working, but flush must be revised; testHeatStencilInsert is unclear

parent 0f8d618b
......@@ -52,6 +52,10 @@ if(CUDA_FOUND)
set(DEFINE_CUDA_GPU "#define CUDA_GPU")
endif()
if(TEST_PERFORMANCE)
set(DEFINE_PERFORMANCE_TEST "#define PERFORMANCE_TEST")
endif()
if (Boost_FOUND)
set(DEFINE_HAVE_BOOST "#define HAVE_BOOST")
set(DEFINE_HAVE_BOOST_IOSTREAMS "#define HAVE_BOOST_IOSTREAMS")
......
......@@ -229,6 +229,12 @@ do
test_coverage)
conf_options="$conf_options -DTEST_COVERAGE=ON"
;;
scan_coverty)
conf_options="$conf_options -DSCAN_COVERTY=ON"
;;
test_performance)
conf_options="$conf_options -DTEST_PERFORMANCE=ON"
;;
gpu)
if [ x"$CXX" == x"" ]; then
conf_options="$conf_options"
......@@ -469,6 +475,9 @@ do
boost)
conf_options="$conf_options -DBOOST_ROOT=$ac_optarg"
;;
action_on_error)
conf_options="$conf_options -DACTION_ON_ERROR=$ac_optarg"
;;
mpivendor)
conf_options="$conf_options -DMPI_VENDOR=$ac_optarg"
;;
......
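The three new case branches simply accumulate CMake definitions; a sketch of the cmake command line they build up (values are placeholders):

    cmake .. -DTEST_PERFORMANCE=ON -DSCAN_COVERTY=ON -DACTION_ON_ERROR=STOP

TEST_PERFORMANCE is then consumed by the configuration header above (DEFINE_PERFORMANCE_TEST) and by the CMakeLists below to pull in the Stencil_performance_tests.cu sources.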
......@@ -2,8 +2,12 @@ cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
########################### Executables
if (TEST_PERFORMANCE)
set(CUDA_SOURCES SparseGridGpu/performance/Stencil_performance_tests.cu)
endif ()
if (CUDA_FOUND)
set(CUDA_SOURCES
set(CUDA_SOURCES ${CUDA_SOURCES}
Vector/vector_gpu_unit_tests.cu
Grid/cuda/cuda_grid_gpu_tests.cu
Vector/cuda/map_vector_cuda_funcs_tests.cu
......@@ -14,12 +18,12 @@ if (CUDA_FOUND)
Vector/cuda/map_vector_sparse_cuda_ker_unit_tests.cu
Vector/cuda/map_vector_sparse_cuda_kernels_unit_tests.cu
NN/CellList/tests/CellDecomposer_gpu_ker_unit_test.cu
SparseGridGpu/BlockMapGpu.hpp
SparseGridGpu/BlockMapGpu_ker.cuh
SparseGridGpu/tests/BlockMapGpu_tests.cu
util/test/zmorton_unit_tests.cpp
SparseGridGpu/DataBlock.cuh SparseGridGpu/BlockMapGpu_kernels.cuh
util/cuda/segreduce_block_cuda.cuh util/cuda/test/segreduce_block_cuda_tests.cu SparseGridGpu/BlockMapGpu_dimensionalityWrappers.cuh SparseGridGpu/tests/BlockMapGpu_kernels_tests.cu SparseGridGpu/SparseGridGpu.hpp SparseGridGpu/tests/SparseGridGpu_tests.cu SparseGridGpu/SparseGridGpu_ker.cuh SparseGridGpu/SparseGridGpu_kernels.cuh SparseGridGpu/BlockCache/BlockCache.cuh SparseGridGpu/tests/Stencil_performance_tests.cu )
util/cuda/test/segreduce_block_cuda_tests.cu
SparseGridGpu/tests/BlockMapGpu_kernels_tests.cu
SparseGridGpu/tests/SparseGridGpu_tests.cu
)
else ()
set(CUDA_SOURCES)
endif ()
......@@ -37,6 +41,13 @@ add_executable(mem_map ../../openfpm_devices/src/Memleak_check.cpp
../../openfpm_devices/src/memory/PtrMemory.cpp
SparseGridGpu/Geometry/tests/grid_smb_tests.cpp)
add_executable(isolation
../../openfpm_devices/src/memory/CudaMemory.cu
../../openfpm_devices/src/memory/HeapMemory.cpp
SparseGridGpu/tests/SparseGridGpu_tests.cu
isolation.cpp
)
if (CMAKE_COMPILER_IS_GNUCC)
target_compile_options(mem_map PRIVATE "-Wno-deprecated-declarations")
if (TEST_COVERAGE)
......@@ -48,6 +59,7 @@ endif ()
if (TEST_PERFORMANCE)
target_include_directories(mem_map PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../openfpm_io/src/)
target_include_directories(isolation PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../openfpm_io/src/)
endif ()
if (CUDA_FOUND)
......@@ -58,6 +70,13 @@ if (CUDA_FOUND)
if (TEST_COVERAGE)
target_compile_options(mem_map PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: -Xcompiler "-fprofile-arcs -ftest-coverage" >)
endif ()
# target_compile_options(mem_map PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} >)
# The below is for better performance on CUDA
# "-rdc=true" is for Dynamic Parallelism (cooperative groups)
target_compile_options(isolation PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} -use_fast_math -gencode arch=compute_50,code=sm_50 -g -lineinfo >)
if (TEST_COVERAGE)
target_compile_options(isolation PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: -Xcompiler "-fprofile-arcs -ftest-coverage" >)
endif ()
endif ()
target_include_directories(mem_map PUBLIC ${CUDA_INCLUDE_DIRS})
......@@ -67,9 +86,20 @@ target_include_directories(mem_map PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/config)
target_include_directories(mem_map PUBLIC ${LIBHILBERT_INCLUDE_DIRS})
target_include_directories(mem_map PUBLIC ${Boost_INCLUDE_DIRS})
target_include_directories(isolation PUBLIC ${CUDA_INCLUDE_DIRS})
target_include_directories(isolation PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
target_include_directories(isolation PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../openfpm_devices/src/)
target_include_directories(isolation PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/config)
target_include_directories(isolation PUBLIC ${LIBHILBERT_INCLUDE_DIRS})
target_include_directories(isolation PUBLIC ${Boost_INCLUDE_DIRS})
target_link_libraries(mem_map ${Boost_LIBRARIES})
target_link_libraries(mem_map -L${LIBHILBERT_LIBRARY_DIRS} ${LIBHILBERT_LIBRARIES})
target_link_libraries(isolation ${Boost_LIBRARIES})
target_link_libraries(isolation -L${LIBHILBERT_LIBRARY_DIRS} ${LIBHILBERT_LIBRARIES})
if (TEST_COVERAGE)
target_link_libraries(mem_map -lgcov)
endif ()
......
......@@ -24,7 +24,8 @@ public:
* \param exp grid_key_dx expression
*
*/
template<typename exp1> inline grid_key_dx(const grid_key_dx_expression<dim,exp1> & exp)
template<typename exp1>
__device__ __host__ inline grid_key_dx(const grid_key_dx_expression<dim,exp1> & exp)
{
for (size_t i = 0 ; i < dim ; i++)
this->k[i] = exp.value(i);
......@@ -243,9 +244,10 @@ public:
* \return a grid_key_dx_expression that encapsulate the expression
*
*/
inline grid_key_dx_sum<dim,grid_key_dx<dim>,grid_key_dx<dim>> operator+(const grid_key_dx<dim> & p) const
__device__ __host__ inline grid_key_dx_sum<dim,grid_key_dx<dim,index_type>,grid_key_dx<dim,index_type>>
operator+(const grid_key_dx<dim,index_type> & p) const
{
grid_key_dx_sum<dim,grid_key_dx<dim>,grid_key_dx<dim>> exp_sum(*this,p);
grid_key_dx_sum<dim,grid_key_dx<dim,index_type>,grid_key_dx<dim,index_type>> exp_sum(*this,p);
return exp_sum;
}
......@@ -257,7 +259,8 @@ public:
* \return a grid_key_dx_expression that encapsulate the expression
*
*/
inline grid_key_dx_sum<dim,grid_key_dx<dim>,Point<dim,long int>> operator+(const Point<dim,long int> & p) const
inline grid_key_dx_sum<dim,grid_key_dx<dim>,Point<dim,long int>>
operator+(const Point<dim,long int> & p) const
{
grid_key_dx_sum<dim,grid_key_dx<dim>,Point<dim,long int>> exp_sum(*this,p);
......
......@@ -19,10 +19,6 @@ struct report_grid_copy_func_tests
report_grid_copy_func_tests report_grid_funcs;
//openfpm::vector<std::string> testsg;
//openfpm::vector<float> per_timesg;
BOOST_AUTO_TEST_SUITE( grid_performance )
BOOST_AUTO_TEST_CASE(grid_performance_set_obj)
......
......@@ -115,7 +115,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
vector_cnt_type starts;
//! \brief sparse vector in case of sparse Cell-list
openfpm::vector_sparse_u_gpu<aggregate<cnt_type>> cl_sparse;
openfpm::vector_sparse_gpu<aggregate<cnt_type>> cl_sparse;
//! \brief number of neighborhood cells each cell has + offset
openfpm::vector_gpu<aggregate<cnt_type>> cells_nn;
......
......@@ -596,7 +596,7 @@ class CellList_gpu_ker<dim,T,cnt_type,ids_type,transform,true>: public CellDecom
openfpm::vector_gpu_ker<aggregate<cnt_type>,memory_traits_inte> dprt;
//! Set of cells sparse
openfpm::vector_sparse_gpu_ker<aggregate<cnt_type>,cnt_type,memory_traits_inte> cl_sparse;
openfpm::vector_sparse_gpu_ker<aggregate<cnt_type>,int,memory_traits_inte> cl_sparse;
//! Ghost particle marker
unsigned int g_m;
......@@ -608,7 +608,7 @@ public:
__device__ inline CellList_gpu_ker(openfpm::vector_gpu_ker<aggregate<cnt_type>,memory_traits_inte> cell_nn,
openfpm::vector_gpu_ker<aggregate<cnt_type,cnt_type>,memory_traits_inte> cell_nn_list,
openfpm::vector_sparse_gpu_ker<aggregate<cnt_type>,cnt_type,memory_traits_inte> cl_sparse,
openfpm::vector_sparse_gpu_ker<aggregate<cnt_type>,int,memory_traits_inte> cl_sparse,
openfpm::vector_gpu_ker<aggregate<cnt_type>,memory_traits_inte> srt,
openfpm::vector_gpu_ker<aggregate<cnt_type>,memory_traits_inte> dprt,
openfpm::array<T,dim,cnt_type> & spacing_c,
......
......@@ -24,7 +24,7 @@ using BlockTypeOf = typename std::remove_reference<typename boost::fusion::resul
template<typename AggregateT, unsigned int p>
using ScalarTypeOf = typename std::remove_reference<typename boost::fusion::result_of::at_c<typename AggregateT::type, p>::type>::type::scalarType;
template<typename AggregateBlockT, unsigned int threadBlockSize=128, typename indexT=int, template<typename> class layout_base=memory_traits_inte>
template<typename AggregateBlockT, unsigned int threadBlockSize=128, typename indexT=long int, template<typename> class layout_base=memory_traits_inte>
class BlockMapGpu
{
private:
......@@ -34,9 +34,8 @@ protected:
const static unsigned char EXIST_BIT = 0;
typedef typename AggregateAppend<DataBlock<unsigned char, BlockT0::size>, AggregateBlockT>::type AggregateInternalT;
static const unsigned int pMask = AggregateInternalT::max_prop_real - 1;
openfpm::vector_sparse_gpu<
openfpm::vector_sparse_gpu_block<
AggregateInternalT,
openfpm::VECTOR_SPARSE_BLOCK,
BlockMapGpuFunctors::BlockFunctor<threadBlockSize>
> blockMap;
......@@ -127,6 +126,19 @@ public:
{
unsetBit(bitMask, EXIST_BIT);
}
/*! \brief Return the internal blockMap structure
*
* \return the blockMap
*
*/
openfpm::vector_sparse_gpu_block<
AggregateInternalT,
BlockMapGpuFunctors::BlockFunctor<threadBlockSize>
> & private_get_blockMap()
{
return blockMap;
}
};
template<typename AggregateBlockT, unsigned int threadBlockSize, typename indexT, template<typename> class layout_base>
......
......@@ -131,12 +131,12 @@ public:
#endif // __NVCC__
}
inline __device__ auto insertBlockNew(unsigned int blockId, unsigned int stride = 8192) -> decltype(blockMap.insert(0))
inline __device__ auto insertBlockNew(indexT blockId, unsigned int stride = 8192) -> decltype(blockMap.insert(0))
{
__shared__ int mem[encap_shmem<sizeof(blockMap.insert(0))>::nthr];
#ifdef __NVCC__
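// Guarding on threadIdx.y and threadIdx.z matters once non-1D thread
// blocks are used: with e.g. dim3(32,2,1), two threads would satisfy
// threadIdx.x % stride == 0 and the same block would be inserted twice.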
if (threadIdx.x % stride == 0)
if (threadIdx.x % stride == 0 && threadIdx.y == 0 && threadIdx.z == 0)
{
auto ec = blockMap.insert(blockId);
......@@ -210,6 +210,31 @@ public:
{
return get<pMask>(linId);
}
#ifdef SE_CLASS1
/*! \brief Check if the device pointer is owned by this structure
*
* \return a pointer_check structure with information about the match
*
*/
pointer_check check_device_pointer(void * ptr)
{
pointer_check pc;
pc = blockMap.check_device_pointer(ptr);
if (pc.match == true)
{
pc.match_str = std::string("blockMap overflow : ") + "\n" + pc.match_str;
return pc;
}
return pc;
}
#endif
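// Usage sketch (SE_CLASS1 builds only; object and pointer names are
// hypothetical):
//
//   pointer_check pc = obj.check_device_pointer(dev_ptr);
//   if (pc.match)
//   {std::cout << pc.match_str << std::endl;}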
};
template<typename AggregateBlockT, typename indexT, template<typename> class layout_base>
......
......@@ -185,6 +185,130 @@ namespace BlockMapGpuKernels
#endif // __NVCC__
}
// GridSize = number of segments
// BlockSize = chunksPerBlock * chunkSize
//
/* template<unsigned int p,
unsigned int pSegment,
unsigned int pMask,
unsigned int chunksPerBlock,
typename op,
typename IndexVectorT, typename DataVectorT, typename MaskVectorT>
__global__ void
segreduce_total(
DataVectorT data,
DataVectorT data_old,
IndexVectorT segments_data,
IndexVectorT segments_dataMap,
IndexVectorT outputMap,
MaskVectorT masks,
DataVectorT output
)
{
#ifdef __NVCC__
typedef typename DataVectorT::value_type AggregateT;
typedef BlockTypeOf<AggregateT, p> DataType;
typedef BlockTypeOf<AggregateT, pMask> MaskType;
typedef typename std::remove_all_extents<DataType>::type BaseBlockType;
constexpr unsigned int chunkSize = BaseBlockType::size;
unsigned int segmentId = blockIdx.x;
int segmentSize = segments_data.template get<pSegment>(segmentId + 1)
- segments_data.template get<pSegment>(segmentId);
unsigned int start = segments_data.template get<pSegment>(segmentId);
unsigned int chunkId = threadIdx.x / chunkSize;
unsigned int offset = threadIdx.x % chunkSize;
__shared__ ArrayWrapper<DataType> A[chunksPerBlock];
__shared__ MaskType AMask[chunksPerBlock];
typename ComposeArrayType<DataType>::type bReg;
typename MaskType::scalarType aMask, bMask;
// Phase 0: Load chunks as first operand of the reduction
if (chunkId < segmentSize)
{
unsigned int m_chunkId = segments_dataMap.template get<0>(start + chunkId);
A[chunkId][offset] = RhsBlockWrapper<DataType>(data.template get<p>(m_chunkId), offset).value;
aMask = masks.template get<pMask>(m_chunkId)[offset];
}
int i = chunksPerBlock;
for (; i < segmentSize - (int) (chunksPerBlock); i += chunksPerBlock)
{
unsigned int m_chunkId = segments_dataMap.template get<0>(start + i + chunkId);
generalDimensionFunctor<decltype(bReg)>::assignWithOffsetRHS(bReg,
data.template get<p>(m_chunkId),
offset);
bMask = masks.template get<pMask>(m_chunkId)[offset];
generalDimensionFunctor<DataType>::template applyOp<op>(A[chunkId][offset],
bReg,
BlockMapGpu_ker<>::exist(aMask),
BlockMapGpu_ker<>::exist(bMask));
aMask = aMask | bMask;
}
if (i + chunkId < segmentSize)
{
unsigned int m_chunkId = segments_dataMap.template get<0>(start + i + chunkId);
generalDimensionFunctor<decltype(bReg)>::assignWithOffsetRHS(bReg,
data.template get<p>(m_chunkId),
offset);
bMask = masks.template get<pMask>(m_chunkId)[offset];
generalDimensionFunctor<DataType>::template applyOp<op>(A[chunkId][offset],
bReg,
BlockMapGpu_ker<>::exist(aMask),
BlockMapGpu_ker<>::exist(bMask));
aMask = aMask | bMask;
}
AMask[chunkId][offset] = aMask;
__syncthreads();
// Horizontal reduction finished
// Now vertical reduction
for (int j = 2; j <= chunksPerBlock && j <= segmentSize; j *= 2)
{
if (chunkId % j == 0 && chunkId < segmentSize)
{
unsigned int otherChunkId = chunkId + (j / 2);
if (otherChunkId < segmentSize)
{
aMask = AMask[chunkId][offset];
bMask = AMask[otherChunkId][offset];
generalDimensionFunctor<DataType>::template applyOp<op>(A[chunkId][offset],
A[otherChunkId][offset],
BlockMapGpu_ker<>::exist(aMask),
BlockMapGpu_ker<>::exist(bMask));
AMask[chunkId][offset] = aMask | bMask;
}
}
__syncthreads();
}
//////////////////////////////////////// Reduce now with old data if present
///////////////////////////////////////////////////////////////////////////////////
// Write output
if (chunkId == 0)
{
unsigned int out_id = outputMap.template get<0>(segmentId);
generalDimensionFunctor<DataType>::assignWithOffset(output.template get<p>(out_id), A[chunkId].data,
offset);
}
#else // __NVCC__
std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc if you want to use it on the GPU" << std::endl;
#endif // __NVCC__
}*/
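// The disabled kernel above sketches a two-phase segmented block reduction:
// a "horizontal" phase where each of the chunksPerBlock chunks strides
// through its segment (i += chunksPerBlock) folding data into A[chunkId],
// and a "vertical" tree phase (j = 2, 4, ...) that pairwise combines the
// per-chunk partials, leaving the result in chunk 0 before it is written
// out through outputMap. E.g. with chunksPerBlock = 4 and segmentSize = 8,
// chunk c first folds data chunks {c, c+4}; then chunk 0 absorbs chunk 1,
// chunk 2 absorbs chunk 3, and finally chunk 0 absorbs chunk 2.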
/**
* Compact various memory pools (with empty segments) into a single contiguous memory region.
* NOTE: Each thread block is in charge of one pool
......@@ -558,6 +682,24 @@ namespace BlockMapGpuFunctors
svr(dataOut,tmpData,mergeIndices,context);
boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr);
///////////////////// DEBUG /////////////////
/* dataOut.template deviceToHost<0>();
std::cout << "DATA CHUNKS OUT: " << dataOut.size() << std::endl;
for (size_t i = 0 ; i < dataOut.size() ; i++)
{
for (size_t j = 0 ; j < 64 ; j++)
{
std::cout << dataOut.template get<0>(i)[j] << " ";
}
std::cout << std::endl;
}*/
//////////////////////////////////////////////
return true; //todo: check if error in kernel
#else // __NVCC__
std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc if you want to use it on the GPU" << std::endl;
......
......@@ -158,9 +158,8 @@ public:
{
mem_id blockLinId = blockCoord.get(dim - 1);
if (blockLinId >= blockSz[dim-1])
{
return -1;
}
{return -1;}
for (int d = dim - 2; d >= 0; --d)
{
blockLinId *= blockSz[d];
......@@ -174,6 +173,20 @@ public:
return blockLinId;
}
// Now methods to handle blockGrid coordinates (e.g. to load neighbouring blocks)
template<typename indexT>
inline __host__ __device__ grid_key_dx<dim,indexT> getGlobalCoord(const grid_key_dx<dim, indexT> blockCoord, unsigned int offset) const
{
grid_key_dx<dim,indexT> k;
for (unsigned int i = 0 ; i < dim ; i++)
{
k.set_d(i,blockCoord.get(i)*blockEdgeSize + offset%blockEdgeSize);
offset /= blockEdgeSize;
}
return k;
}
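// Worked example (hypothetical values): with dim = 2, blockEdgeSize = 4,
// blockCoord = (2,3) and offset = 9:
//   i = 0: k[0] = 2*4 + 9%4 = 9,  offset becomes 9/4 = 2
//   i = 1: k[1] = 3*4 + 2%4 = 14
// i.e. the in-block offset is decoded digit by digit in base blockEdgeSize.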
inline __host__ __device__ grid_key_dx<dim, int> BlockInvLinId(mem_id blockLinId) const
{
grid_key_dx<dim, int> blockCoord;
......@@ -195,6 +208,11 @@ public:
return sz;
}
inline size_t getBlockSize() const
{
return blockSize;
}
};
#endif //OPENFPM_PDATA_BLOCKGEOMETRY_HPP
......@@ -13,6 +13,9 @@
#include "SparseGridGpu_kernels.cuh"
#include "Iterators/SparseGridGpu_iterator_sub.hpp"
#include "Geometry/grid_zmb.hpp"
#ifdef OPENFPM_DATA_ENABLE_IO_MODULE
#include "VTKWriter/VTKWriter.hpp"
#endif
// todo: Move all the following utils into some proper file inside TemplateUtils
......@@ -58,6 +61,17 @@ struct aggregate_convert<dim,blockEdgeSize,aggregate<types ...>>
typedef typename aggregate_transform_datablock_impl<dim,blockEdgeSize,types ...>::type type;
};
template<typename aggr>
struct aggregate_add
{
};
template<typename ... types>
struct aggregate_add<aggregate<types ...>>
{
typedef aggregate<types ..., unsigned char> type;
};
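// Example: aggregate_add<aggregate<float,double[3]>>::type is
// aggregate<float,double[3],unsigned char>; the extra unsigned char slot is
// filled by write() below with the existence mask of each lattice point.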
/////////////
enum StencilMode
......@@ -83,11 +97,42 @@ struct ct_par
static const unsigned int nLoop = nLoop_;
};
template<typename Tsrc,typename Tdst>
class copy_prop_to_vector_block
{
//! source
Tsrc src;
//! destination
Tdst dst;
size_t pos;
unsigned int bPos;
public:
copy_prop_to_vector_block(Tsrc src, Tdst dst,size_t pos, size_t bPos)
:src(src),dst(dst),pos(pos),bPos(bPos)
{}
//! It calls the copy function for each property
template<typename T>
inline void operator()(T& t) const
{
typedef typename std::remove_reference<decltype(dst.template get<T::value>())>::type copy_rtype;
meta_copy<copy_rtype>::meta_copy_(src.template get<T::value>()[bPos],dst.template get<T::value>());
}
};
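// Used by write() below: for each property of a block, the functor copies
// the scalar at in-block position bPos into the corresponding property of
// the destination element via meta_copy.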
template<unsigned int dim,
typename AggregateT,
unsigned int blockEdgeSize = default_edge<dim>::type::value,
unsigned int threadBlockSize = 128,
typename indexT=int,
typename indexT=long int,
template<typename> class layout_base=memory_traits_inte,
typename linearizer = grid_smb<dim, blockEdgeSize>>
class SparseGridGpu : public BlockMapGpu<
......@@ -110,7 +155,7 @@ private:
//! index (in a star like 0 mean negative x 1 positive x, 1 mean negative y and so on)
openfpm::vector_gpu<aggregate<short int,short int>> ghostLayerToThreadsMapping;
openfpm::vector_gpu<aggregate<unsigned int>> nn_blocks;
openfpm::vector_gpu<aggregate<indexT>> nn_blocks;
protected:
static constexpr unsigned int blockSize = BlockTypeOf<AggregateBlockT, 0>::size;
......@@ -302,11 +347,11 @@ private:
? numScalars / localThreadBlockSize
: 1 + numScalars / localThreadBlockSize;
SparseGridGpuKernels::applyStencilInPlace
CUDA_LAUNCH_DIM3((SparseGridGpuKernels::applyStencilInPlace
<dim,
BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
stencil>
<<<threadGridSize, localThreadBlockSize>>>(
stencil>),
threadGridSize, localThreadBlockSize,
indexBuffer.toKernel(),
dataBuffer.toKernel(),
this->template toKernelNN<stencil::stencil_type::nNN>(),
......@@ -336,11 +381,11 @@ private:
setGPUInsertBuffer(threadGridSize, chunksPerBlock);
// setGPUInsertBuffer(threadGridSize, localThreadBlockSize);
SparseGridGpuKernels::applyStencilInsert
CUDA_LAUNCH_DIM3((SparseGridGpuKernels::applyStencilInsert
<dim,
BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
stencil>
<<<threadGridSize, localThreadBlockSize>>>(
stencil>),
threadGridSize, localThreadBlockSize,
indexBuffer.toKernel(),
dataBuffer.toKernel(),
this->template toKernelNN<stencil::stencil_type::nNN>(),
......@@ -550,9 +595,8 @@ public:
// NOTE: Here we want to work only on one data chunk per block!
unsigned int localThreadBlockSize = threadBlockSize % NNtype::nNN == 0
? threadBlockSize
: (threadBlockSize / NNtype::nNN) * NNtype::nNN; //todo: check that this is properly rounding
unsigned int localThreadBlockSize = NNtype::nNN;
unsigned int threadGridSize = numScalars % localThreadBlockSize == 0
? numScalars / localThreadBlockSize
: 1 + numScalars / localThreadBlockSize;
......@@ -606,13 +650,105 @@ public:
{
unsetBit(bitMask, PADDING_BIT);
}
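/*! \brief Debug helper: dump the insert buffer to standard output
 *
 * For every block staged in vct_add_data it prints the block index and
 * the content of property p for all blockSize elements of the block.
 *
 */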
template<unsigned int p>
void print_vct_add_data()
{
typedef BlockMapGpu<
typename aggregate_convert<dim,blockEdgeSize,AggregateT>::type,
threadBlockSize, indexT, layout_base> BMG;
auto & bM = BMG::blockMap.private_get_vct_add_data();
auto & vI = BMG::blockMap.private_get_vct_add_index();
bM.template deviceToHost<p>();
vI.template deviceToHost<0>();
std::cout << "vct_add_data: " << std::endl;
for (size_t i = 0 ; i < bM.size() ; i++)
{
std::cout << i << " index: " << vI.template get<0>(i) << " BlockData: " << std::endl;
for (size_t j = 0 ; j < blockSize ; j++)
{
std::cout << (int)bM.template get<p>(i)[j] << " ";
}
std::cout << std::endl;
}
}
#ifdef OPENFPM_DATA_ENABLE_IO_MODULE
/*! \brief Write the sparse grid to a VTK file
 *
 * \param output VTK output file name
*
*/
template<typename Tw = float> bool write(const std::string & output)
{
file_type ft = file_type::BINARY;
auto & bm = this->private_get_blockMap();
auto & index = bm.getIndexBuffer();
auto & data = bm.getDataBuffer();
openfpm::vector<Point<dim,Tw>> tmp_pos;
openfpm::vector<typename aggregate_add<AggregateT>::type> tmp_prp;
// copy position and properties
auto it = index.getIterator();
while(it.isNext())
{
auto key = it.get();
Point<dim,Tw> p;
for (size_t i = 0 ; i < gridGeometry.getBlockSize() ; i++)
{
if (data.template get<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>(key)[i] != 0)
{
// Get the block index
grid_key_dx<dim,int> keyg = gridGeometry.InvLinId(index.template get<0>(key),i);
for (size_t k = 0 ; k < dim ; k++)
{p.get(k) = keyg.get(k);}
tmp_pos.add(p);
tmp_prp.add();
copy_prop_to_vector_block<decltype(data.get_o(key)),decltype(tmp_prp.last())>
cp(data.get_o(key),tmp_prp.last(),key,i);
boost::mpl::for_each_ref< boost::mpl::range_c<int,0,AggregateT::max_prop> >(cp);
tmp_prp.last().template get<AggregateT::max_prop>() = data.template get<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>(key)[i];
}
}
++it;
}
// VTKWriter for a set of points
VTKWriter<boost::mpl::pair<openfpm::vector<Point<dim,Tw>>, openfpm::vector<typename aggregate_add<AggregateT>::type>>, VECTOR_POINTS> vtk_writer;
vtk_writer.add(tmp_pos,tmp_prp,tmp_pos.size());
openfpm::vector<std::string> prp_names;
// Write the VTK file
return vtk_writer.write(output,prp_names,"sparse_grid","",ft);
}
#endif
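// Usage sketch (requires OPENFPM_DATA_ENABLE_IO_MODULE; grid instance,
// flush step and file name are hypothetical, and the data is assumed to
// have been copied to the host beforehand):
//
//   SparseGridGpu<2,aggregate<float>> grid(...);
//   // ... insert on GPU, flush, copy device to host ...
//   grid.write("sparse_grid_out.vtk");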
};
template<unsigned int dim,
typename AggregateT,
unsigned int blockEdgeSize = default_edge<dim>::type::value,
unsigned int threadBlockSize = 128,
typename indexT=int,
typename indexT=long int,
template<typename> class layout_base=memory_traits_inte,
typename linearizer = grid_zmb<dim, blockEdgeSize>>
using SparseGridGpu_z = SparseGridGpu<dim,AggregateT,blockEdgeSize,threadBlockSize,indexT,layout_base,linearizer>;
......
......@@ -29,20 +29,24 @@ protected:
static constexpr unsigned int blockSize = BlockTypeOf<AggregateBlockT, 0>::size;
unsigned int ghostLayerSize;
openfpm::vector_gpu_ker<aggregate<short int,short int>,memory_traits_inte> ghostLayerToThreadsMapping;
openfpm::vector_gpu_ker<aggregate<unsigned int>,memory_traits_inte> nn_blocks;
openfpm::vector_gpu_ker<aggregate<indexT>,memory_traits_inte> nn_blocks;
public:
static constexpr unsigned int d = dim;
static constexpr unsigned int blockEdgeSize_ = blockEdgeSize;
unsigned int stencilSupportRadius;
typedef AggregateBlockT AggregateBlockType;
//! Indicate this structure has a function to check the device pointer
typedef int yes_has_check_device_pointer;
public:
SparseGridGpu_ker(const openfpm::vector_sparse_gpu_ker<AggregateBlockT, indexT, layout_base> &blockMap,
linearizer & grid,
GridSmT extendedBlockGeometry,
unsigned int stencilSupportRadius,
openfpm::vector_gpu_ker<aggregate<short int,short int>,memory_traits_inte> ghostLayerToThreadsMapping,
openfpm::vector_gpu_ker<aggregate<unsigned int>,memory_traits_inte> nn_blocks,
openfpm::vector_gpu_ker<aggregate<indexT>,memory_traits_inte> nn_blocks,
unsigned int ghostLayerSize)