
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Source / Target projects:
  • mosaic/software/parallel-computing/openfpm/openfpm_data
  • argupta/openfpm_data
Commits on Source (427)
Showing with 641 additions and 74 deletions
......@@ -36,7 +36,10 @@ find_package(Vc REQUIRED)
if (CUDA_ON_BACKEND STREQUAL "HIP" AND NOT HIP_FOUND)
find_package(HIP)
endif()
find_package(OpenMP)
if (NOT CUDA_ON_BACKEND STREQUAL "HIP")
find_package(OpenMP)
endif()
###### CONFIG.h FILE ######
......
......@@ -93,6 +93,8 @@ public:
}
}
static bool noPointers() {return true;}
#ifdef __NVCC__
//Constructors from dim3 and uint3 objects
__host__ __device__ grid_smb(const dim3 blockDimensions)
......@@ -340,6 +342,11 @@ public:
return sz;
}
__host__ __device__ inline indexT getBlockEgdeSize() const
{
return blockEdgeSize;
}
__host__ __device__ inline indexT getBlockSize() const
{
return blockSize;
......
......@@ -141,6 +141,11 @@ public:
grid_smb<dim,blockEdgeSize,indexT>::swap(other);
}
__host__ __device__ inline indexT getBlockEgdeSize() const
{
return blockEdgeSize;
}
__host__ __device__ inline indexT size() const
{
return grid_smb<dim,blockEdgeSize,indexT>::size();
......
......@@ -34,7 +34,7 @@ template<unsigned int dim>
struct comb
{
//! Array that store the combination
char c[dim];
signed char c[dim];
/*! \brief check if it is a valid combination
*
......@@ -105,7 +105,7 @@ struct comb
* \return Result combination
*
*/
inline comb<dim> operator&(char c_)
inline comb<dim> operator&(signed char c_)
{
comb<dim> ret;
......@@ -246,7 +246,7 @@ struct comb
*
*/
inline char operator[](int i) const
inline signed char operator[](int i) const
{
return c[i];
}
......@@ -257,7 +257,7 @@ struct comb
*
*/
inline char * getComb()
inline signed char * getComb()
{
return c;
}
......@@ -268,7 +268,7 @@ struct comb
*
*/
inline const char * getComb() const
inline const signed char * getComb() const
{
return c;
}
......@@ -282,7 +282,7 @@ struct comb
* \return value of the i index
*
*/
inline char value(int i) const
inline signed char value(int i) const
{
return c[i];
}
......@@ -314,10 +314,10 @@ struct comb
* \param c list of numbers
*
*/
comb(std::initializer_list<char> c)
comb(std::initializer_list<signed char> c)
{
size_t i = 0;
for(char x : c)
for(signed char x : c)
{this->c[c.size() - i - 1] = x;i++;}
}
......@@ -397,7 +397,7 @@ template<>
struct comb<0>
{
//! FIX
char c[0];
signed char c[0];
/*! \brief check if it is a valid combination
*
......@@ -467,7 +467,7 @@ struct comb<0>
*
*/
inline char operator[](int i)
inline signed char operator[](int i)
{
return 0;
}
......@@ -478,7 +478,7 @@ struct comb<0>
*
*/
inline char * getComb()
inline signed char * getComb()
{
return c;
}
......@@ -492,7 +492,7 @@ struct comb<0>
* \return value of the i index
*
*/
inline char value(int i) const
inline signed char value(int i) const
{
return c[i];
}
......
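The hunks above replace plain char with signed char throughout comb. The signedness of plain char is implementation-defined: it is signed on x86 but unsigned by default on ARM and PowerPC, so a combination component of -1 would silently become 255 there. A minimal standalone illustration (not OpenFPM code):

// Standalone demo of why plain char is unsafe for signed combination values:
// whether `char` can hold -1 depends on the target ABI.
#include <iostream>

int main()
{
    char c = -1;          // may become 255 where char is unsigned (e.g. ARM/PPC)
    signed char s = -1;   // always -1
    std::cout << static_cast<int>(c) << " " << static_cast<int>(s) << std::endl;
    return 0;
}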
......@@ -94,12 +94,12 @@ __global__ void copy_ndim_grid_device(grid_type src, grid_type dst)
#endif
template<bool inte_or_lin,unsigned int dim, typename T>
template<bool inte_or_lin, typename base_grid, unsigned int dim, typename T>
struct grid_toKernelImpl
{
template<typename grid_type> static grid_gpu_ker<dim,T,memory_traits_lin,typename grid_type::linearizer_type> toKernel(grid_type & gc)
template<typename grid_type> static base_grid toKernel(grid_type & gc)
{
grid_gpu_ker<dim,T,memory_traits_lin,typename grid_type::linearizer_type> g(gc.getGrid());
/*grid_gpu_ker<dim,T,memory_traits_lin,typename grid_type::linearizer_type>*/base_grid g(gc.getGrid());
g.get_data_().disable_manage_memory();
g.get_data_().mem = gc.get_internal_data_().mem;
......@@ -112,12 +112,12 @@ struct grid_toKernelImpl
}
};
template<unsigned int dim, typename T>
struct grid_toKernelImpl<true,dim,T>
template<typename base_grid, unsigned int dim, typename T>
struct grid_toKernelImpl<true,base_grid,dim,T>
{
template<typename grid_type> static grid_gpu_ker<dim,T,memory_traits_inte, typename grid_type::linearizer_type> toKernel(grid_type & gc)
template<typename grid_type> static base_grid toKernel(grid_type & gc)
{
grid_gpu_ker<dim,T,memory_traits_inte, typename grid_type::linearizer_type> g(gc.getGrid());
/*grid_gpu_ker<dim,T,memory_traits_inte, typename grid_type::linearizer_type>*/ base_grid g(gc.getGrid());
copy_switch_memory_c_no_cpy<typename std::remove_reference<decltype(gc.get_internal_data_())>::type,
typename std::remove_reference<decltype(g.get_data_())>::type> cp_mc(gc.get_internal_data_(),g.get_data_());
......
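The refactor above adds the kernel-side grid type (base_grid) as an explicit template parameter of grid_toKernelImpl, instead of spelling out grid_gpu_ker<...> inside each specialization. A minimal standalone sketch of the pattern, with illustrative names rather than the real OpenFPM types:

// Sketch: the caller picks the kernel-view type, so the same dispatcher can
// build grid_gpu_ker or any compatible wrapper without duplicating the body.
struct host_grid    { int info = 0; int getGrid() const { return info; } };
struct kernel_view_a { kernel_view_a(int) {} };
struct kernel_view_b { kernel_view_b(int) {} };

template<bool inte_layout, typename base_grid>
struct to_kernel_impl
{
    template<typename grid_type>
    static base_grid toKernel(grid_type & gc) { return base_grid(gc.getGrid()); }
};

// usage:
//   host_grid g;
//   auto ka = to_kernel_impl<false, kernel_view_a>::toKernel(g);
//   auto kb = to_kernel_impl<false, kernel_view_b>::toKernel(g);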
......@@ -160,6 +160,70 @@ BOOST_AUTO_TEST_CASE (gpu_computation)
#endif
}
BOOST_AUTO_TEST_CASE (gpu_computation_lambda)
{
#ifdef CUDA_GPU
{
size_t sz[3] = {64,64,64};
grid_gpu<3, aggregate<float,float[2],float[2][2]> > c3(sz);
c3.setMemory();
// Assign
auto c3_k = c3.toKernel();
auto lamb = [c3_k] __device__ (dim3 & blockIdx, dim3 & threadIdx)
{
grid_key_dx<3,int> p({blockIdx.x * blockDim.x + threadIdx.x,
blockIdx.y * blockDim.y + threadIdx.y,
blockIdx.z * blockDim.z + threadIdx.z});
c3_k.template get<0>(p) = 5.0;
c3_k.template get<1>(p)[0] = 5.0;
c3_k.template get<1>(p)[1] = 5.0;
c3_k.template get<2>(p)[0][0] = 5.0;
c3_k.template get<2>(p)[0][1] = 5.0;
c3_k.template get<2>(p)[1][0] = 5.0;
c3_k.template get<2>(p)[1][1] = 5.0;
};
auto ite = c3.getGPUIterator({0,0,0},{63,63,63});
CUDA_LAUNCH_LAMBDA(ite,lamb);
c3.deviceToHost<0,1,2>();
auto it = c3.getIterator();
bool good = true;
while(it.isNext())
{
auto key = it.get();
good &= c3.template get<0>(key) == 5.0;
good &= c3.template get<1>(key)[0] == 5.0;
good &= c3.template get<1>(key)[1] == 5.0;
good &= c3.template get<2>(key)[0][0] == 5.0;
good &= c3.template get<2>(key)[0][1] == 5.0;
good &= c3.template get<2>(key)[1][0] == 5.0;
good &= c3.template get<2>(key)[1][1] == 5.0;
++it;
}
BOOST_REQUIRE_EQUAL(good,true);
}
#endif
}
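The new gpu_computation_lambda test launches an extended __device__ lambda through CUDA_LAUNCH_LAMBDA. Note that the kernel view c3_k is captured by value and that blockIdx/threadIdx arrive as explicit parameters, presumably so the same lambda can also run on the sequential/OpenMP cudify backends. A plain-CUDA sketch of the underlying capture pattern (no OpenFPM, hypothetical launcher name, compile with nvcc --extended-lambda):

#include <cuda_runtime.h>

// Generic launcher: any callable marked __device__ can be passed by value.
template<typename Lambda>
__global__ void launch_lambda(Lambda f) { f(); }

int main()
{
    float * d_x;
    cudaMalloc(&d_x, 64 * sizeof(float));

    // Capture the raw device pointer by value, just as c3.toKernel() is captured above.
    auto lamb = [d_x] __device__ ()
    { d_x[blockIdx.x * blockDim.x + threadIdx.x] = 5.0f; };

    launch_lambda<<<2, 32>>>(lamb);
    cudaDeviceSynchronize();
    cudaFree(d_x);
    return 0;
}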
BOOST_AUTO_TEST_CASE (gpu_computation_stencil)
{
#ifdef CUDA_GPU
......
......@@ -158,7 +158,7 @@ class grid_gpu_ker
typedef typename layout_base<T_>::type layout;
//! layout data
layout data_;
mutable layout data_;
......@@ -264,14 +264,32 @@ public:
* \return the const reference of the element
*
*/
template <unsigned int p, typename ids_type, typename r_type=decltype(layout_base<T_>::template get_c<p>(data_,g1,grid_key_dx<dim>()))>
__device__ __host__ inline const r_type get(const grid_key_dx<dim,ids_type> & v1) const
template <unsigned int p, typename ids_type, typename r_type=decltype(layout_base<T_>::template get<p>(data_,g1,grid_key_dx<dim>()))>
__device__ __host__ inline r_type get_debug(const grid_key_dx<dim,ids_type> & v1) const
{
#ifdef SE_CLASS1
if (check_bound(v1) == false)
{fill_grid_error_array_overflow<dim,p>(this->template getPointer<p>(),v1);}
#endif
return layout_base<T_>::template get_c<p>(data_,g1,v1);
return layout_base<T_>::template get<p>(data_,g1,v1);
}
/*! \brief Get the const reference of the selected element
*
* \param v1 grid_key that identify the element in the grid
*
* \return the const reference of the element
*
*/
template <unsigned int p, typename ids_type, typename r_type=decltype(layout_base<T_>::template get<p>(data_,g1,grid_key_dx<dim>()))>
__device__ __host__ inline r_type get(const grid_key_dx<dim,ids_type> & v1) const
{
#ifdef SE_CLASS1
if (check_bound(v1) == false)
{fill_grid_error_array_overflow<dim,p>(this->template getPointer<p>(),v1);}
#endif
return layout_base<T_>::template get<p>(data_,g1,v1);
}
/*! \brief Get the reference of the selected element
......
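Two related changes above: data_ becomes mutable, and the const get<p>() now returns through layout_base::get (the old get_c path is preserved as get_debug). The effect is that a const kernel wrapper can still hand out writable references, which device code needs when the wrapper itself is passed around as const. A standalone sketch of the idiom with toy types:

// Toy sketch of the mutable-member idiom (not the OpenFPM layout classes).
struct kernel_view
{
    mutable float data[4];       // analogous to `mutable layout data_;`

    // const member function, yet the returned reference is writable
    float & get(int i) const { return data[i]; }
};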
......@@ -620,7 +620,7 @@ private:
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,dim,T_>::toKernel(*this);
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,grid_gpu_ker<dim,T_,layout_base,linearizer_type>,dim,T_>::toKernel(*this);
#endif
}
......@@ -705,8 +705,8 @@ public:
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,dim,T_>::toKernel(*this);
g.base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,dim,T_>::toKernel(g);
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,grid_gpu_ker<dim,T_,layout_base,linearizer_type>,dim,T_>::toKernel(*this);
g.base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,grid_gpu_ker<dim,T_,layout_base,linearizer_type>,dim,T_>::toKernel(g);
#endif
......@@ -726,7 +726,7 @@ public:
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,dim,T_>::toKernel(*this);
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,grid_gpu_ker<dim,T,layout_base,linearizer_type>,dim,T_>::toKernel(*this);
#endif
......@@ -809,7 +809,7 @@ public:
* \param stop end point
*
*/
struct ite_gpu<dim> getGPUIterator(grid_key_dx<dim,long int> & key1, grid_key_dx<dim,long int> & key2, size_t n_thr = default_kernel_wg_threads_) const
struct ite_gpu<dim> getGPUIterator(const grid_key_dx<dim,long int> & key1, const grid_key_dx<dim,long int> & key2, size_t n_thr = default_kernel_wg_threads_) const
{
return getGPUIterator_impl<dim>(g1,key1,key2,n_thr);
}
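Taking key1/key2 by const reference lets callers pass temporaries, such as the braced keys in the lambda test above (c3.getGPUIterator({0,0,0},{63,63,63})), which would not bind to the previous non-const references. A toy standalone illustration (not the OpenFPM types):

struct key3 { long int x, y, z; };
struct ite  { };

ite getGPUIterator(const key3 & key1, const key3 & key2) { return ite{}; }

int main()
{
    auto it = getGPUIterator({0,0,0}, {63,63,63});   // ok with const&, rejected with key3&
    (void) it;
    return 0;
}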
......@@ -850,12 +850,25 @@ public:
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,dim,T_>::toKernel(*this);
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,grid_gpu_ker<dim,T_,layout_base,linearizer_type>,dim,T_>::toKernel(*this);
#endif
}
/*! \brief Return the memory object
*
* Return the memory object
*
* \tparam p array to retrieve
*
*/
template<unsigned int p>
auto getMemory() -> decltype(boost::fusion::at_c<p>(data_).getMemory())
{
return boost::fusion::at_c<p>(data_).getMemory();
}
/*! \brief Set the object that provide memory from outside
*
* An external allocator is useful with allocator like PreAllocHeapMem
......@@ -880,7 +893,7 @@ public:
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,dim,T_>::toKernel(*this);
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,grid_gpu_ker<dim,T_,layout_base,linearizer_type>,dim,T_>::toKernel(*this);
#endif
}
......@@ -909,7 +922,7 @@ public:
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,dim,T_>::toKernel(*this);
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,grid_gpu_ker<dim,T_,layout_base,linearizer_type>,dim,T_>::toKernel(*this);
#endif
}
......@@ -1506,7 +1519,7 @@ public:
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,dim,T_>::toKernel(*this);
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,grid_gpu_ker<dim,T_,layout_base,linearizer_type>,dim,T_>::toKernel(*this);
#endif
}
......@@ -1536,8 +1549,8 @@ public:
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,dim,T_>::toKernel(*this);
grid.base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,dim,T_>::toKernel(grid);
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,grid_gpu_ker<dim,T_,layout_base,linearizer_type>,dim,T_>::toKernel(*this);
grid.base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,grid_gpu_ker<dim,T_,layout_base,linearizer_type>,dim,T_>::toKernel(grid);
#endif
}
......@@ -1876,7 +1889,7 @@ public:
*/
grid_gpu_ker<dim,T_,layout_base,linearizer_type> toKernel()
{
return grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,dim,T_>::toKernel(*this);
return grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,grid_gpu_ker<dim,T_,layout_base,linearizer_type>,dim,T_>::toKernel(*this);
}
/*! \brief Convert the grid into a data-structure compatible for computing into GPU
......@@ -1886,7 +1899,7 @@ public:
*/
const grid_gpu_ker<dim,T_,layout_base,linearizer_type> toKernel() const
{
return grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,dim,T_>::toKernel(*this);
return grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,grid_gpu_ker<dim,T_,layout_base,linearizer_type>,dim,T_>::toKernel(*this);
}
#endif
......
......@@ -108,12 +108,12 @@ struct ite_gpu
grid_key_dx<dim,int> start;
grid_key_dx<dim,int> stop;
size_t nblocks()
size_t nblocks() const
{
return wthr.x * wthr.y * wthr.z;
}
size_t nthrs()
size_t nthrs() const
{
return thr.x * thr.y * thr.z;
}
......@@ -365,6 +365,25 @@ public:
}
}
/*! \brief copy constructor
*
* \param g grid info
*
* construct a grid from another grid. As a copy constructor can't be template,
* the one above won't work for const grid_sm<N,S> & g, where S=T
*
*/
__device__ __host__ inline grid_sm(const grid_sm<N,T> & g)
{
size_tot = g.size_tot;
for (size_t i = 0 ; i < N ; i++)
{
sz[i] = g.sz[i];
sz_s[i] = g.sz_s[i];
}
}
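As the comment above says, a constructor template is never selected as the copy constructor, so the pre-existing templated grid_sm(const grid_sm<N,S> &) does not cover the S == T case; an explicit (and __device__ __host__ annotated) copy constructor is added. A standalone sketch of the language rule with a toy type:

// The template below is a converting constructor and is never used as the copy
// constructor; copying info<N,T> from info<N,T> uses the explicitly defined one.
template<unsigned int N, typename T>
struct info
{
    T sz[N];

    info() = default;

    // converting constructor: handles info<N,S> with S != T
    template<typename S>
    info(const info<N,S> & g)
    { for (unsigned int i = 0; i < N; i++) { sz[i] = g.sz[i]; } }

    // the actual copy constructor (in grid_sm also marked __device__ __host__)
    info(const info<N,T> & g)
    { for (unsigned int i = 0; i < N; i++) { sz[i] = g.sz[i]; } }
};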
// Static element to calculate total size
inline size_t totalSize(const size_t sz)
......@@ -432,7 +451,7 @@ public:
*/
template<typename check=NoCheck, typename ids_type>
inline mem_id LinId(const grid_key_dx<N,ids_type> & gk, const char sum_id[N]) const
inline mem_id LinId(const grid_key_dx<N,ids_type> & gk, const signed char sum_id[N]) const
{
mem_id lid;
......@@ -468,7 +487,7 @@ public:
*/
template<typename check=NoCheck,typename ids_type>
inline mem_id LinId(const grid_key_dx<N,ids_type> & gk, const char sum_id[N], const size_t (&bc)[N]) const
inline mem_id LinId(const grid_key_dx<N,ids_type> & gk, const signed char sum_id[N], const size_t (&bc)[N]) const
{
mem_id lid;
......
......@@ -43,7 +43,7 @@
*
*
*/
template <typename T, bool has_noPointers>
template <typename T, int has_noPointers>
struct Pack_selector_unknown_type_impl
{
enum
......@@ -53,7 +53,7 @@ struct Pack_selector_unknown_type_impl
};
template <typename T>
struct Pack_selector_unknown_type_impl<T,false>
struct Pack_selector_unknown_type_impl<T,0>
{
enum
{
......@@ -61,6 +61,28 @@ struct Pack_selector_unknown_type_impl<T,false>
};
};
/*! \brief Pack selector for unknown type
*
*
*/
template <typename T>
struct Pack_selector_unknown_type_impl<T,2>
{
enum
{
value = PACKER_ARRAY_CP_PRIMITIVE
};
};
template <typename T>
struct Pack_selector_unknown_type_impl<T,3>
{
enum
{
value = PACKER_ARRAY_CP_PRIMITIVE
};
};
/*! \brief Pack selector for unknown type
*
*
......@@ -70,7 +92,7 @@ struct Pack_selector_known_type_impl
{
enum
{
value = Pack_selector_unknown_type_impl<T, has_noPointers<T>::value >::value
value = Pack_selector_unknown_type_impl<T, has_noPointers<T>::value + 2*std::is_array<T>::value >::value
};
};
......
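The selector above now dispatches on an int computed as has_noPointers<T>::value + 2*std::is_array<T>::value, so non-array types keep indices 0/1 while C-array types land on 2/3, both of which map to PACKER_ARRAY_CP_PRIMITIVE. A self-contained sketch of the same dispatch; the PACKER_* values and the has_noPointers stand-in are re-declared here only so the snippet compiles on its own, they are not the real OpenFPM definitions:

#include <type_traits>

enum { PACKER_GENERAL = 0, PACKER_PRIMITIVE = 1, PACKER_ARRAY_CP_PRIMITIVE = 2 };

template<typename T, int idx>                       // idx = noPointers + 2 * is_array
struct selector                  { enum { value = PACKER_PRIMITIVE }; };
template<typename T> struct selector<T,0> { enum { value = PACKER_GENERAL }; };
template<typename T> struct selector<T,2> { enum { value = PACKER_ARRAY_CP_PRIMITIVE }; };
template<typename T> struct selector<T,3> { enum { value = PACKER_ARRAY_CP_PRIMITIVE }; };

template<typename T>
struct pick
{ enum { value = selector<T, std::is_trivially_copyable<T>::value
                             + 2 * std::is_array<T>::value>::value }; };

static_assert(pick<float>::value    == PACKER_PRIMITIVE, "scalar stays primitive");
static_assert(pick<float[2]>::value == PACKER_ARRAY_CP_PRIMITIVE, "array goes to array packer");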
......@@ -228,7 +228,7 @@ template<unsigned int dim ,typename T> class Point
* \return the norm of the vector
*
*/
__device__ __host__ T norm()
__device__ __host__ T norm() const
{
T n = 0.0;
......
......@@ -14,6 +14,29 @@ using BlockTypeOf = typename std::remove_reference<typename boost::fusion::resul
template<typename AggregateT, unsigned int p>
using ScalarTypeOf = typename std::remove_reference<typename boost::fusion::result_of::at_c<typename AggregateT::type, p>::type>::type::scalarType;
template<typename T>
struct meta_copy_set_bck
{
template<typename destType>
inline static void set(destType & bP ,T & backgroundValue, int j)
{
bP[j] = backgroundValue;
}
};
template<unsigned int N, typename T>
struct meta_copy_set_bck<T[N]>
{
template<typename destType>
inline static void set(destType & bP ,T * backgroundValue, int j)
{
for (int i = 0 ; i < N ; i++)
{
bP[i][j] = backgroundValue[i];
}
}
};
template<typename AggregateBlockT, unsigned int threadBlockSize=128, typename indexT=long int, template<typename> class layout_base=memory_traits_inte>
class BlockMapGpu
{
......@@ -280,19 +303,21 @@ public:
* \tparam p property p
*
*/
template<unsigned int p>
void setBackgroundValue(ScalarTypeOf<AggregateBlockT, p> backgroundValue)
template<unsigned int p, typename TypeBck>
void setBackgroundValue(TypeBck backgroundValue)
{
// NOTE: Here we assume user only passes Blocks and not scalars in the templated aggregate type
typedef BlockTypeOf<AggregateInternalT, p> BlockT;
typedef typename std::remove_all_extents<BlockTypeOf<AggregateInternalT, p>>::type BlockT_noarr;
typedef BlockTypeOf<AggregateInternalT, pMask> BlockM;
BlockT bP;
BlockM bM;
for (unsigned int i = 0; i < BlockT::size; ++i)
for (unsigned int i = 0; i < BlockT_noarr::size; ++i)
{
bP[i] = backgroundValue;
meta_copy_set_bck<TypeBck>::set(bP,backgroundValue,i);
//meta_copy<TypeBck>::meta_copy_(backgroundValue,bP[][i]);
bM[i] = 0;
}
......
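With meta_copy_set_bck, setBackgroundValue can now accept array-valued properties as well as scalars: block storage is laid out as value[component][element], so an array background fills one row per component. A hedged usage sketch of the helper on its own (hypothetical block size of 64; this only exercises the helper added above, not the OpenFPM aggregate layout):

float scalar_block[64];        // one block of a scalar property
float vector_block[2][64];     // one block of a float[2] property
float bck_scalar    = 1.0f;
float bck_vector[2] = {1.0f, 2.0f};

for (int j = 0; j < 64; j++)
{
    meta_copy_set_bck<float>::set(scalar_block, bck_scalar, j);      // bP[j] = v
    meta_copy_set_bck<float[2]>::set(vector_block, bck_vector, j);   // bP[i][j] = v[i]
}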
......@@ -718,6 +718,8 @@ public:
typedef sparse_grid_gpu_index<self> base_key;
typedef indexT indexT_;
typedef decltype(std::declval<BMG>().toKernel().insertBlock(0)) insert_encap;
/*! \brief return the size of the grid
......@@ -1045,6 +1047,7 @@ private:
}
template <typename stencil, typename... Args>
void applyStencilInPlaceNoShared(const Box<dim,int> & box, StencilMode & mode,Args... args)
{
......@@ -1795,6 +1798,11 @@ public:
void setMemory()
{}
auto insertBlockFlush(size_t block) -> decltype(BMG::insertBlockFlush(block))
{
return BMG::insertBlockFlush(block);
}
/*! \brief Return the grid information object
*
* \return grid information object
......@@ -2628,6 +2636,28 @@ public:
applyStencils< SparseGridGpuKernels::stencil_func_conv2_b<dim,nLoop,prop_src1,prop_src2,prop_dst1,prop_dst2,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
}
/*! \brief Apply a free type convolution using blocks
*
*
*/
template<unsigned int prop_src1, unsigned int prop_src2, unsigned int prop_src3,
unsigned int prop_dst1 , unsigned int prop_dst2, unsigned int prop_dst3,
unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
void conv3_b(grid_key_dx<dim> start, grid_key_dx<dim> stop , lambda_f func, ArgsT ... args)
{
Box<dim,int> box;
for (int i = 0 ; i < dim ; i++)
{
box.setLow(i,start.get(i));
box.setHigh(i,stop.get(i));
}
constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value), (blockSize)>::value;
applyStencils< SparseGridGpuKernels::stencil_func_conv3_b<dim,nLoop,prop_src1,prop_src2,prop_src3,prop_dst1,prop_dst2,prop_dst3,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
}
/*! \brief Apply a free type convolution using blocks
*
*
......@@ -2753,12 +2783,37 @@ public:
typedef BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base> BMG;
auto block_data = this->insertBlockFlush(block_id);
auto block_data = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::insertBlockFlush(block_id);
block_data.template get<BMG::pMask>()[local_id] = 1;
return block_data.template get<p>()[local_id];
}
/*! \brief Insert the point on host side and flush directly
*
* First you have to move everything on host with deviceToHost, insertFlush and than move to GPU again
*
* \param grid point where to insert
*
* \return a reference to the data to fill
*
*
*/
template<typename CoordT>
auto insertBlockFlush(const grid_key_dx<dim,CoordT> &coord, indexT & local_id) -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::insertBlockFlush(0))
{
auto lin = gridGeometry.LinId(coord);
indexT block_id = lin / blockSize;
local_id = lin % blockSize;
typedef BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base> BMG;
auto block_data = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::insertBlockFlush(block_id);
block_data.template get<BMG::pMask>()[local_id] = 1;
return block_data;
}
/*! \brief Insert the point on host side and flush directly
*
* First you have to move everything on host with deviceToHost, insertFlush and than move to GPU again
......@@ -2779,7 +2834,7 @@ public:
typedef BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base> BMG;
auto block_data = this->insertBlockFlush(block_id);
auto block_data = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::insertBlockFlush(block_id);
block_data.template get<BMG::pMask>()[local_id] = 1;
return block_data.template get<p>()[local_id];
......@@ -2817,15 +2872,105 @@ public:
*
*/
template<unsigned int p>
void setBackgroundValue(ScalarTypeOf<AggregateBlockT, p> backgroundValue)
void setBackgroundValue(typename boost::mpl::at<typename AggregateT::type,boost::mpl::int_<p>>::type backgroundValue)
{
bck.template get<p>() = backgroundValue;
meta_copy<typename boost::mpl::at<typename AggregateT::type,boost::mpl::int_<p>>::type>::meta_copy_(backgroundValue,bck.template get<p>());
BMG::template setBackgroundValue<p>(backgroundValue);
BMG::template setBackgroundValue<p,typename boost::mpl::at<typename AggregateT::type,boost::mpl::int_<p>>::type>(backgroundValue);
}
/////////////////////////////////// DISTRIBUTED INTERFACE ///////////////////////
//Functions to check if the packing object is complex
static bool pack()
{
return true;
}
//Functions to check if the packing object is complex
static bool packRequest()
{
return true;
}
/*! \brief Asking to pack a SparseGrid GPU without GPU context pack the grid on CPU and host memory
*
*
*/
template<int ... prp> inline
void packRequest(size_t & req) const
{
// To fill
auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
auto & dataBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
indexBuffer.template packRequest<prp ...>(req);
dataBuffer.template packRequest<prp ...>(req);
Packer<decltype(gridGeometry),HeapMemory>::packRequest(req);
}
/*! \brief Pack the object into the memory
*
* \tparam prp properties to pack
*
* \param mem preallocated memory where to pack the objects
*
* \param sts pack statistic
*
*/
template<int ... prp> void pack(ExtPreAlloc<HeapMemory> & mem,
Pack_stat & sts) const
{
auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
auto & dataBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
// To fill
indexBuffer.template pack<prp ...>(mem,sts);
dataBuffer.template pack<prp ...>(mem,sts);
Packer<decltype(gridGeometry),HeapMemory>::pack(mem,gridGeometry,sts);
}
/*! \brief Unpack the object into the memory
*
* \tparam prp properties to pack
*
* \param mem preallocated memory where to pack the objects
*
* \param sts pack statistic
*
*/
template<int ... prp> void unpack(ExtPreAlloc<HeapMemory> & mem,
Unpack_stat & ps)
{
auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
auto & dataBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
// To fill
indexBuffer.template unpack<prp ...>(mem,ps);
dataBuffer.template unpack<prp ...>(mem,ps);
Unpacker<decltype(gridGeometry),HeapMemory>::unpack(mem,gridGeometry,ps);
}
/*! \brief Unpack the object into the memory
*
* \tparam prp properties to pack
*
* \param mem preallocated memory where to pack the objects
*
* \param sts pack statistic
*
*/
template<int ... prp> void unpack(ExtPreAlloc<CudaMemory> & mem,
Unpack_stat & ps)
{
if (mem.size() != 0)
{std::cout << __FILE__ << ":" << __LINE__ << " not implemented: " << std::endl;}
}
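A hedged host-side sketch of how the pack/unpack path added above might be driven. The property list <0,1>, the grid objects and the exact ExtPreAlloc allocation protocol are assumptions based on how other OpenFPM containers are packed; only the packRequest/pack/unpack calls themselves come from the diff:

size_t req = 0;
sgrid_src.template packRequest<0,1>(req);        // measure the required bytes

HeapMemory pmem;
pmem.allocate(req);
ExtPreAlloc<HeapMemory> mem(req, pmem);          // preallocated packing buffer
mem.incRef();

Pack_stat sts;
sgrid_src.template pack<0,1>(mem, sts);          // index buffer + data buffer + geometry

Unpack_stat ps;
sgrid_dst.template unpack<0,1>(mem, ps);         // rebuild into another sparse grid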
/*! \brief memory requested to pack this object
*
* \param req request
......@@ -3253,6 +3398,8 @@ public:
*/
void swap(self & gr)
{
gridGeometry.swap(gr.gridGeometry);
BMG::swap(gr);
}
......@@ -3422,6 +3569,7 @@ public:
result.allocate(sizeof(int));
if (pointers.size())
CUDA_LAUNCH_DIM3((SparseGridGpuKernels::unpack_headers<decltype(std::declval<self>().toKernel())>),1,pointers.size(),
pointers.toKernel(),
headers.toKernel(),
......
......@@ -158,6 +158,19 @@ namespace SparseGridGpuKernels
{
f(res1,res2,cpb1,cpb2,DataBlockLoad,offset,coord[0],coord[1],coord[2]);
}
template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
__device__ static inline void stencil3_block(ScalarT & res1, ScalarT & res2, ScalarT & res3, coordType & coord ,
CpBlockType & cpb1,
CpBlockType & cpb2,
CpBlockType & cpb3,
DataBlockWrapperT & DataBlockLoad,
int offset,
lambda_func f,
ArgsT ... args)
{
f(res1,res2,res3,cpb1,cpb2,cpb3,DataBlockLoad,offset,coord[0],coord[1],coord[2]);
}
};
template<>
......@@ -204,6 +217,19 @@ namespace SparseGridGpuKernels
{
f(res1,res2,cpb1,cpb2,DataBlockLoad,offset,coord[0],coord[1]);
}
template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
__device__ static inline void stencil3_block(ScalarT & res1, ScalarT & res2, ScalarT & res3, coordType & coord ,
CpBlockType & cpb1,
CpBlockType & cpb2,
CpBlockType & cpb3,
DataBlockWrapperT & DataBlockLoad,
int offset,
lambda_func f,
ArgsT ... args)
{
f(res1,res2,res3,cpb1,cpb2,cpb3,DataBlockLoad,offset,coord[0],coord[1]);
}
};
template<unsigned int dim, unsigned int n_loop, unsigned int p_src, unsigned int p_dst, unsigned int stencil_size>
......@@ -424,6 +450,86 @@ namespace SparseGridGpuKernels
}
};
template<unsigned int dim, unsigned int n_loop,
unsigned int p_src1, unsigned int p_src2, unsigned int p_src3,
unsigned int p_dst1, unsigned int p_dst2, unsigned int p_dst3,
unsigned int stencil_size>
struct stencil_func_conv3_b
{
typedef NNStar<dim> stencil_type;
static constexpr unsigned int supportRadius = stencil_size;
template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT>
static inline __device__ void stencil(
SparseGridT & sparseGrid,
const unsigned int dataBlockId,
openfpm::sparse_index<unsigned int> dataBlockIdPos,
unsigned int offset,
grid_key_dx<dim, int> & pointCoord,
DataBlockWrapperT & dataBlockLoad,
DataBlockWrapperT & dataBlockStore,
unsigned char curMask,
lambda_func f,
ArgT ... args)
{
typedef typename SparseGridT::AggregateBlockType AggregateT;
typedef ScalarTypeOf<AggregateT, p_src1> ScalarT1;
typedef ScalarTypeOf<AggregateT, p_src1> ScalarT2;
typedef ScalarTypeOf<AggregateT, p_src1> ScalarT3;
constexpr unsigned int enlargedBlockSize = IntPow<
SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
__shared__ ScalarT1 enlargedBlock1[enlargedBlockSize];
__shared__ ScalarT2 enlargedBlock2[enlargedBlockSize];
__shared__ ScalarT3 enlargedBlock3[enlargedBlockSize];
// fill with background
typedef typename vmpl_create_constant<dim,SparseGridT::blockEdgeSize_>::type block_sizes;
typedef typename vmpl_sum_constant<2*stencil_size,block_sizes>::type vmpl_sizes;
cp_block<ScalarT1,stencil_size,vmpl_sizes,dim> cpb1(enlargedBlock1);
cp_block<ScalarT2,stencil_size,vmpl_sizes,dim> cpb2(enlargedBlock2);
cp_block<ScalarT3,stencil_size,vmpl_sizes,dim> cpb3(enlargedBlock3);
sparseGrid.template loadGhostBlock<p_src1>(dataBlockLoad, dataBlockIdPos, enlargedBlock1);
sparseGrid.template loadGhostBlock<p_src2>(dataBlockLoad, dataBlockIdPos, enlargedBlock2);
sparseGrid.template loadGhostBlock<p_src3>(dataBlockLoad, dataBlockIdPos, enlargedBlock3);
__syncthreads();
ScalarT1 res1 = 0;
ScalarT2 res2 = 0;
ScalarT3 res3 = 0;
if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
{
int coord[dim];
unsigned int linIdTmp = offset;
for (unsigned int d = 0; d < dim; ++d)
{
coord[d] = linIdTmp % SparseGridT::blockEdgeSize_;
linIdTmp /= SparseGridT::blockEdgeSize_;
}
stencil_conv_func_impl<dim>::stencil3_block(res1,res2,res3,coord,cpb1,cpb2,cpb3,dataBlockLoad,offset,f,args...);
dataBlockStore.template get<p_dst1>()[offset] = res1;
dataBlockStore.template get<p_dst2>()[offset] = res2;
dataBlockStore.template get<p_dst3>()[offset] = res3;
}
}
template <typename SparseGridT, typename CtxT>
static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
{
// No flush
}
};
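Together with the conv3_b entry point added earlier in SparseGridGpu, the stencil above applies a user lambda to three source properties at once; the lambda receives three result references, three ghost-padded block views, the loaded data block and the per-thread offset/coordinates, in the order of the stencil3_block call. A hedged sketch of what a caller might look like (the property indices, float scalar type and the cpb(i,j,k) access are assumptions, not taken from the diff):

// Hypothetical 3D example: copy source properties 0,1,2 into destinations 3,4,5
// with a stencil radius of 1.
auto three_field_lambda = [] __device__ (float & res1, float & res2, float & res3,
                                         auto & cpb1, auto & cpb2, auto & cpb3,
                                         auto & block, int offset, int i, int j, int k)
{
    // Assumption: the cp_block views expose operator()(i,j,k) over the padded block.
    res1 = cpb1(i,j,k);
    res2 = cpb2(i,j,k);
    res3 = cpb3(i,j,k);
};

// grid.template conv3_b<0,1,2,3,4,5,1>(start, stop, three_field_lambda);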
template<unsigned int dim, unsigned int n_loop, unsigned int p_src1, unsigned int p_src2, unsigned int p_dst1, unsigned int p_dst2, unsigned int stencil_size>
struct stencil_func_conv2
{
......@@ -966,7 +1072,7 @@ namespace SparseGridGpuKernels
auto dataBlockLoad = dataBuffer.get(dataBlockPos); // Avoid binary searches as much as possible
// todo: Add management of RED-BLACK stencil application! :)
const unsigned int dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
const auto dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
grid_key_dx<dim, int> pointCoord = sparseGrid.getCoord(dataBlockId * blockSize + offset);
unsigned char curMask;
......
......@@ -172,6 +172,11 @@ namespace openfpm
return v_size;
}
__host__ __device__ size_t size_local() const
{
return size();
}
/*! \brief return the maximum capacity of the vector before reallocation
*
* \return the capacity of the vector
......@@ -204,6 +209,38 @@ namespace openfpm
return base.template get<p>(key);
}
/*! \brief Get an element of the vector
*
* Get an element of the vector
*
* \tparam p Property to get
* \param id Element to get
*
* \return the element value requested
*
*/
template <unsigned int p>
__device__ __host__ inline auto getProp(unsigned int id) const -> decltype(base.template get<p>(grid_key_dx<1>(0)))
{
return this->get<p>(id);
}
/*! \brief Get an element of the vector
*
* Get an element of the vector
*
* \tparam p Property to get
* \param id Element to get
*
* \return the element value requested
*
*/
template <unsigned int p, typename key_type>
__device__ __host__ inline auto getProp(key_type id) const -> decltype(base.template get<p>(grid_key_dx<1>(0)))
{
return this->get<p>(id.getKey());
}
/*! \brief Get an element of the vector
*
......@@ -488,7 +525,22 @@ namespace openfpm
return base.getGPUIterator(start,stop,n_thr);
}
/*! \brief Get a domain iterator for the GPU
*
*
*/
ite_gpu<1> getDomainIteratorGPU(size_t n_thr = default_kernel_wg_threads_) const
{
return getGPUIterator(n_thr);
}
//Stub for some expression
void init() const {}
__host__ __device__ auto value(unsigned int p) -> decltype(base.template get<0>(grid_key_dx<1>(0)))
{
return get<0>(p);
}
/*! \brief Get an iterator for the GPU
*
*
......@@ -501,6 +553,7 @@ namespace openfpm
return base.getGPUIterator(start,stop_,n_thr);
}
/*! \brief operator= this operator absorb the pointers, consider that this object wrap device pointers
*
* \param object to copy
......@@ -514,6 +567,16 @@ namespace openfpm
return *this;
}
__device__ __host__ vector_gpu_ker<T,layout_base> & getVector()
{
return *this;
}
__device__ __host__ const vector_gpu_ker<T,layout_base> & getVector() const
{
return *this;
}
/*! \brief Return the base
*
* \return the base
......@@ -596,6 +659,11 @@ namespace openfpm
return vref.size();
}
__host__ __device__ size_t size_local() const
{
return size();
}
__device__ __host__ unsigned int capacity() const
{
return vref.capacity;
......@@ -692,6 +760,16 @@ namespace openfpm
return vref.getGPUItertatorTo(stop,n_thr);
}
vector_gpu_ker<T,layout_base> & getVector()
{
return *this;
}
const vector_gpu_ker<T,layout_base> & getVector() const
{
return *this;
}
__host__ vector_gpu_ker_ref<T,layout_base> & operator=(const vector_gpu_ker<T,layout_base> & v)
{
vref.operator=(v);
......
......@@ -336,7 +336,7 @@ namespace openfpm
*
* \return the size
*
*/
*/ //remove host device
size_t size_local() const
{
return v_size;
......@@ -1359,7 +1359,21 @@ namespace openfpm
return base.get_o(key);
}
/*! \brief Get an element of the vector
*
* Get an element of the vector
*
* \tparam p Property to get
* \param id Element to get
*
* \return the element value requested
*
*/ //remove device host
template <unsigned int p>
inline auto getProp(const unsigned int & id) -> decltype(base.template get<p>(grid_key_dx<1>(0)))
{ //uncomment this
return this->template get<p>(id);
}
/*! \brief Get an element of the vector
*
* Get an element of the vector
......@@ -1369,12 +1383,12 @@ namespace openfpm
*
* \return the element value requested
*
*/
template <unsigned int p,typename KeyType>
inline auto getProp(const KeyType & id) -> decltype(base.template get<p>(grid_key_dx<1>(0)))
*///remove host device
template <unsigned int p,typename KeyType>
inline auto getProp(const KeyType & id) -> decltype(base.template get<p>(grid_key_dx<1>(0)))
{
return this->template get<p>(id.getKey());
//uncomment this
return this->template get<p>(id.getKey());
}
/*! \brief Get an element of the vector
......@@ -1386,10 +1400,10 @@ namespace openfpm
*
* \return the element value requested
*
*/
*/ //remove device host
template <unsigned int p, typename keyType>
inline auto getProp(const keyType & id) const -> decltype(base.template get<p>(grid_key_dx<1>(0)))
{
inline auto getProp(const keyType & id) const -> decltype(base.template get<p>(grid_key_dx<1>(0)))
{ //uncomment this
return this->template get<p>(id.getKey());
}
......@@ -1930,6 +1944,7 @@ namespace openfpm
return base.getGPUIterator(start,stop_,n_thr);
}
#endif
/*! \brief Get the vector elements iterator
......@@ -1987,6 +2002,14 @@ namespace openfpm
return base.getGPUIterator(start,stop,n_thr);
}
/*! \brief Get a domain iterator for the GPU
*
*
*/
ite_gpu<1> getDomainIteratorGPU(size_t n_thr = default_kernel_wg_threads_) const
{
return getGPUIterator(n_thr);
}
#endif
/*! \brief Return the size of the message needed to pack this object
......@@ -2071,6 +2094,19 @@ namespace openfpm
return 1;
}
/*! \brief Return the memory object
*
* Return the memory object
*
* \tparam p array to retrieve
*
*/
template<unsigned int p>
auto getMemory() -> decltype(base.template getMemory<p>())
{
return base.template getMemory<p>();
}
/*! \brief Set the memory of the base structure using an object
*
* \param mem Memory object to use for allocation
......
......@@ -1016,6 +1016,18 @@ public:
return base_type::get(id);
}
/*! \brief Get an element of the vector
*
* \param id element to get
*
* \return the element reference
*
*/
inline T & operator[](size_t id)
{
return base_type::get(id);
}
/*! \brief Get an element of the vector
*
* \param id element to get
......
......@@ -339,7 +339,7 @@ template<int ... prp> inline void pack(ExtPreAlloc<HeapMemory> & mem, Pack_stat
* \param mem preallocated memory from where to unpack the vector
* \param ps unpack-stat info
*/
template<int ... prp> inline void unpack(ExtPreAlloc<HeapMemory> & mem, Unpack_stat & ps)
template<int ... prp, typename MemType> inline void unpack(ExtPreAlloc<MemType> & mem, Unpack_stat & ps)
{
//if all of the aggregate properties are simple (don't have "pack()" member)
if (has_pack_agg<T,prp...>::result::value == false)
......@@ -353,7 +353,7 @@ template<int ... prp> inline void unpack(ExtPreAlloc<HeapMemory> & mem, Unpack_s
{
//Unpack a size of a source vector
size_t u2 = 0;
Unpacker<size_t, HeapMemory>::unpack(mem,u2,ps);
Unpacker<size_t, MemType>::unpack(mem,u2,ps);
//Resize a destination vector
this->resize(u2);
......@@ -361,7 +361,7 @@ template<int ... prp> inline void unpack(ExtPreAlloc<HeapMemory> & mem, Unpack_s
for (size_t i = 0 ; i < this->size() ; i++)
{
//Call an unpacker in nested way
call_aggregateUnpack<decltype(this->get(i)),HeapMemory,prp ... >::call_unpack(this->get(i),mem,ps);
call_aggregateUnpack<decltype(this->get(i)),MemType,prp ... >::call_unpack(this->get(i),mem,ps);
}
}
}
......
......@@ -24,11 +24,14 @@
#define __CUDACC__
#define __CUDA__
#else
#include "util/cuda/moderngpu/kernel_merge.hxx"
#include <thrust/merge.h>
#include <thrust/execution_policy.h>
#endif
#endif
#else
#include "util/cuda/moderngpu/kernel_merge.hxx"
#include <thrust/merge.h>
#include <thrust/execution_policy.h>
// #include "util/cuda/moderngpu/kernel_merge.hxx"
#endif
#include "util/cuda/ofp_context.hxx"
......@@ -98,7 +101,14 @@
#else
mgpu::merge(a_keys,a_vals,a_count,b_keys,b_vals,b_count,c_keys,c_vals,comp,context);
// It seems broken on some CUDA on some hardware. Anyway is not anymore supported
// on some hardware ... we move to thrust
// mgpu::merge(a_keys,a_vals,a_count,b_keys,b_vals,b_count,c_keys,c_vals,comp,context);
thrust::merge_by_key(thrust::device, a_keys,a_keys + a_count,
b_keys,b_keys + b_count,
a_vals,b_vals,
c_keys,c_vals,comp);
#endif
......
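The moderngpu mgpu::merge call is replaced with thrust::merge_by_key, since moderngpu is no longer supported on newer hardware and reportedly misbehaves on some CUDA versions. A self-contained host-side illustration of the same call (the real code runs with thrust::device on raw device pointers):

#include <thrust/merge.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>

int main()
{
    int a_keys[3] = {1, 3, 5};   int a_vals[3] = {10, 30, 50};
    int b_keys[2] = {2, 4};      int b_vals[2] = {20, 40};
    int c_keys[5];               int c_vals[5];

    thrust::merge_by_key(thrust::host,
                         a_keys, a_keys + 3,
                         b_keys, b_keys + 2,
                         a_vals, b_vals,
                         c_keys, c_vals,
                         thrust::less<int>());
    // c_keys = {1,2,3,4,5}, c_vals = {10,20,30,40,50}
    return 0;
}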
......@@ -27,7 +27,8 @@
#else
// Here we have old CUDA
#include "cub_old/cub.cuh"
#include "util/cuda/moderngpu/kernel_reduce.hxx"
//#include "util/cuda/moderngpu/kernel_reduce.hxx"
#define REDUCE_WITH_CUB
#endif
#include "util/cuda/ofp_context.hxx"
......@@ -98,4 +99,4 @@ namespace openfpm
#endif
#endif /* REDUCE_OFP_HPP_ */
\ No newline at end of file
#endif /* REDUCE_OFP_HPP_ */