Commit 7eb370cf authored by incardon

Small additions for GPU optimizations

parent e7f888c1
@@ -30,7 +30,69 @@
* \tparam e_dst encap destination
*
*/
template<typename e_src, typename e_dst, unsigned int ... prp>
struct copy_cpu_encap_encap_prp
{
//! encapsulated source object
const e_src & src;
//! encapsulated destination object
e_dst & dst;
typedef typename to_boost_vmpl<prp...>::type vprp;
/*! \brief constructor
*
* \param src source encapsulated object
* \param dst destination encapsulated object
*
*/
__device__ __host__ inline copy_cpu_encap_encap_prp(const e_src & src, e_dst & dst)
:src(src),dst(dst)
{
#ifdef SE_CLASS1
// e_src and e_dst must have the same number of properties
if (e_src::T_type::max_prop != e_dst::T_type::max_prop)
std::cerr << "Error " << __FILE__ << ":" << __LINE__ << " the number of properties between src and dst must match" << std::endl;
#endif
};
#ifdef SE_CLASS1
/*! \brief Constructor
*
* Calling this constructor produces an error. This class stores a reference to the object,
* which means the object passed must not be a temporary object
*
*/
inline copy_cpu_encap_encap_prp(const e_src && src, const e_dst && dst)
:src(src),dst(dst)
{std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " Passing a temporary object" << std::endl;};
#endif
//! It calls the copy function for each property
template<typename T>
__device__ __host__ inline void operator()(T& t) const
{
typedef typename boost::mpl::at<vprp,T>::type prp_cp;
// Remove the reference from the type to copy
typedef typename boost::remove_reference<decltype(dst.template get<prp_cp::value>())>::type copy_rtype;
meta_copy<copy_rtype>::meta_copy_(src.template get<prp_cp::value>(),dst.template get<prp_cp::value>());
}
};
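For orientation, here is a minimal standalone sketch of the same selective-copy pattern, with std::tuple standing in for the encap objects and plain boost::mpl::for_each standing in for the project's for_each_ref; copy_selected and props are illustrative names, not part of this commit:

#include <tuple>
#include <iostream>
#include <boost/mpl/for_each.hpp>
#include <boost/mpl/range_c.hpp>
#include <boost/mpl/vector_c.hpp>
#include <boost/mpl/at.hpp>

// toy stand-in for an encap object: three "properties" in a tuple
typedef std::tuple<float,double,int> props;

// same shape as copy_cpu_encap_encap_prp: prp... lists the properties to
// copy, operator() receives boost::mpl::int_<k> for k in [0,sizeof...(prp))
template<unsigned int ... prp>
struct copy_selected
{
	const props & src;
	props & dst;

	typedef boost::mpl::vector_c<unsigned int,prp...> vprp;

	copy_selected(const props & src, props & dst)
	:src(src),dst(dst)
	{}

	template<typename T>
	void operator()(T&) const
	{
		// map the iteration index T into the list of selected properties
		typedef typename boost::mpl::at<vprp,T>::type prp_cp;
		std::get<prp_cp::value>(dst) = std::get<prp_cp::value>(src);
	}
};

int main()
{
	props a{1.0f,2.0,3};
	props b{0.0f,0.0,0};

	// copy only properties 0 and 2
	copy_selected<0,2> cp(a,b);
	boost::mpl::for_each<boost::mpl::range_c<int,0,2>>(cp);

	// prints "1 0 3": property 1 was left untouched
	std::cout << std::get<0>(b) << " " << std::get<1>(b) << " " << std::get<2>(b) << std::endl;
}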
/*! \brief this class is a functor for "for_each" algorithm
*
* This class is a functor for the "for_each" algorithm. For each
* element of the boost::vector the operator() is called.
* It is mainly used to copy one encap object into another encap object
*
* \tparam e_src encap source
* \tparam e_dst encap destination
*
*/
template<typename e_src, typename e_dst>
struct copy_cpu_encap_encap
{
......
@@ -216,6 +216,13 @@ struct grid_gpu_ker
this->get_o(key1) = g.get_o(key2);
}
template<unsigned int ... prp> __device__ inline void set(const grid_key_dx<dim> & key1,const grid_gpu_ker<dim,T_,layout_base> & g, const grid_key_dx<dim> & key2)
{
copy_cpu_encap_encap_prp<decltype(this->get_o(key1)),decltype(g.get_o(key2)),prp...> ec(this->get_o(key1),g.get_o(key2));
boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(ec);
}
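A hypothetical call site (grid and key names are illustrative): copying only properties 0 and 2 of the element at key2 in g2 into the element at key1 of g1 would read

// properties not listed (here: property 1) are left untouched in g1
g1.template set<0,2>(key1,g2,key2);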
/*! \brief set an element of the grid
*
* set an element of the grid
......
@@ -497,6 +497,7 @@ void test_reorder_parts(size_t n_part)
openfpm::vector<aggregate<cnt_type>,CudaMemory,typename memory_traits_inte<aggregate<cnt_type>>::type,memory_traits_inte> cells_out;
openfpm::vector<aggregate<cnt_type>,CudaMemory,typename memory_traits_inte<aggregate<cnt_type>>::type,memory_traits_inte> starts;
openfpm::vector<aggregate<cnt_type>,CudaMemory,typename memory_traits_inte<aggregate<cnt_type>>::type,memory_traits_inte> sort_to_not_sort;
openfpm::vector<aggregate<cnt_type>,CudaMemory,typename memory_traits_inte<aggregate<cnt_type>>::type,memory_traits_inte> non_sort_to_sort;
openfpm::vector<aggregate<ids_type[dim+1]>,CudaMemory,typename memory_traits_inte<aggregate<ids_type[dim+1]>>::type,memory_traits_inte> part_ids;
openfpm::vector<aggregate<float,float,float[3],float[3][3]>,CudaMemory,typename memory_traits_inte<aggregate<float,float,float[3],float[3][3]>>::type,memory_traits_inte> parts_prp;
@@ -531,6 +532,7 @@ void test_reorder_parts(size_t n_part)
parts_prp_out.resize(n_part);
pl_out.resize(n_part);
sort_to_not_sort.resize(n_part);
non_sort_to_sort.resize(n_part);
auto p_it = parts_prp.getIterator();
while (p_it.isNext())
@@ -581,6 +583,7 @@ void test_reorder_parts(size_t n_part)
pl.toKernel(),
pl_out.toKernel(),
sort_to_not_sort.toKernel(),
non_sort_to_sort.toKernel(),
static_cast<cnt_type *>(cells_out.template getDeviceBuffer<0>()));
bool check = true;
......
@@ -23,7 +23,7 @@ constexpr int count = 0;
constexpr int start = 1;
template<unsigned int dim, typename T, typename Memory, typename transform = no_transform_only<dim,T>, typename cnt_type = unsigned int, typename ids_type = unsigned char>
template<unsigned int dim, typename T, typename Memory, typename transform = no_transform_only<dim,T>, typename cnt_type = unsigned int, typename ids_type = unsigned short>
class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
{
typedef openfpm::vector<aggregate<cnt_type>,Memory,typename memory_traits_inte<aggregate<cnt_type>>::type,memory_traits_inte> vector_cnt_type;
@@ -43,6 +43,9 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
//! \brief for each sorted index it shows the corresponding index in the unsorted vector
vector_cnt_type sorted_to_not_sorted;
//! \brief for each unsorted index it shows the corresponding index in the sorted vector
vector_cnt_type non_sorted_to_sorted;
//! Spacing
openfpm::array<T,dim,cnt_type> spacing_c;
@@ -161,6 +164,11 @@ public:
return sorted_to_not_sorted;
}
/*! \brief Return the map from the unsorted index to the sorted one
*
* \return the unsorted-to-sorted index vector
*
*/
vector_cnt_type & getNonSortedToSorted()
{
return non_sorted_to_sorted;
}
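The two maps are inverse permutations of one another; an illustrative host-side sketch of that invariant (assuming <cassert> is included and both vectors have already been copied back from the device, which is omitted here):

// illustrative only: sorted_to_not_sorted and non_sorted_to_sorted are
// inverse permutations of each other
for (size_t s = 0 ; s < sorted_to_not_sorted.size() ; s++)
{
	auto u = sorted_to_not_sorted.template get<0>(s);      // sorted -> unsorted
	assert(non_sorted_to_sorted.template get<0>(u) == s);  // unsorted -> sorted
}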
/*! \brief construct from a list of particles
*
* \warning pl is assumed to already be in device memory
@@ -211,6 +219,7 @@ public:
static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()) );
sorted_to_not_sorted.resize(pl.size());
non_sorted_to_sorted.resize(pl.size());
auto ite = pl.getGPUIterator();
// Here we test fill cell
@@ -223,6 +232,7 @@ public:
pl.toKernel(),
pl_out.toKernel(),
sorted_to_not_sorted.toKernel(),
non_sorted_to_sorted.toKernel(),
static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()));
......
@@ -59,6 +59,38 @@ class NN_gpu_it
}
}
__device__ void SelectValid_debug()
{
while (p_id >= starts.template get<0>(c_id+1) && isNext())
{
// the current cell is exhausted: advance to the next cell in the box
cnt_type id = cell_act.get(0);
cell_act.set_d(0,id+1);
//! check the overflow of all the indexes with the exception of the last dimension
int i = 0;
for ( ; i < dim-1 ; i++)
{
size_t id_act = cell_act.get(i);
if ((int)id_act > cell_stop.get(i))
{
//! overflow: reset this index and carry into the next dimension
cell_act.set_d(i,cell_start.get(i));
id_act = cell_act.get(i+1);
cell_act.set_d(i+1,id_act+1);
}
else
{
break;
}
}
c_id = cid_<dim,cnt_type,ids_type,int>::get_cid(div_c,cell_act);
p_id = starts.template get<0>(c_id);
}
}
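The carry loop above is an odometer-style advance over a dim-dimensional cell box; a standalone sketch of the same scheme in plain C++ (advance and the array names are illustrative):

#include <array>
#include <cstdio>

constexpr int dim = 2;

// increment act inside the inclusive box [start,stop], carrying overflow
// from each dimension into the next, as SelectValid_debug does above
void advance(std::array<int,dim> & act,
             const std::array<int,dim> & start,
             const std::array<int,dim> & stop)
{
	act[0]++;
	for (int i = 0 ; i < dim-1 ; i++)
	{
		if (act[i] > stop[i])
		{
			act[i] = start[i];   // reset this index ...
			act[i+1]++;          // ... and carry into the next dimension
		}
		else
			break;
	}
}

int main()
{
	std::array<int,dim> act = {{0,0}};
	std::array<int,dim> start = {{0,0}};
	std::array<int,dim> stop = {{2,2}};

	// visits (0,0) (1,0) (2,0) (0,1) ... (2,2) and then stops, mirroring
	// the isNext() condition on the last dimension
	while (act[dim-1] <= stop[dim-1])
	{
		std::printf("(%d,%d)\n", act[0], act[1]);
		advance(act,start,stop);
	}
}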
public:
__device__ NN_gpu_it(const grid_key_dx<dim,ids_type> & cell_pos,
@@ -102,6 +134,25 @@ public:
return *this;
}
//! Like operator++, but advance using the checked SelectValid_debug()
__device__ NN_gpu_it<dim,cnt_type,ids_type> & plusplus()
{
++p_id;
SelectValid_debug();
return *this;
}
//! Return the index of the first particle of the cell ce_id
__device__ cnt_type get_start(unsigned int ce_id)
{
return starts.template get<0>(ce_id);
}
//! Return the id of the cell the iterator currently points to
__device__ cnt_type get_cid()
{
return c_id;
}
__device__ bool isNext()
{
return cell_act.get(dim-1) <= cell_stop.get(dim-1);
......
@@ -275,6 +275,7 @@ __global__ void reorder_parts(int n,
const vector_pos input_pos,
vector_pos output_pos,
vector_ns sorted_non_sorted,
vector_ns non_sorted_to_sorted,
const cnt_type * cells)
{
cnt_type i = threadIdx.x + blockIdx.x * blockDim.x;
@@ -285,6 +286,7 @@
reorder(input_pos,output_pos,code,i);
sorted_non_sorted.template get<0>(i) = code;
non_sorted_to_sorted.template get<0>(code) = i;
}
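Stripped of the OpenFPM vector wrappers, the two-way bookkeeping added above amounts to the following plain-CUDA sketch (build_maps and its parameter names are stand-ins, not part of this commit):

// forward[i] holds the original (unsorted) index of the particle the sort
// placed at slot i; the inverse map follows from one scattered write
__global__ void build_maps(unsigned int n,
                           const unsigned int * forward,
                           unsigned int * sorted_to_unsorted,
                           unsigned int * unsorted_to_sorted)
{
	unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
	if (i >= n) return;

	unsigned int code = forward[i];
	sorted_to_unsorted[i] = code;       // sorted -> unsorted
	unsorted_to_sorted[code] = i;       // unsorted -> sorted
}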
template<typename T>
......