Commit d2186fe2 authored by incardon

Adding subset

parent c5d76187
Pipeline #3649 passed with stages in 11 minutes and 22 seconds
@@ -83,17 +83,16 @@ void test_sub_index()
no_transform_only<dim,T> t(mt,pt);
CUDA_LAUNCH_DIM3((subindex<false,dim,T,cnt_type,ids_type,no_transform_only<dim,T>>),ite.wthr,ite.thr,div,
spacing,
off,
t,
pl.capacity(),
pl.size(),
part_ids.capacity(),
0,
static_cast<T *>(pl.template getDeviceBuffer<0>()),
static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()),
static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()));
pl.toKernel(),
cl_n.toKernel(),
part_ids.toKernel());
cl_n.template deviceToHost<0>();
part_ids.template deviceToHost<0>();
@@ -203,17 +202,16 @@ void test_sub_index2()
shift_only<dim,T> t(mt,pt);
CUDA_LAUNCH_DIM3((subindex<false,dim,T,cnt_type,ids_type,shift_only<dim,T>>),ite.wthr,ite.thr,div,
spacing,
off,
t,
pl.capacity(),
pl.size(),
part_ids.capacity(),
0,
static_cast<T *>(pl.template getDeviceBuffer<0>()),
static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()),
static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()));
pl.toKernel(),
cl_n.toKernel(),
part_ids.toKernel());
cl_n.template deviceToHost<0>();
part_ids.template deviceToHost<0>();
@@ -387,11 +385,10 @@ void test_fill_cell()
div_c,
off,
part_ids.size(),
part_ids.capacity(),
0,
static_cast<cnt_type *>(starts.template getDeviceBuffer<0>()),
static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()),
static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()) );
starts.toKernel(),
part_ids.toKernel(),
cells.toKernel() );
cells.template deviceToHost<0>();
@@ -666,7 +663,8 @@ void test_reorder_parts(size_t n_part)
// Here we test fill cell
CUDA_LAUNCH_DIM3((reorder_parts<decltype(parts_prp.toKernel()),
decltype(pl.toKernel()),
decltype(sort_to_not_sort.toKernel()),
decltype(cells_out.toKernel()),
cnt_type,
shift_ph<0,cnt_type>>),ite.wthr,ite.thr,pl.size(),
parts_prp.toKernel(),
@@ -675,7 +673,7 @@ void test_reorder_parts(size_t n_part)
pl_out.toKernel(),
sort_to_not_sort.toKernel(),
non_sort_to_sort.toKernel(),
static_cast<cnt_type *>(cells_out.template getDeviceBuffer<0>()));
cells_out.toKernel());
bool check = true;
parts_prp_out.template deviceToHost<0>();
......
@@ -261,13 +261,11 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
spacing_c,
off,
this->getTransform(),
pl.capacity(),
pl.size(),
part_ids.capacity(),
start,
static_cast<T *>(pl.template getDeviceBuffer<0>()),
static_cast<cnt_type *>(starts.template getDeviceBuffer<0>()),
static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()));
pl.toKernel(),
starts.toKernel(),
part_ids.toKernel());
// now we construct the cells
@@ -306,6 +304,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
CUDA_LAUNCH((reorder_parts<decltype(pl_prp.toKernel()),
decltype(pl.toKernel()),
decltype(sorted_to_not_sorted.toKernel()),
decltype(cells.toKernel()),
cnt_type,shift_ph<0,cnt_type>>),ite,sorted_to_not_sorted.size(),
pl_prp.toKernel(),
pl_prp_out.toKernel(),
@@ -313,7 +312,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
pl_out.toKernel(),
sorted_to_not_sorted.toKernel(),
non_sorted_to_sorted.toKernel(),
static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()));
cells.toKernel());
if (opt == cl_construct_opt::Full)
{
@@ -391,13 +390,11 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
spacing_c,
off,
this->getTransform(),
pl.capacity(),
stop,
part_ids.capacity(),
start,
static_cast<T *>(pl.template getDeviceBuffer<0>()),
static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()),
static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()));
pl.toKernel(),
cl_n.toKernel(),
part_ids.toKernel());
// now we scan
starts.resize(cl_n.size());
@@ -413,7 +410,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
CUDA_LAUNCH((fill_cells<dim,cnt_type,ids_type,shift_ph<0,cnt_type>>),itgg,0,
part_ids.size(),
static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()) );
cells.toKernel() );
// sort
@@ -422,14 +419,13 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
#else
CUDA_LAUNCH((fill_cells<dim,cnt_type,ids_type,shift_ph<0,cnt_type>>),itgg,0,
div_c,
off,
part_ids.size(),
part_ids.capacity(),
start,
static_cast<cnt_type *>(starts.template getDeviceBuffer<0>()),
static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()),
static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()) );
div_c,
off,
part_ids.size(),
start,
starts.toKernel(),
part_ids.toKernel(),
cells.toKernel() );
#endif
@@ -445,6 +441,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
CUDA_LAUNCH((reorder_parts<decltype(pl_prp.toKernel()),
decltype(pl.toKernel()),
decltype(sorted_to_not_sorted.toKernel()),
decltype(cells.toKernel()),
cnt_type,shift_ph<0,cnt_type>>),ite,sorted_to_not_sorted.size(),
pl_prp.toKernel(),
pl_prp_out.toKernel(),
@@ -452,7 +449,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
pl_out.toKernel(),
sorted_to_not_sorted.toKernel(),
non_sorted_to_sorted.toKernel(),
static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()));
cells.toKernel());
}
else
{
@@ -460,6 +457,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
CUDA_LAUNCH((reorder_parts_wprp<decltype(pl_prp.toKernel()),
decltype(pl.toKernel()),
decltype(sorted_to_not_sorted.toKernel()),
decltype(cells.toKernel()),
cnt_type,shift_ph<0,cnt_type>,prp...>),ite,sorted_to_not_sorted.size(),
pl_prp.toKernel(),
pl_prp_out.toKernel(),
@@ -467,7 +465,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
pl_out.toKernel(),
sorted_to_not_sorted.toKernel(),
non_sorted_to_sorted.toKernel(),
static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()));
cells.toKernel());
}
if (opt == cl_construct_opt::Full)
......
@@ -225,18 +225,17 @@ __device__ __host__ cnt_type encode_phase_id(cnt_type ph_id,cnt_type pid)
#ifdef __NVCC__
template<bool is_sparse,unsigned int dim, typename pos_type,
typename cnt_type, typename ids_type, typename transform>
typename cnt_type, typename ids_type, typename transform,
typename vector_pos_type, typename vector_cnt_type, typename vector_pids_type>
__global__ void subindex(openfpm::array<ids_type,dim,cnt_type> div,
openfpm::array<pos_type,dim,cnt_type> spacing,
openfpm::array<ids_type,dim,cnt_type> off,
transform t,
int n_cap,
int n_part,
int n_cap2,
cnt_type start,
pos_type * p_pos,
cnt_type *counts,
cnt_type * p_ids)
vector_pos_type p_pos,
vector_cnt_type counts,
vector_pids_type p_ids)
{
cnt_type i, cid, ins;
ids_type e[dim+1];
@@ -248,23 +247,23 @@ __global__ void subindex(openfpm::array<ids_type,dim,cnt_type> div,
pos_type p[dim];
for (size_t k = 0 ; k < dim ; k++)
{p[k] = p_pos[i+k*n_cap];}
{p[k] = p_pos.template get<0>(i)[k];}
cid = cid_<dim,cnt_type,ids_type,transform>::get_cid(div,spacing,off,t,p,e);
if (is_sparse == false)
{
e[dim] = atomicAdd(counts + cid, 1);
e[dim] = atomicAdd(&counts.template get<0>(cid), 1);
p_ids[ins] = cid;
{p_ids[ins+1*(n_cap2)] = e[dim];}
p_ids.template get<0>(ins)[0] = cid;
{p_ids.template get<0>(ins)[1] = e[dim];}
}
else
{
p_ids[ins] = cid;
{p_ids[ins+1*(n_cap2)] = e[dim];}
p_ids.template get<0>(ins)[0] = cid;
{p_ids.template get<0>(ins)[1] = e[dim];}
counts[ins] = cid;
counts.template get<0>(ins) = cid;
}
}
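The rewritten subindex above drops the raw position pointer together with its capacity/pitch arguments (n_cap, n_cap2) in favour of vector views obtained from toKernel(). A minimal host-side sketch of the two access patterns, using hypothetical stand-in types rather than the openfpm containers:

#include <cstddef>
#include <iostream>
#include <vector>

// Old pattern: a flat buffer where coordinate k of particle i sits at i + k*n_cap,
// so the allocation capacity (pitch) has to travel alongside the pointer.
float read_old(const float* p_pos, std::size_t n_cap, std::size_t i, std::size_t k)
{
    return p_pos[i + k * n_cap];
}

// New pattern: a small view object standing in for vector.toKernel();
// get<0>(i) hands back the coordinates of particle i and the pitch stays hidden inside.
struct pos_view
{
    float* base;
    std::size_t n_cap;

    struct row
    {
        float* base; std::size_t n_cap; std::size_t i;
        float operator[](std::size_t k) const { return base[i + k * n_cap]; }
    };

    template<unsigned int p>
    row get(std::size_t i) const { return {base, n_cap, i}; }
};

int main()
{
    constexpr std::size_t n_cap = 8, dim = 3;
    std::vector<float> buf(n_cap * dim, 0.0f);
    buf[2 + 1 * n_cap] = 4.5f;                    // y coordinate of particle 2

    pos_view v{buf.data(), n_cap};
    std::cout << read_old(buf.data(), n_cap, 2, 1) << " == "
              << v.get<0>(2)[1] << "\n";          // both print 4.5
}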
@@ -320,43 +319,42 @@ __global__ void subindex_without_count(openfpm::array<ids_type,dim,cnt_type> div
#ifdef MAKE_CELLLIST_DETERMINISTIC
template<unsigned int dim, typename cnt_type, typename ids_type, typename ph>
template<unsigned int dim, typename cnt_type, typename ids_type, typename ph, typename vector_cells_type>
__global__ void fill_cells(cnt_type phase_id ,
cnt_type n,
cnt_type *cells)
vector_cells_type cells)
{
cnt_type i;
i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= n) return;
cells[i] = encode_phase_id<cnt_type,ph>(phase_id,i);
cells.template get<0>(i) = encode_phase_id<cnt_type,ph>(phase_id,i);
}
#else
template<unsigned int dim, typename cnt_type, typename ids_type, typename ph>
template<unsigned int dim, typename cnt_type, typename ids_type, typename ph,typename vector_starts_type, typename vector_pids_type, typename vector_cells_type>
__global__ void fill_cells(cnt_type phase_id ,
openfpm::array<ids_type,dim,cnt_type> div_c,
openfpm::array<ids_type,dim,cnt_type> off,
cnt_type n,
cnt_type n_cap,
cnt_type start_p,
const cnt_type *starts,
const cnt_type * p_ids,
cnt_type *cells)
vector_starts_type starts,
vector_pids_type p_ids,
vector_cells_type cells)
{
cnt_type i, cid, id, start;
i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= n) return;
cid = p_ids[i];
cid = p_ids.template get<0>(i)[0];
start = starts[cid];
id = start + p_ids[1*n_cap+i];
start = starts.template get<0>(cid);
id = start + p_ids.template get<0>(i)[1];
cells[id] = encode_phase_id<cnt_type,ph>(phase_id,i + start_p);
cells.template get<0>(id) = encode_phase_id<cnt_type,ph>(phase_id,i + start_p);
}
#endif
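Taken together, subindex, the exclusive scan of the per-cell counts and fill_cells implement a counting-sort style reordering: subindex records each particle's cell id and arrival rank, the scan turns the counts into start offsets, and fill_cells scatters every particle index to starts[cid] + rank. A plain host-side sketch of that pipeline, with the phase encoding and the start_p offset left out and std::vector standing in for the openfpm containers:

#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    // cell id of each particle (what cid_::get_cid produces in subindex)
    std::vector<int> cell_of = {2, 0, 2, 1, 0};
    const int n_cells = 3;

    // 1) count particles per cell and record each particle's arrival rank
    //    (the role of the atomicAdd on counts in subindex)
    std::vector<int> counts(n_cells, 0), rank(cell_of.size());
    for (std::size_t i = 0; i < cell_of.size(); ++i)
        rank[i] = counts[cell_of[i]]++;

    // 2) exclusive scan of the counts -> start offset of every cell
    std::vector<int> starts(n_cells, 0);
    for (int c = 1; c < n_cells; ++c)
        starts[c] = starts[c - 1] + counts[c - 1];

    // 3) scatter, the job of fill_cells: cells[start + rank] = particle index
    std::vector<int> cells(cell_of.size());
    for (std::size_t i = 0; i < cell_of.size(); ++i)
        cells[starts[cell_of[i]] + rank[i]] = static_cast<int>(i);

    for (int c : cells) std::cout << c << " ";   // prints: 1 4 3 0 2
    std::cout << "\n";
}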
@@ -391,7 +389,7 @@ __device__ inline void reorder_wprp(const vector_prp & input,
output.template set<prp ...>(dst_id,input,src_id);
}
template <typename vector_prp, typename vector_pos, typename vector_ns, typename cnt_type, typename sh>
template <typename vector_prp, typename vector_pos, typename vector_ns, typename vector_cells_type, typename cnt_type, typename sh>
__global__ void reorder_parts(int n,
const vector_prp input,
vector_prp output,
@@ -399,12 +397,12 @@ __global__ void reorder_parts(int n,
vector_pos output_pos,
vector_ns sorted_non_sorted,
vector_ns non_sorted_to_sorted,
const cnt_type * cells)
const vector_cells_type cells)
{
cnt_type i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= n) return;
cnt_type code = cells[i];
cnt_type code = cells.template get<0>(i);
reorder(input, output, code,i);
reorder(input_pos,output_pos,code,i);
@@ -413,7 +411,7 @@ __global__ void reorder_parts(int n,
non_sorted_to_sorted.template get<0>(code) = i;
}
template <typename vector_prp, typename vector_pos, typename vector_ns, typename cnt_type, typename sh, unsigned int ... prp>
template <typename vector_prp, typename vector_pos, typename vector_ns, typename vector_cells_type, typename cnt_type, typename sh, unsigned int ... prp>
__global__ void reorder_parts_wprp(int n,
const vector_prp input,
vector_prp output,
@@ -421,12 +419,12 @@ __global__ void reorder_parts_wprp(int n,
vector_pos output_pos,
vector_ns sorted_non_sorted,
vector_ns non_sorted_to_sorted,
const cnt_type * cells)
const vector_cells_type cells)
{
cnt_type i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= n) return;
cnt_type code = cells[i];
cnt_type code = cells.template get<0>(i);
reorder_wprp<vector_prp,cnt_type,prp...>(input, output, code,i);
reorder(input_pos,output_pos,code,i);
......
#ifndef VECTOR_SUBSET_HPP
#define VECTOR_SUBSET_HPP
namespace openfpm
{
template<unsigned int dim,
typename prop,
template<typename> class layout_base = memory_traits_inte>
class vector_subset_ker
{
mutable openfpm::vector_gpu_ker<typename apply_transform<layout_base,prop>::type,layout_base> v_all;
mutable openfpm::vector_gpu_ker<aggregate<int>,layout_base> indexes;
public:
vector_subset_ker(openfpm::vector_gpu_ker<typename apply_transform<layout_base,prop>::type,layout_base> & v_all,
openfpm::vector_gpu_ker<aggregate<int>,layout_base> & indexes)
:v_all(v_all),indexes(indexes)
{}
// get property p of subset element id, resolving id through the indexes vector
template <unsigned int p>
__device__ __host__ inline auto get(size_t id) const -> decltype(v_all.template get<p>(0))
{
return v_all.template get<p>(indexes.template get<0>(id));
}
};
template<typename T, typename Memory, template<typename> class layout_base, typename grow_p, unsigned int impl>
class vector_sub
{
vector<T,Memory,layout_base,grow_p,impl> & v_all;
vector<aggregate<int>,Memory,layout_base,grow_p,impl> & indexes;
public:
vector_sub(vector<T,Memory,layout_base,grow_p,impl> & v_all,
vector<aggregate<int>,Memory,layout_base,grow_p,impl> & indexes)
:v_all(v_all),indexes(indexes)
{}
// Return the device-side (kernel) view of this subset
template<unsigned int ... prp> vector_subset_ker<dim,prop,layout_base> toKernel()
{
vector_subset_ker<dim,prop,layout_base> v(v_all.toKernel(), indexes.toKernel());
return v;
}
template <unsigned int p>
inline auto get(size_t id) const -> decltype(v_all.template get<p>(0))
{
return v_all.template get<p>(indexes.template get<0>(id));
}
};
}
#endif
\ No newline at end of file
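The new header introduces the subset containers the commit message refers to: vector_subset_ker and vector_sub are thin views over a full vector plus an index list, and get<p>(id) resolves id through indexes before touching the underlying storage. A minimal, self-contained illustration of that indirection, with std::vector standing in for the openfpm types and the property index p dropped:

#include <cstddef>
#include <iostream>
#include <vector>

// A view that owns nothing and resolves every access through an index list,
// mirroring the role of vector_subset_ker / vector_sub above.
template<typename T>
class subset_view
{
    std::vector<T>&   v_all;    // the full element set
    std::vector<int>& indexes;  // positions that belong to this subset

public:
    subset_view(std::vector<T>& v_all, std::vector<int>& indexes)
        : v_all(v_all), indexes(indexes)
    {}

    // element id of the subset is element indexes[id] of the full vector
    T& get(std::size_t id) const { return v_all[indexes[id]]; }

    std::size_t size() const { return indexes.size(); }
};

int main()
{
    std::vector<float> all = {10.f, 20.f, 30.f, 40.f};
    std::vector<int>   idx = {3, 1};          // subset: elements 3 and 1

    subset_view<float> sub(all, idx);
    sub.get(0) = 41.f;                        // writes through to all[3]

    std::cout << sub.get(0) << " " << sub.get(1) << " " << all[3] << "\n"; // 41 20 41
}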