Commit b079452e authored by incardon

Latest modules

parent 560035f5
openfpm_data @ 8d576118
-Subproject commit 77fc4055f2dc0ddf5e759277fd6d60c57a3af791
+Subproject commit 8d5761180cd6f396c5e819df13b83ac62592754e
@@ -87,9 +87,9 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,sca
 {return;}
 // First we have to see how many entry each particle produce
-num_proc_ghost_each_part<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(proc_id_out.toKernel())>
-<<<ite.wthr,ite.thr>>>
-(dec.toKernel(),v_pos.toKernel(),proc_id_out.toKernel());
+CUDA_LAUNCH((num_proc_ghost_each_part<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(proc_id_out.toKernel())>),
+ite.wthr,ite.thr,
+dec.toKernel(),v_pos.toKernel(),proc_id_out.toKernel());
 // scan
 sc.scan_(proc_id_out,starts);
@@ -104,9 +104,9 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,sca
 ite = v_pos.getGPUIterator();
 // we compute processor id for each particle
-proc_label_id_ghost<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(starts.toKernel()),decltype(g_opart_device.toKernel())>
-<<<ite.wthr,ite.thr>>>
-(dec.toKernel(),v_pos.toKernel(),starts.toKernel(),g_opart_device.toKernel());
+CUDA_LAUNCH((proc_label_id_ghost<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(starts.toKernel()),decltype(g_opart_device.toKernel())>),
+ite.wthr,ite.thr,
+dec.toKernel(),v_pos.toKernel(),starts.toKernel(),g_opart_device.toKernel());
 // sort particles
 mergesort((int *)g_opart_device.template getDeviceBuffer<0>(),(long unsigned int *)g_opart_device.template getDeviceBuffer<1>(), g_opart_device.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());
@@ -118,8 +118,9 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,sca
 ite = g_opart_device.getGPUIterator();
 // Find the buffer bases
-find_buffer_offsets<0,decltype(g_opart_device.toKernel()),decltype(prc_offset.toKernel())><<<ite.wthr,ite.thr>>>
-(g_opart_device.toKernel(),(int *)mem.getDevicePointer(),prc_offset.toKernel());
+CUDA_LAUNCH((find_buffer_offsets<0,decltype(g_opart_device.toKernel()),decltype(prc_offset.toKernel())>),
+ite.wthr,ite.thr,
+g_opart_device.toKernel(),(int *)mem.getDevicePointer(),prc_offset.toKernel());
 // Trasfer the number of offsets on CPU
 mem.deviceToHost();
@@ -207,9 +208,9 @@ struct local_ghost_from_opart_impl<with_pos,dim,St,prop,Memory,layout_base,true>
 if (ite.wthr.x != 0)
 {
-process_ghost_particles_local<with_pos,dim,decltype(o_part_loc.toKernel()),decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(shifts.toKernel())>
-<<<ite.wthr,ite.thr>>>
-(o_part_loc.toKernel(),v_pos.toKernel(),v_prp.toKernel(),shifts.toKernel(),old);
+CUDA_LAUNCH((process_ghost_particles_local<with_pos,dim,decltype(o_part_loc.toKernel()),decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(shifts.toKernel())>),
+ite.wthr,ite.thr,
+o_part_loc.toKernel(),v_pos.toKernel(),v_prp.toKernel(),shifts.toKernel(),old);
 }
 #else
 std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
@@ -260,9 +261,9 @@ struct local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,true>
 auto ite = v_pos.getGPUIteratorTo(g_m);
 // label particle processor
-num_shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())>
-<<<ite.wthr,ite.thr>>>
-(box_f_dev.toKernel(),box_f_sv.toKernel(),v_pos.toKernel(),o_part_loc.toKernel(),g_m);
+CUDA_LAUNCH((num_shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())>),
+ite.wthr,ite.thr,
+box_f_dev.toKernel(),box_f_sv.toKernel(),v_pos.toKernel(),o_part_loc.toKernel(),g_m);
 starts.resize(o_part_loc.size());
 mgpu::scan((unsigned int *)o_part_loc.template getDeviceBuffer<0>(), o_part_loc.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());
@@ -280,12 +281,12 @@ struct local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,true>
 // resize o_part_loc
 o_part_loc.resize(total);
-shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),
+CUDA_LAUNCH((shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),
 decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),
 decltype(starts.toKernel()),decltype(shifts.toKernel()),
-decltype(o_part_loc.toKernel())>
-<<<ite.wthr,ite.thr>>>
-(box_f_dev.toKernel(),box_f_sv.toKernel(),
+decltype(o_part_loc.toKernel())>),
+ite.wthr,ite.thr,
+box_f_dev.toKernel(),box_f_sv.toKernel(),
 v_pos.toKernel(),v_prp.toKernel(),
 starts.toKernel(),shifts.toKernel(),o_part_loc.toKernel(),old,g_m);
@@ -353,7 +353,7 @@ void remove_marked(vector_type & vd)
 auto ite = idx.getGPUIterator();
-create_index<<<ite.wthr,ite.thr>>>(idx.toKernel());
+CUDA_LAUNCH(create_index,ite.wthr,ite.thr,idx.toKernel());
 // sort particles, so the particles to remove stay at the end
 mergesort((remove_type *)vd.getPropVector().template getDeviceBuffer<prp>(),(unsigned int *)idx.template getDeviceBuffer<0>(), idx.size(), mgpu::template less_t<remove_type>(), vd.getVC().getmgpuContext());
@@ -366,8 +366,8 @@ void remove_marked(vector_type & vd)
 mem.fill(0);
 // mark point, particle that stay and to remove
-find_buffer_offsets_no_prc<prp,decltype(vd.getPropVector().toKernel()),decltype(mark.toKernel())><<<ite.wthr,ite.thr>>>
-(vd.getPropVector().toKernel(),(int *)mem.getDevicePointer(),mark.toKernel(),vd.size_local());
+CUDA_LAUNCH((find_buffer_offsets_no_prc<prp,decltype(vd.getPropVector().toKernel()),decltype(mark.toKernel())>),ite.wthr,ite.thr,
+vd.getPropVector().toKernel(),(int *)mem.getDevicePointer(),mark.toKernel(),vd.size_local());
 mem.deviceToHost();
@@ -401,7 +401,7 @@ void remove_marked(vector_type & vd)
 ite = vd_pos_old.getGPUIterator();
-copy_new_to_old<vector_type::dims><<<ite.wthr,ite.thr>>>(vd_pos_new.toKernel(),vd_prp_new.toKernel(),vd_pos_old.toKernel(),vd_prp_old.toKernel(),idx.toKernel());
+CUDA_LAUNCH((copy_new_to_old<vector_type::dims>),ite.wthr,ite.thr,vd_pos_new.toKernel(),vd_prp_new.toKernel(),vd_pos_old.toKernel(),vd_prp_old.toKernel(),idx.toKernel());
 // and we swap
@@ -1907,9 +1907,9 @@ public:
 auto ite = v_pos.getGPUIteratorTo(g_m,n_thr);
-merge_sort_part<false,decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(cl.getNonSortToSort().toKernel()),prp...>
-<<<ite.wthr,ite.thr>>>
-(v_pos.toKernel(),v_prp.toKernel(),v_pos_out.toKernel(),v_prp_out.toKernel(),cl.getNonSortToSort().toKernel());
+CUDA_LAUNCH((merge_sort_part<false,decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(cl.getNonSortToSort().toKernel()),prp...>),
+ite.wthr,ite.thr,
+v_pos.toKernel(),v_prp.toKernel(),v_pos_out.toKernel(),v_prp_out.toKernel(),cl.getNonSortToSort().toKernel());
 #endif
 }
@@ -1990,9 +1990,9 @@ public:
 auto ite = v_pos.getGPUIteratorTo(g_m,n_thr);
-merge_sort_part<true,decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(cl.getNonSortedToSorted().toKernel()),prp...>
-<<<ite.wthr,ite.thr>>>
-(v_pos.toKernel(),v_prp.toKernel(),v_pos_out.toKernel(),v_prp_out.toKernel(),cl.getNonSortedToSorted().toKernel());
+CUDA_LAUNCH((merge_sort_part<true,decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(cl.getNonSortedToSorted().toKernel()),prp...>),
+ite.wthr,ite.thr,
+v_pos.toKernel(),v_prp.toKernel(),v_pos_out.toKernel(),v_prp_out.toKernel(),cl.getNonSortedToSorted().toKernel());
 #endif
 }
@@ -597,9 +597,9 @@ class vector_dist_comm
 {
 auto ite = g_pos_send.get(i).getGPUIterator();
-process_ghost_particles_pos<dim,decltype(g_opart_device.toKernel()),decltype(g_pos_send.get(i).toKernel()),decltype(v_pos.toKernel()),decltype(shifts.toKernel())>
-<<<ite.wthr,ite.thr>>>
-(g_opart_device.toKernel(), g_pos_send.get(i).toKernel(),
+CUDA_LAUNCH((process_ghost_particles_pos<dim,decltype(g_opart_device.toKernel()),decltype(g_pos_send.get(i).toKernel()),decltype(v_pos.toKernel()),decltype(shifts.toKernel())>),
+ite.wthr,ite.thr,
+g_opart_device.toKernel(), g_pos_send.get(i).toKernel(),
 v_pos.toKernel(),shifts.toKernel(),offset);
 offset += prc_sz.get(i);
@@ -824,9 +824,9 @@ class vector_dist_comm
 {
 auto ite = g_send_prp.get(i).getGPUIterator();
-process_ghost_particles_prp<decltype(g_opart_device.toKernel()),decltype(g_send_prp.get(i).toKernel()),decltype(v_prp.toKernel()),prp...>
-<<<ite.wthr,ite.thr>>>
-(g_opart_device.toKernel(), g_send_prp.get(i).toKernel(),
+CUDA_LAUNCH((process_ghost_particles_prp<decltype(g_opart_device.toKernel()),decltype(g_send_prp.get(i).toKernel()),decltype(v_prp.toKernel()),prp...>),
+ite.wthr,ite.thr,
+g_opart_device.toKernel(), g_send_prp.get(i).toKernel(),
 v_prp.toKernel(),offset);
 offset += prc_sz.get(i);
@@ -949,10 +949,10 @@ class vector_dist_comm
 if (ite.wthr.x != 0)
 {
 // fill v_pos_tmp and v_prp_tmp with local particles
-process_map_particles<decltype(m_opart.toKernel()),decltype(v_pos_tmp.toKernel()),decltype(v_prp_tmp.toKernel()),
-decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>
-<<<ite.wthr,ite.thr>>>
-(m_opart.toKernel(),v_pos_tmp.toKernel(), v_prp_tmp.toKernel(),
+CUDA_LAUNCH((process_map_particles<decltype(m_opart.toKernel()),decltype(v_pos_tmp.toKernel()),decltype(v_prp_tmp.toKernel()),
+decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>),
+ite.wthr,ite.thr,
+m_opart.toKernel(),v_pos_tmp.toKernel(), v_prp_tmp.toKernel(),
 v_pos.toKernel(),v_prp.toKernel(),offset);
 }
@@ -967,10 +967,10 @@ class vector_dist_comm
 if (ite.wthr.x != 0)
 {
-process_map_particles<decltype(m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()),
-decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>
-<<<ite.wthr,ite.thr>>>
-(m_opart.toKernel(),m_pos.get(i).toKernel(), m_prp.get(i).toKernel(),
+CUDA_LAUNCH((process_map_particles<decltype(m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()),
+decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>),
+ite.wthr,ite.thr,
+m_opart.toKernel(),m_pos.get(i).toKernel(), m_prp.get(i).toKernel(),
 v_pos.toKernel(),v_prp.toKernel(),offset);
 }
@@ -1099,15 +1099,15 @@ class vector_dist_comm
 for (size_t i = 0 ; i < dim ; i++) {bc.bc[i] = dec.periodicity(i);}
-apply_bc_each_part<dim,St,decltype(v_pos.toKernel())><<<ite.wthr,ite.thr>>>(dec.getDomain(),bc,v_pos.toKernel());
+CUDA_LAUNCH((apply_bc_each_part<dim,St,decltype(v_pos.toKernel())>),ite.wthr,ite.thr,dec.getDomain(),bc,v_pos.toKernel());
 return;
 }
 // label particle processor
-process_id_proc_each_part<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(lbl_p.toKernel()),decltype(prc_sz.toKernel())>
-<<<ite.wthr,ite.thr>>>
-(dec.toKernel(),v_pos.toKernel(),lbl_p.toKernel(),prc_sz.toKernel(),v_cl.rank());
+CUDA_LAUNCH((process_id_proc_each_part<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(lbl_p.toKernel()),decltype(prc_sz.toKernel())>),
+ite.wthr,ite.thr,
+dec.toKernel(),v_pos.toKernel(),lbl_p.toKernel(),prc_sz.toKernel(),v_cl.rank());
 #ifndef TEST1
@@ -1145,7 +1145,7 @@ class vector_dist_comm
 ite = lbl_p.getGPUIterator();
 // we order lbl_p
-reorder_lbl<decltype(lbl_p.toKernel()),decltype(starts.toKernel())><<<ite.wthr,ite.thr>>>(lbl_p.toKernel(),starts.toKernel());
+CUDA_LAUNCH((reorder_lbl<decltype(lbl_p.toKernel()),decltype(starts.toKernel())>),ite.wthr,ite.thr,lbl_p.toKernel(),starts.toKernel());
 #endif
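Note on the pattern used throughout this commit: every hunk replaces a raw triple-chevron launch (kernel<<<ite.wthr,ite.thr>>>(args...)) with the CUDA_LAUNCH macro, and templated kernel names are wrapped in an extra pair of parentheses so that the commas inside their template argument lists are not interpreted as macro argument separators. The macro itself is defined in the OpenFPM device modules and is not shown in this diff; the sketch below is only a minimal illustration of how such a wrapper can work. The error-checking body, the fill_value kernel, and the exact macro signature are assumptions for illustration, not the actual OpenFPM implementation.

// Minimal illustrative sketch of a CUDA_LAUNCH-style wrapper. NOT the actual
// OpenFPM macro; it only shows how a launch macro can forward the grid/block
// configuration and the kernel arguments while adding basic error checking.
#include <cstdio>
#include <cuda_runtime.h>

#define CUDA_LAUNCH(kernel, wthr, thr, ...) \
do { \
	kernel<<<(wthr),(thr)>>>(__VA_ARGS__); \
	cudaError_t err_ = cudaGetLastError(); \
	if (err_ != cudaSuccess) \
	{printf("%s:%d CUDA launch error: %s\n",__FILE__,__LINE__,cudaGetErrorString(err_));} \
} while (0)

// Hypothetical kernel, used only to demonstrate the call syntax.
template<typename T>
__global__ void fill_value(T * ptr, T val, int n)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i < n) {ptr[i] = val;}
}

int main()
{
	float * d_buf;
	cudaMalloc(&d_buf,1024*sizeof(float));

	// As in the diff: the kernel name is parenthesized so that any comma inside
	// its template argument list (harmless here, essential for the multi-argument
	// decltype lists above) cannot split the macro argument list.
	CUDA_LAUNCH((fill_value<float>),dim3(4),dim3(256),d_buf,1.0f,1024);

	cudaDeviceSynchronize();
	cudaFree(d_buf);
	return 0;
}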