Commit 8a5e9361 authored by incardon's avatar incardon
Browse files

Fixing GPU local ghost

parent c5f9f46d
......@@ -79,6 +79,10 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,sca
auto ite = v_pos.getGPUIterator();
// no work to do return
if (ite.wthr.x == 0)
{return;}
// First we have to see how many entry each particle produce
num_proc_ghost_each_part<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(proc_id_out.toKernel())>
<<<ite.wthr,ite.thr>>>
......@@ -263,6 +267,9 @@ struct local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,true>
// Label the internal (assigned) particles
ite = v_pos.getGPUIteratorTo(g_m);
// resize o_part_loc
o_part_loc.resize(total);
shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),
decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),
decltype(starts.toKernel()),decltype(shifts.toKernel()),
......
......@@ -249,9 +249,17 @@ __global__ void shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_
for (unsigned int j = 0 ; j < dim ; j++)
{
v_pos.template get<0>(base+n)[j] = xp.get(j) - shifts.template get<0>(shift_actual)[j];
}
if (base_o + n < output.size())
{
output.template get<0>(base_o+n) = p;
output.template get<1>(base_o+n) = shift_actual;
}
else
{
printf("OVERFLOW \n");
}
v_prp.set(base+n,v_prp.get(p));
......
......@@ -948,12 +948,16 @@ class vector_dist_comm
starts.template deviceToHost<0>();
size_t offset = starts.template get<0>(rank);
// fill v_pos_tmp and v_prp_tmp with local particles
process_map_particles<decltype(m_opart.toKernel()),decltype(v_pos_tmp.toKernel()),decltype(v_prp_tmp.toKernel()),
// no work to do
if (ite.wthr.x != 0)
{
// fill v_pos_tmp and v_prp_tmp with local particles
process_map_particles<decltype(m_opart.toKernel()),decltype(v_pos_tmp.toKernel()),decltype(v_prp_tmp.toKernel()),
decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>
<<<ite.wthr,ite.thr>>>
(m_opart.toKernel(),v_pos_tmp.toKernel(), v_prp_tmp.toKernel(),
<<<ite.wthr,ite.thr>>>
(m_opart.toKernel(),v_pos_tmp.toKernel(), v_prp_tmp.toKernel(),
v_pos.toKernel(),v_prp.toKernel(),offset);
}
// Fill the sending buffers
for (size_t i = 0 ; i < m_pos.size() ; i++)
......@@ -962,11 +966,17 @@ class vector_dist_comm
auto ite = m_pos.get(i).getGPUIterator();
process_map_particles<decltype(m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()),
// no work to do
if (ite.wthr.x != 0)
{
process_map_particles<decltype(m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()),
decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>
<<<ite.wthr,ite.thr>>>
(m_opart.toKernel(),m_pos.get(i).toKernel(), m_prp.get(i).toKernel(),
<<<ite.wthr,ite.thr>>>
(m_opart.toKernel(),m_pos.get(i).toKernel(), m_prp.get(i).toKernel(),
v_pos.toKernel(),v_prp.toKernel(),offset);
}
}
// old local particles with the actual local particles
......@@ -1075,11 +1085,17 @@ class vector_dist_comm
prc_sz.template fill<0>(0);
auto ite = v_pos.getGPUIterator();
if (ite.wthr.x == 0)
{
starts.resize(v_cl.size());
starts.template fill<0>(0);
return;
}
// we have one process we can skip ...
if (v_cl.size() == 1)
{
auto ite = v_pos.getGPUIterator();
// ... but we have to apply the boundary conditions
periodicity_int<dim> bc;
......@@ -1091,8 +1107,6 @@ class vector_dist_comm
return;
}
auto ite = v_pos.getGPUIterator();
// label particle processor
process_id_proc_each_part<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(lbl_p.toKernel()),decltype(prc_sz.toKernel())>
<<<ite.wthr,ite.thr>>>
......@@ -1220,10 +1234,6 @@ class vector_dist_comm
size_t & g_m,
size_t opt)
{
#ifdef EXTREA_TRACE_PRE_COMM
Extrae_user_function (1);
#endif
// Buffer that contain for each processor the id of the particle to send
prc_sz.clear();
g_opart.clear();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment