Commit e9155785 authored by incardon's avatar incardon
Browse files

Adding reduce funcionality + fixing bug of missing __host__ in copy_fusion_vector

parent 1c8373d8
openfpm_data @ e7010bdc
Subproject commit 6853316cab36d72c3fe4fabbec2d31cd5697c531
Subproject commit e7010bdc474736c4211947ad8f08dd29d6e4e59f
......@@ -888,7 +888,5 @@ BOOST_AUTO_TEST_CASE(vector_dist_gpu_map_fill_send_buffer_test)
}
}
BOOST_AUTO_TEST_SUITE_END()
......@@ -9,6 +9,7 @@
#define VECTOR_DIST_CUDA_FUNCS_CUH_
#include "Vector/util/vector_dist_funcs.hpp"
#include "util/cuda/moderngpu/kernel_reduce.hxx"
template<unsigned int dim, typename St, typename decomposition_type, typename vector_type, typename start_type, typename output_type>
__global__ void proc_label_id_ghost(decomposition_type dec,vector_type vd, start_type starts, output_type out)
......@@ -200,4 +201,21 @@ __global__ void shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_
}
}
template<unsigned int prp, typename vector_type>
auto reduce(vector_type & vd) -> typename std::remove_reference<decltype(vd.template getProp<prp>(0))>::type
{
typedef typename std::remove_reference<decltype(vd.template getProp<prp>(0))>::type reduce_type;
CudaMemory mem;
mem.allocate(sizeof(reduce_type));
mgpu::reduce((reduce_type *)vd.getPropVector(). template getDeviceBuffer<prp>(),
vd.size_local(), (reduce_type *)mem.getDevicePointer() ,
mgpu::plus_t<reduce_type>(), vd.getVC().getmgpuContext());
mem.deviceToHost();
return *(reduce_type *)(mem.getPointer());
}
#endif /* VECTOR_DIST_CUDA_FUNCS_CUH_ */
......@@ -378,12 +378,11 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
auto NN = vd.getCellListGPU(0.1);
auto NN_cpu = vd.getCellList(0.1);
check_cell_list_cpu_and_gpu(vd,NN,NN_cpu);
auto NN_up = vd.getCellListGPU(0.1);
NN_up.clear();
vd.updateCellList(NN_up);
check_cell_list_cpu_and_gpu(vd,NN,NN_cpu);
check_cell_list_cpu_and_gpu(vd,NN_up,NN_cpu);
// We check if we opotain the same result from updateCellList
......@@ -680,8 +679,11 @@ BOOST_AUTO_TEST_CASE( vector_dist_map_on_gpu_test)
{
vdist_calc_gpu_test<float>();
vdist_calc_gpu_test<double>();
}
/* auto & v_cl = create_vcluster();
BOOST_AUTO_TEST_CASE(vector_dist_reduce)
{
auto & v_cl = create_vcluster();
if (v_cl.size() > 16)
{return;}
......@@ -694,260 +696,43 @@ BOOST_AUTO_TEST_CASE( vector_dist_map_on_gpu_test)
// Boundary conditions
size_t bc[3]={PERIODIC,PERIODIC,PERIODIC};
vector_dist_gpu<3,float,aggregate<float,float[3],float[3]>> vd(1000,domain,bc,g);
vector_dist_gpu<3,float,aggregate<float,double,int,size_t>> vd(5000*v_cl.size(),domain,bc,g);
auto it = vd.getDomainIterator();
while (it.isNext())
float fc = 1.0;
double dc = 1.0;
int ic = 1.0;
size_t sc = 1.0;
while(it.isNext())
{
auto p = it.get();
vd.getPos(p)[0] = (float)rand() / RAND_MAX;
vd.getPos(p)[1] = (float)rand() / RAND_MAX;
vd.getPos(p)[2] = (float)rand() / RAND_MAX;
vd.template getProp<0>(p) = fc;
vd.template getProp<1>(p) = dc;
vd.template getProp<2>(p) = ic;
vd.template getProp<3>(p) = sc;
vd.template getProp<0>(p) = vd.getPos(p)[0] + vd.getPos(p)[1] + vd.getPos(p)[2];
vd.template getProp<1>(p)[0] = vd.getPos(p)[0];
vd.template getProp<1>(p)[1] = vd.getPos(p)[1];
vd.template getProp<1>(p)[2] = vd.getPos(p)[2];
vd.template getProp<2>(p)[0] = vd.getPos(p)[0] + vd.getPos(p)[1];
vd.template getProp<2>(p)[1] = vd.getPos(p)[0] + vd.getPos(p)[2];
vd.template getProp<2>(p)[2] = vd.getPos(p)[1] + vd.getPos(p)[2];
fc += 1.0;
dc += 1.0;
ic += 1;
sc += 1;
++it;
}
// move on device
vd.hostToDevicePos();
vd.hostToDeviceProp<0,1,2>();
// Ok we redistribute the particles (GPU based)
vd.map(RUN_ON_DEVICE);
vd.deviceToHostPos();
vd.deviceToHostProp<0,1,2>();
vd.write("write_start");
// Reset the host part
auto it3 = vd.getDomainIterator();
while (it3.isNext())
{
auto p = it3.get();
vd.getPos(p)[0] = 1.0;
vd.getPos(p)[1] = 1.0;
vd.getPos(p)[2] = 1.0;
vd.template getProp<0>(p) = 0.0;
vd.template getProp<0>(p) = 0.0;
vd.template getProp<0>(p) = 0.0;
vd.template getProp<0>(p) = 0.0;
vd.template getProp<0>(p) = 0.0;
vd.template getProp<0>(p) = 0.0;
vd.template getProp<0>(p) = 0.0;
++it3;
}
// we move from Device to CPU
vd.deviceToHostPos();
vd.deviceToHostProp<0,1,2>();
// Check
auto it2 = vd.getDomainIterator();
bool match = true;
while (it2.isNext())
{
auto p = it2.get();
match &= vd.template getProp<0>(p) == vd.getPos(p)[0] + vd.getPos(p)[1] + vd.getPos(p)[2];
match &= vd.template getProp<1>(p)[0] == vd.getPos(p)[0];
match &= vd.template getProp<1>(p)[1] == vd.getPos(p)[1];
match &= vd.template getProp<1>(p)[2] == vd.getPos(p)[2];
match &= vd.template getProp<2>(p)[0] == vd.getPos(p)[0] + vd.getPos(p)[1];
match &= vd.template getProp<2>(p)[1] == vd.getPos(p)[0] + vd.getPos(p)[2];
match &= vd.template getProp<2>(p)[2] == vd.getPos(p)[1] + vd.getPos(p)[2];
++it2;
}
BOOST_REQUIRE_EQUAL(match,true);
// count local particles
size_t l_cnt = 0;
size_t nl_cnt = 0;
size_t n_out = 0;
// Domain + ghost box
Box<3,float> dom_ext = domain;
dom_ext.enlarge(g);
auto it5 = vd.getDomainIterator();
count_local_n_local<3>(vd,it5,bc,domain,dom_ext,l_cnt,nl_cnt,n_out);
BOOST_REQUIRE_EQUAL(n_out,0);
BOOST_REQUIRE_EQUAL(l_cnt,vd.size_local());
// we do 10 gpu steps (using a cpu vector to check that map and ghost get work as expented)
for (size_t i = 0 ; i < 10 ; i++)
{
vd.map(RUN_ON_DEVICE);
vd.deviceToHostPos();
vd.deviceToHostProp<0,1,2>();
vd.write_frame("write_ggg",i);
// To test we copy on a cpu distributed vector and we do a map
vector_dist<3,float,aggregate<float,float[3],float[3]>> vd_cpu(vd.getDecomposition().duplicate_convert<HeapMemory,memory_traits_lin>(),0);
auto itc = vd.getDomainIterator();
while (itc.isNext())
{
auto p = itc.get();
vd_cpu.add();
vd_cpu.getLastPos()[0] = vd.getPos(p)[0];
vd_cpu.getLastPos()[1] = vd.getPos(p)[1];
vd_cpu.getLastPos()[2] = vd.getPos(p)[2];
vd_cpu.getLastProp<0>() = vd.getProp<0>(p);
vd_cpu.getLastProp<1>()[0] = vd.getProp<1>(p)[0];
vd_cpu.getLastProp<1>()[1] = vd.getProp<1>(p)[1];
vd_cpu.getLastProp<1>()[2] = vd.getProp<1>(p)[2];
vd_cpu.getLastProp<2>()[0] = vd.getProp<2>(p)[0];
vd_cpu.getLastProp<2>()[1] = vd.getProp<2>(p)[1];
vd_cpu.getLastProp<2>()[2] = vd.getProp<2>(p)[2];
++itc;
}
vd_cpu.ghost_get<0,1,2>();
vd.ghost_get<0,1,2>(RUN_ON_DEVICE);
vd.deviceToHostPos();
vd.deviceToHostProp<0,1,2>();
match = true;
// Particle on the gpu ghost and cpu ghost are not ordered in the same way so we have to reorder
struct part
{
Point<3,float> xp;
float prp0;
float prp1[3];
float prp2[3];
bool operator<(const part & tmp) const
{
if (xp.get(0) < tmp.xp.get(0))
{return true;}
else if (xp.get(0) > tmp.xp.get(0))
{return false;}
if (xp.get(1) < tmp.xp.get(1))
{return true;}
else if (xp.get(1) > tmp.xp.get(1))
{return false;}
if (xp.get(2) < tmp.xp.get(2))
{return true;}
else if (xp.get(2) > tmp.xp.get(2))
{return false;}
return false;
}
};
openfpm::vector<part> cpu_sort;
openfpm::vector<part> gpu_sort;
cpu_sort.resize(vd_cpu.size_local_with_ghost() - vd_cpu.size_local());
gpu_sort.resize(vd.size_local_with_ghost() - vd.size_local());
size_t cnt = 0;
auto itc2 = vd.getGhostIterator();
while (itc2.isNext())
{
auto p = itc2.get();
cpu_sort.get(cnt).xp.get(0) = vd_cpu.getPos(p)[0];
gpu_sort.get(cnt).xp.get(0) = vd.getPos(p)[0];
cpu_sort.get(cnt).xp.get(1) = vd_cpu.getPos(p)[1];
gpu_sort.get(cnt).xp.get(1) = vd.getPos(p)[1];
cpu_sort.get(cnt).xp.get(2) = vd_cpu.getPos(p)[2];
gpu_sort.get(cnt).xp.get(2) = vd.getPos(p)[2];
cpu_sort.get(cnt).prp0 = vd_cpu.getProp<0>(p);
gpu_sort.get(cnt).prp0 = vd.getProp<0>(p);
cpu_sort.get(cnt).prp1[0] = vd_cpu.getProp<1>(p)[0];
gpu_sort.get(cnt).prp1[0] = vd.getProp<1>(p)[0];
cpu_sort.get(cnt).prp1[1] = vd_cpu.getProp<1>(p)[1];
gpu_sort.get(cnt).prp1[1] = vd.getProp<1>(p)[1];
cpu_sort.get(cnt).prp1[2] = vd_cpu.getProp<1>(p)[2];
gpu_sort.get(cnt).prp1[2] = vd.getProp<1>(p)[2];
cpu_sort.get(cnt).prp2[0] = vd_cpu.getProp<2>(p)[0];
gpu_sort.get(cnt).prp2[0] = vd.getProp<2>(p)[0];
cpu_sort.get(cnt).prp2[1] = vd_cpu.getProp<2>(p)[1];
gpu_sort.get(cnt).prp2[1] = vd.getProp<2>(p)[1];
cpu_sort.get(cnt).prp2[2] = vd_cpu.getProp<2>(p)[2];
gpu_sort.get(cnt).prp2[2] = vd.getProp<2>(p)[2];
++cnt;
++itc2;
}
cpu_sort.sort();
gpu_sort.sort();
vd.template hostToDeviceProp<0,1,2,3>();
for (size_t i = 0 ; i < cpu_sort.size() ; i++)
{
match &= cpu_sort.get(i).xp.get(0) == gpu_sort.get(i).xp.get(0);
match &= cpu_sort.get(i).xp.get(1) == gpu_sort.get(i).xp.get(1);
match &= cpu_sort.get(i).xp.get(2) == gpu_sort.get(i).xp.get(2);
float redf = reduce<0>(vd);
double redd = reduce<1>(vd);
int redi = reduce<2>(vd);
size_t reds = reduce<3>(vd);
match &= cpu_sort.get(i).prp0 == gpu_sort.get(i).prp0;
match &= cpu_sort.get(i).prp1[0] == gpu_sort.get(i).prp1[0];
match &= cpu_sort.get(i).prp1[1] == gpu_sort.get(i).prp1[1];
match &= cpu_sort.get(i).prp1[2] == gpu_sort.get(i).prp1[2];
match &= cpu_sort.get(i).prp2[0] == gpu_sort.get(i).prp2[0];
match &= cpu_sort.get(i).prp2[1] == gpu_sort.get(i).prp2[1];
match &= cpu_sort.get(i).prp2[2] == gpu_sort.get(i).prp2[2];
}
BOOST_REQUIRE_EQUAL(match,true);
// move particles on gpu
auto ite = vd.getDomainIteratorGPU();
move_parts_gpu_test<3,decltype(vd.toKernel())><<<ite.wthr,ite.thr>>>(vd.toKernel());
}*/
BOOST_REQUIRE_EQUAL(redf,(vd.size_local()+1.0)*(vd.size_local())/2.0);
BOOST_REQUIRE_EQUAL(redd,(vd.size_local()+1.0)*(vd.size_local())/2.0);
BOOST_REQUIRE_EQUAL(redi,(vd.size_local()+1)*(vd.size_local())/2);
BOOST_REQUIRE_EQUAL(reds,(vd.size_local()+1)*(vd.size_local())/2);
}
......
......@@ -1190,6 +1190,8 @@ public:
cell_list.template construct<decltype(v_pos),decltype(v_prp)>(v_pos,v_pos_out,v_prp,v_prp_out);
cell_list.set_ndec(getDecomposition().get_ndec());
return cell_list;
}
......@@ -2288,7 +2290,7 @@ public:
*
*/
Vcluster<> & getVC()
Vcluster<Memory> & getVC()
{
#ifdef SE_CLASS2
check_valid(this,8);
......@@ -2301,7 +2303,7 @@ public:
* \return the particle position vector
*
*/
const openfpm::vector<Point<dim,St>> & getPosVector() const
const openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & getPosVector() const
{
return v_pos;
}
......@@ -2311,7 +2313,7 @@ public:
* \return the particle position vector
*
*/
openfpm::vector<Point<dim,St>> & getPosVector()
openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & getPosVector()
{
return v_pos;
}
......@@ -2321,7 +2323,7 @@ public:
* \return the particle property vector
*
*/
const openfpm::vector<prop> & getPropVector() const
const openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & getPropVector() const
{
return v_prp;
}
......@@ -2331,7 +2333,7 @@ public:
* \return the particle property vector
*
*/
openfpm::vector<prop> & getPropVector()
openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & getPropVector()
{
return v_prp;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment