Commit 1c8373d8 authored by incardon

Testing update cell-list

parent 92b5df78
openfpm_data @ 6853316c
Subproject commit 37d1890f1c6953c2b1212ce937a86721ba6bb6c9
Subproject commit 6853316cab36d72c3fe4fabbec2d31cd5697c531
@@ -914,7 +914,7 @@ public:
cart.bbox = bbox;
for (size_t i = 0 ; i < dim ; i++)
cart.bc[i] = this->bc[i];
{cart.bc[i] = this->bc[i];}
return cart;
}
......
@@ -109,6 +109,37 @@ public:
return processorID_impl(pt,clk,sub_domains_global);
}
/*! \brief Apply the boundary conditions to the point
*
* If the particle goes out on the right, it is brought back in on the left
* for periodic boundary conditions; nothing is done in the non-periodic case
*
* \param pt encapsulated point object (its coordinates are changed according to
* the explanation above)
*
*/
template<typename Mem> __device__ void applyPointBC(encapc<1,Point<dim,T>,Mem> && pt) const
{
for (size_t i = 0 ; i < dim ; i++)
{
if (bc[i] == PERIODIC)
{pt.template get<0>()[i] = openfpm::math::periodic_l(pt.template get<0>()[i],domain.getHigh(i),domain.getLow(i));}
}
}
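// Illustrative sketch (hypothetical helper, not part of this class or commit):
// the wrap applied above has the effect of the usual periodic fold, written here
// on a plain scalar; the exact edge handling of openfpm::math::periodic_l may differ.
//
// template<typename U> __device__ __host__ U fold_periodic(U x, U low, U high)
// {
//     U len = high - low;
//     if (x >= high)    {x -= len;}   // left the box on the right -> re-enter on the left
//     else if (x < low) {x += len;}   // left the box on the left  -> re-enter on the right
//     return x;
// }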
/*! \brief Given a point, return the processor to which the particle should go
*
* \param pt point
*
* \return the processor ID
*
*/
__device__ int inline processorID(const Point<dim,T> &pt)
{
return processorID_impl(pt,clk,sub_domains_global);
}
};
#endif /* CARTDECOMPOSITION_GPU_HPP_ */
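// Hedged usage sketch of the kernel-side API above: inside a __global__ kernel the
// position is first folded back into the domain with applyPointBC and then assigned
// to a processor with processorID. This mirrors process_id_proc_each_part further
// down in this commit; the kernel name and the discarded result are illustrative only.
template<unsigned int dim, typename St, typename cartdec_gpu, typename particles_type>
__global__ void assign_processor_sketch(cartdec_gpu cdg, particles_type parts)
{
    int p = threadIdx.x + blockIdx.x * blockDim.x;
    if (p >= parts.size()) return;

    // wrap the position in the periodic directions
    cdg.applyPointBC(parts.get(p));

    // query the processor owning the wrapped position
    Point<dim,St> xp = parts.template get<0>(p);
    int pr = cdg.processorID(xp);

    (void)pr; // a real kernel would write pr into an output vector
}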
#ifndef GRID_DIST_KEY_DX_HPP
#define GRID_DIST_KEY_DX_HPP
#include "Grid/map_grid.hpp"
/*! \brief Grid key for a distributed grid
*
* It contains which local sub-domain grid it comes from, and the local grid_key_dx
......
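// Hedged conceptual sketch (member names are illustrative, not taken from this
// commit): a distributed grid key essentially pairs the index of the local
// sub-domain grid it belongs to with a grid_key_dx local to that grid.
//
// template<unsigned int dim> struct grid_dist_key_sketch
// {
//     size_t sub_grid;        // which local sub-domain grid the key refers to
//     grid_key_dx<dim> key;   // position inside that local grid
// };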
@@ -62,7 +62,7 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,tru
auto ite = v_pos.getGPUIterator();
// First we have to see how many entries each particle produces
num_proc_ghost_each_part<3,float,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(proc_id_out.toKernel())>
num_proc_ghost_each_part<3,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(proc_id_out.toKernel())>
<<<ite.wthr,ite.thr>>>
(dec.toKernel(),v_pos.toKernel(),proc_id_out.toKernel());
@@ -84,7 +84,7 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,tru
ite = v_pos.getGPUIterator();
// we compute processor id for each particle
proc_label_id_ghost<3,float,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(starts.toKernel()),decltype(g_opart_device.toKernel())>
proc_label_id_ghost<3,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(starts.toKernel()),decltype(g_opart_device.toKernel())>
<<<ite.wthr,ite.thr>>>
(dec.toKernel(),v_pos.toKernel(),starts.toKernel(),g_opart_device.toKernel());
......
@@ -730,7 +730,7 @@ BOOST_AUTO_TEST_CASE( decomposition_to_gpu_test_use )
openfpm::vector_gpu<aggregate<int,int>> proc_id_out;
proc_id_out.resize(vg.size());
process_id_proc_each_part<decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(proc_id_out.toKernel())>
process_id_proc_each_part<3,float,decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(proc_id_out.toKernel())>
<<<ite.wthr,ite.thr>>>
(dec.toKernel(),vg.toKernel(),proc_id_out.toKernel(),v_cl.rank());
......
@@ -36,16 +36,17 @@ __global__ void num_proc_ghost_each_part(decomposition_type dec, vector_type vd,
out.template get<0>(p) = dec.ghost_processorID_N(xp);
}
template<typename cartdec_gpu, typename particles_type, typename vector_out>
template<unsigned int dim, typename St, typename cartdec_gpu, typename particles_type, typename vector_out>
__global__ void process_id_proc_each_part(cartdec_gpu cdg, particles_type parts, vector_out output , int rank)
{
int p = threadIdx.x + blockIdx.x * blockDim.x;
if (p >= parts.size()) return;
Point<3,float> xp = parts.template get<0>(p);
cdg.applyPointBC(parts.get(p));
Point<dim,St> xp = parts.template get<0>(p);
int pr = cdg.processorIDBC(xp);
int pr = cdg.processorID(xp);
output.template get<1>(p) = (pr == rank)?-1:pr;
output.template get<0>(p) = p;
......
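// With the kernel template parameters changed from the hard-coded <3,float> to
// <dim,St>, the same launch also compiles for double precision. A hedged sketch,
// assuming dec, vg and proc_id_out in the test above were built on double:
//
// process_id_proc_each_part<3,double,decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(proc_id_out.toKernel())>
// <<<ite.wthr,ite.thr>>>
// (dec.toKernel(),vg.toKernel(),proc_id_out.toKernel(),v_cl.rank());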
@@ -7,6 +7,18 @@
#define SUB_UNIT_FACTOR 1024
template<unsigned int dim , typename vector_dist_type>
__global__ void move_parts_gpu_test(vector_dist_type vd)
{
auto p = GET_PARTICLE(vd);
#pragma unroll
for (int i = 0 ; i < dim ; i++)
{
vd.getPos(p)[i] += 0.05;
}
}
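// Hedged usage sketch: later in this commit (inside vdist_calc_gpu_test) the kernel
// above is launched roughly as follows, with vd a vector_dist_gpu whose positions
// are already on the device:
//
// auto ite = vd.getDomainIteratorGPU();
// move_parts_gpu_test<3,decltype(vd.toKernel())><<<ite.wthr,ite.thr>>>(vd.toKernel());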
BOOST_AUTO_TEST_SUITE( vector_dist_gpu_test )
void print_test(std::string test, size_t sz)
@@ -242,6 +254,28 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_ghost_get )
BOOST_REQUIRE(tot_s > 1000);
}
template<typename vector_type, typename CellList_type, typename CellList_type_cpu>
void check_cell_list_cpu_and_gpu(vector_type & vd, CellList_type & NN, CellList_type_cpu & NN_cpu)
{
auto it5 = vd.getDomainIteratorGPU();
calculate_force<decltype(NN.toKernel())><<<it5.wthr,it5.thr>>>(vd.toKernel(),vd.toKernel_sorted(),NN.toKernel());
vd.template deviceToHostProp<1,2>();
bool test = check_force(NN_cpu,vd);
BOOST_REQUIRE_EQUAL(test,true);
// We do exactly the same test as before, but now we completely use the sorted version
calculate_force_full_sort<decltype(NN.toKernel())><<<it5.wthr,it5.thr>>>(vd.toKernel_sorted(),NN.toKernel());
vd.template deviceToHostProp<1>();
test = check_force(NN_cpu,vd);
BOOST_REQUIRE_EQUAL(test,true);
}
BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
{
auto & v_cl = create_vcluster();
@@ -345,23 +379,16 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
auto NN = vd.getCellListGPU(0.1);
auto NN_cpu = vd.getCellList(0.1);
auto it5 = vd.getDomainIteratorGPU();
auto NN_up = vd.getCellListGPU(0.1);
NN_up.clear();
vd.updateCellList(NN_up);
calculate_force<decltype(NN.toKernel())><<<it5.wthr,it5.thr>>>(vd.toKernel(),vd.toKernel_sorted(),NN.toKernel());
check_cell_list_cpu_and_gpu(vd,NN,NN_cpu);
check_cell_list_cpu_and_gpu(vd,NN_up,NN_cpu);
vd.template deviceToHostProp<1,2>();
bool test = check_force(NN_cpu,vd);
BOOST_REQUIRE_EQUAL(test,true);
// We do exactly the same test as before, but now we completely use the sorted version
calculate_force_full_sort<decltype(NN.toKernel())><<<it5.wthr,it5.thr>>>(vd.toKernel_sorted(),NN.toKernel());
// We check that we obtain the same result from updateCellList
vd.template deviceToHostProp<1>();
test = check_force(NN_cpu,vd);
BOOST_REQUIRE_EQUAL(test,true);
// check
@@ -377,13 +404,288 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
}
BOOST_AUTO_TEST_CASE( vector_dist_map_on_gpu_test)
template<typename St>
void vdist_calc_gpu_test()
{
auto & v_cl = create_vcluster();
if (v_cl.size() > 16)
{return;}
Box<3,St> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
// set the ghost based on the cut-off radius (made just a little bit smaller than the spacing)
Ghost<3,St> g(0.1);
// Boundary conditions
size_t bc[3]={PERIODIC,PERIODIC,PERIODIC};
vector_dist_gpu<3,St,aggregate<float,float[3],float[3]>> vd(1000,domain,bc,g);
auto it = vd.getDomainIterator();
while (it.isNext())
{
auto p = it.get();
vd.getPos(p)[0] = (St)rand() / RAND_MAX;
vd.getPos(p)[1] = (St)rand() / RAND_MAX;
vd.getPos(p)[2] = (St)rand() / RAND_MAX;
vd.template getProp<0>(p) = vd.getPos(p)[0] + vd.getPos(p)[1] + vd.getPos(p)[2];
vd.template getProp<1>(p)[0] = vd.getPos(p)[0];
vd.template getProp<1>(p)[1] = vd.getPos(p)[1];
vd.template getProp<1>(p)[2] = vd.getPos(p)[2];
vd.template getProp<2>(p)[0] = vd.getPos(p)[0] + vd.getPos(p)[1];
vd.template getProp<2>(p)[1] = vd.getPos(p)[0] + vd.getPos(p)[2];
vd.template getProp<2>(p)[2] = vd.getPos(p)[1] + vd.getPos(p)[2];
++it;
}
// move on device
vd.hostToDevicePos();
vd.template hostToDeviceProp<0,1,2>();
// Ok we redistribute the particles (GPU based)
vd.map(RUN_ON_DEVICE);
vd.deviceToHostPos();
vd.template deviceToHostProp<0,1,2>();
vd.write("write_start");
// Reset the host part
auto it3 = vd.getDomainIterator();
while (it3.isNext())
{
auto p = it3.get();
vd.getPos(p)[0] = 1.0;
vd.getPos(p)[1] = 1.0;
vd.getPos(p)[2] = 1.0;
vd.template getProp<0>(p) = 0.0;
vd.template getProp<1>(p)[0] = 0.0;
vd.template getProp<1>(p)[1] = 0.0;
vd.template getProp<1>(p)[2] = 0.0;
vd.template getProp<2>(p)[0] = 0.0;
vd.template getProp<2>(p)[1] = 0.0;
vd.template getProp<2>(p)[2] = 0.0;
++it3;
}
// we move from Device to CPU
vd.deviceToHostPos();
vd.template deviceToHostProp<0,1,2>();
// Check
auto it2 = vd.getDomainIterator();
bool match = true;
while (it2.isNext())
{
auto p = it2.get();
match &= vd.template getProp<0>(p) == vd.getPos(p)[0] + vd.getPos(p)[1] + vd.getPos(p)[2];
match &= vd.template getProp<1>(p)[0] == vd.getPos(p)[0];
match &= vd.template getProp<1>(p)[1] == vd.getPos(p)[1];
match &= vd.template getProp<1>(p)[2] == vd.getPos(p)[2];
match &= vd.template getProp<2>(p)[0] == vd.getPos(p)[0] + vd.getPos(p)[1];
match &= vd.template getProp<2>(p)[1] == vd.getPos(p)[0] + vd.getPos(p)[2];
match &= vd.template getProp<2>(p)[2] == vd.getPos(p)[1] + vd.getPos(p)[2];
++it2;
}
BOOST_REQUIRE_EQUAL(match,true);
// count local particles
size_t l_cnt = 0;
size_t nl_cnt = 0;
size_t n_out = 0;
// Domain + ghost box
Box<3,St> dom_ext = domain;
dom_ext.enlarge(g);
auto it5 = vd.getDomainIterator();
count_local_n_local<3>(vd,it5,bc,domain,dom_ext,l_cnt,nl_cnt,n_out);
BOOST_REQUIRE_EQUAL(n_out,0);
BOOST_REQUIRE_EQUAL(l_cnt,vd.size_local());
// we do 10 gpu steps (using a cpu vector to check that map and ghost_get work as expected)
for (size_t i = 0 ; i < 10 ; i++)
{
vd.map(RUN_ON_DEVICE);
vd.deviceToHostPos();
vd.template deviceToHostProp<0,1,2>();
vd.write_frame("write_ggg",i);
// To test, we copy onto a cpu distributed vector and do a map
vector_dist<3,St,aggregate<float,float[3],float[3]>> vd_cpu(vd.getDecomposition().template duplicate_convert<HeapMemory,memory_traits_lin>(),0);
auto itc = vd.getDomainIterator();
while (itc.isNext())
{
auto p = itc.get();
vd_cpu.add();
vd_cpu.getLastPos()[0] = vd.getPos(p)[0];
vd_cpu.getLastPos()[1] = vd.getPos(p)[1];
vd_cpu.getLastPos()[2] = vd.getPos(p)[2];
vd_cpu.template getLastProp<0>() = vd.template getProp<0>(p);
vd_cpu.template getLastProp<1>()[0] = vd.template getProp<1>(p)[0];
vd_cpu.template getLastProp<1>()[1] = vd.template getProp<1>(p)[1];
vd_cpu.template getLastProp<1>()[2] = vd.template getProp<1>(p)[2];
vd_cpu.template getLastProp<2>()[0] = vd.template getProp<2>(p)[0];
vd_cpu.template getLastProp<2>()[1] = vd.template getProp<2>(p)[1];
vd_cpu.template getLastProp<2>()[2] = vd.template getProp<2>(p)[2];
++itc;
}
vd_cpu.template ghost_get<0,1,2>();
vd.template ghost_get<0,1,2>(RUN_ON_DEVICE);
vd.deviceToHostPos();
vd.template deviceToHostProp<0,1,2>();
match = true;
// Particles in the gpu ghost and in the cpu ghost are not ordered in the same way, so we have to reorder
struct part
{
Point<3,St> xp;
float prp0;
float prp1[3];
float prp2[3];
bool operator<(const part & tmp) const
{
if (xp.get(0) < tmp.xp.get(0))
{return true;}
else if (xp.get(0) > tmp.xp.get(0))
{return false;}
if (xp.get(1) < tmp.xp.get(1))
{return true;}
else if (xp.get(1) > tmp.xp.get(1))
{return false;}
if (xp.get(2) < tmp.xp.get(2))
{return true;}
else if (xp.get(2) > tmp.xp.get(2))
{return false;}
return false;
}
};
openfpm::vector<part> cpu_sort;
openfpm::vector<part> gpu_sort;
cpu_sort.resize(vd_cpu.size_local_with_ghost() - vd_cpu.size_local());
gpu_sort.resize(vd.size_local_with_ghost() - vd.size_local());
size_t cnt = 0;
auto itc2 = vd.getGhostIterator();
while (itc2.isNext())
{
auto p = itc2.get();
cpu_sort.get(cnt).xp.get(0) = vd_cpu.getPos(p)[0];
gpu_sort.get(cnt).xp.get(0) = vd.getPos(p)[0];
cpu_sort.get(cnt).xp.get(1) = vd_cpu.getPos(p)[1];
gpu_sort.get(cnt).xp.get(1) = vd.getPos(p)[1];
cpu_sort.get(cnt).xp.get(2) = vd_cpu.getPos(p)[2];
gpu_sort.get(cnt).xp.get(2) = vd.getPos(p)[2];
cpu_sort.get(cnt).prp0 = vd_cpu.template getProp<0>(p);
gpu_sort.get(cnt).prp0 = vd.template getProp<0>(p);
cpu_sort.get(cnt).prp1[0] = vd_cpu.template getProp<1>(p)[0];
gpu_sort.get(cnt).prp1[0] = vd.template getProp<1>(p)[0];
cpu_sort.get(cnt).prp1[1] = vd_cpu.template getProp<1>(p)[1];
gpu_sort.get(cnt).prp1[1] = vd.template getProp<1>(p)[1];
cpu_sort.get(cnt).prp1[2] = vd_cpu.template getProp<1>(p)[2];
gpu_sort.get(cnt).prp1[2] = vd.template getProp<1>(p)[2];
cpu_sort.get(cnt).prp2[0] = vd_cpu.template getProp<2>(p)[0];
gpu_sort.get(cnt).prp2[0] = vd.template getProp<2>(p)[0];
cpu_sort.get(cnt).prp2[1] = vd_cpu.template getProp<2>(p)[1];
gpu_sort.get(cnt).prp2[1] = vd.template getProp<2>(p)[1];
cpu_sort.get(cnt).prp2[2] = vd_cpu.template getProp<2>(p)[2];
gpu_sort.get(cnt).prp2[2] = vd.template getProp<2>(p)[2];
++cnt;
++itc2;
}
cpu_sort.sort();
gpu_sort.sort();
for (size_t i = 0 ; i < cpu_sort.size() ; i++)
{
match &= cpu_sort.get(i).xp.get(0) == gpu_sort.get(i).xp.get(0);
match &= cpu_sort.get(i).xp.get(1) == gpu_sort.get(i).xp.get(1);
match &= cpu_sort.get(i).xp.get(2) == gpu_sort.get(i).xp.get(2);
match &= cpu_sort.get(i).prp0 == gpu_sort.get(i).prp0;
match &= cpu_sort.get(i).prp1[0] == gpu_sort.get(i).prp1[0];
match &= cpu_sort.get(i).prp1[1] == gpu_sort.get(i).prp1[1];
match &= cpu_sort.get(i).prp1[2] == gpu_sort.get(i).prp1[2];
match &= cpu_sort.get(i).prp2[0] == gpu_sort.get(i).prp2[0];
match &= cpu_sort.get(i).prp2[1] == gpu_sort.get(i).prp2[1];
match &= cpu_sort.get(i).prp2[2] == gpu_sort.get(i).prp2[2];
}
BOOST_REQUIRE_EQUAL(match,true);
// move particles on gpu
auto ite = vd.getDomainIteratorGPU();
move_parts_gpu_test<3,decltype(vd.toKernel())><<<ite.wthr,ite.thr>>>(vd.toKernel());
}
}
BOOST_AUTO_TEST_CASE( vector_dist_map_on_gpu_test)
{
vdist_calc_gpu_test<float>();
vdist_calc_gpu_test<double>();
/* auto & v_cl = create_vcluster();
if (v_cl.size() > 16)
{return;}
Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
// set the ghost based on the cut-off radius (made just a little bit smaller than the spacing)
@@ -424,6 +726,11 @@ BOOST_AUTO_TEST_CASE( vector_dist_map_on_gpu_test)
// Ok we redistribute the particles (GPU based)
vd.map(RUN_ON_DEVICE);
vd.deviceToHostPos();
vd.deviceToHostProp<0,1,2>();
vd.write("write_start");
// Reset the host part
auto it3 = vd.getDomainIterator();
@@ -494,143 +801,153 @@ BOOST_AUTO_TEST_CASE( vector_dist_map_on_gpu_test)
BOOST_REQUIRE_EQUAL(n_out,0);
BOOST_REQUIRE_EQUAL(l_cnt,vd.size_local());
vd.deviceToHostPos();
vd.deviceToHostProp<0,1,2>();
// we do 10 gpu steps (using a cpu vector to check that map and ghost_get work as expected)
// To test, we copy onto a cpu distributed vector and do a map
for (size_t i = 0 ; i < 10 ; i++)
{
vd.map(RUN_ON_DEVICE);
vector_dist<3,float,aggregate<float,float[3],float[3]>> vd_cpu(vd.getDecomposition().duplicate_convert<HeapMemory,memory_traits_lin>(),0);
vd.deviceToHostPos();
vd.deviceToHostProp<0,1,2>();
auto itc = vd.getDomainIterator();
vd.write_frame("write_ggg",i);
while (itc.isNext())
{
auto p = itc.get();
vd_cpu.add();
// To test, we copy onto a cpu distributed vector and do a map
vd_cpu.getLastPos()[0] = vd.getPos(p)[0];
vd_cpu.getLastPos()[1] = vd.getPos(p)[1];
vd_cpu.getLastPos()[2] = vd.getPos(p)[2];
vector_dist<3,float,aggregate<float,float[3],float[3]>> vd_cpu(vd.getDecomposition().duplicate_convert<HeapMemory,memory_traits_lin>(),0);
vd_cpu.getLastProp<0>() = vd.getProp<0>(p);
auto itc = vd.getDomainIterator();
vd_cpu.getLastProp<1>()[0] = vd.getProp<1>(p)[0];
vd_cpu.getLastProp<1>()[1] = vd.getProp<1>(p)[1];
vd_cpu.getLastProp<1>()[2] = vd.getProp<1>(p)[2];
while (itc.isNext())
{
auto p = itc.get();
vd_cpu.getLastProp<2>()[0] = vd.getProp<2>(p)[0];
vd_cpu.getLastProp<2>()[1] = vd.getProp<2>(p)[1];
vd_cpu.getLastProp<2>()[2] = vd.getProp<2>(p)[2];
vd_cpu.add();
++itc;
}
vd_cpu.getLastPos()[0] = vd.getPos(p)[0];
vd_cpu.getLastPos()[1] = vd.getPos(p)[1];
vd_cpu.getLastPos()[2] = vd.getPos(p)[2];
vd_cpu.ghost_get<0,1,2>();
vd.ghost_get<0,1,2>(RUN_ON_DEVICE);
vd_cpu.getLastProp<0>() = vd.getProp<0>(p);
vd.deviceToHostPos();
vd.deviceToHostProp<0,1,2>();
vd_cpu.getLastProp<1>()[0] = vd.getProp<1>(p)[0];
vd_cpu.getLastProp<1>()[1] = vd.getProp<1>(p)[1];
vd_cpu.getLastProp<1>()[2] = vd.getProp<1>(p)[2];
vd.write("write_test");
vd_cpu.write("write_test2");
vd_cpu.getLastProp<2>()[0] = vd.getProp<2>(p)[0];
vd_cpu.getLastProp<2>()[1] = vd.getProp<2>(p)[1];
vd_cpu.getLastProp<2>()[2] = vd.getProp<2>(p)[2];
match = true;
++itc;
}
// Particles in the gpu ghost and in the cpu ghost are not ordered in the same way, so we have to reorder
vd_cpu.ghost_get<0,1,2>();
vd.ghost_get<0,1,2>(RUN_ON_DEVICE);
struct part
{
Point<3,float> xp;
vd.deviceToHostPos();
vd.deviceToHostProp<0,1,2>();
float prp0;
float prp1[3];
float prp2[3];
match = true;
bool operator<(const part & tmp) const
// Particles in the gpu ghost and in the cpu ghost are not ordered in the same way, so we have to reorder