Commit 88c20d93 authored by incardon's avatar incardon

Sorted force now working

parent 0f7b39e1
......@@ -80,7 +80,7 @@ __global__ void calc_force_gpu(vector_dist_type vd, NN_type NN, real_number sigm
while (Np.isNext())
{
// ... q
auto q = Np.get();
auto q = Np.get_sort();
// if (p == q) skip this particle
if (q == p) {++Np; continue;};
......@@ -225,7 +225,7 @@ template<typename CellList> void calc_forces(vector_dist_gpu<3,real_number, aggr
// Get an iterator over particles
auto it2 = vd.getDomainIteratorGPU();
calc_force_gpu<<<it2.wthr,it2.thr>>>(vd.toKernel(),NN.toKernel(),sigma12,sigma6,r_cut2);
calc_force_gpu<<<it2.wthr,it2.thr>>>(vd.toKernel_sorted(),NN.toKernel(),sigma12,sigma6,r_cut2);
//! \cond [force calc] \endcond
......
openfpm_data @ 7ae9fb58
Subproject commit 7eb370cf88baa62a2c5ce0678edccc57da543392
Subproject commit 7ae9fb5851c9065623ee28628b8f8729332340cc
......@@ -822,6 +822,10 @@ public:
openfpm::vector<device_grid> & loc_grid,
std::unordered_map<size_t,size_t> & g_id_to_external_ghost_box)
{
#ifdef PROFILE_SCOREP
SCOREP_USER_REGION("ghost_get",SCOREP_USER_REGION_TYPE_FUNCTION)
#endif
size_t req = 0;
ExtPreAlloc<Memory> * prRecv_prp = NULL;
......
......@@ -833,6 +833,146 @@ BOOST_AUTO_TEST_CASE(vector_dist_reorder_lbl)
}
}
BOOST_AUTO_TEST_CASE(vector_dist_merge_sort)
{
openfpm::vector_gpu<aggregate<float[3],float[3],float[3]>> v_prp;
openfpm::vector_gpu<Point<3,float>> v_pos;
openfpm::vector_gpu<aggregate<float[3],float[3],float[3]>> v_prp_out;
openfpm::vector_gpu<Point<3,float>> v_pos_out;
openfpm::vector_gpu<aggregate<int>> ns_to_s;
v_prp.resize(10000);
v_pos.resize(10000);
v_prp_out.resize(10000);
v_pos_out.resize(10000);
ns_to_s.resize(10000);
for (int i = 0 ; i < 10000 ; i++) // <------ particle id
{
v_pos.template get<0>(i)[0] = i;
v_pos.template get<0>(i)[1] = i+10000;
v_pos.template get<0>(i)[2] = i+20000;
v_prp.template get<0>(i)[0] = i+60123;
v_prp.template get<0>(i)[1] = i+73543;
v_prp.template get<0>(i)[2] = i+82432;
v_prp.template get<1>(i)[0] = i+80123;
v_prp.template get<1>(i)[1] = i+93543;
v_prp.template get<1>(i)[2] = i+102432;
v_prp.template get<2>(i)[0] = i+110123;
v_prp.template get<2>(i)[1] = i+123543;
v_prp.template get<2>(i)[2] = i+132432;
v_prp_out.template get<0>(i)[0] = 0;
v_prp_out.template get<0>(i)[1] = 0;
v_prp_out.template get<0>(i)[2] = 0;
v_prp_out.template get<1>(i)[0] = 0;
v_prp_out.template get<1>(i)[1] = 0;
v_prp_out.template get<1>(i)[2] = 0;
v_prp_out.template get<2>(i)[0] = 0;
v_prp_out.template get<2>(i)[1] = 0;
v_prp_out.template get<2>(i)[2] = 0;
ns_to_s.template get<0>(i) = 10000-i-1;
}
v_prp.template hostToDevice<0,1,2>();
v_prp_out.template hostToDevice<0,1,2>();
v_pos.template hostToDevice<0>();
v_pos_out.template hostToDevice<0>();
ns_to_s.template hostToDevice<0>();
auto ite = v_pos.getGPUIterator();
merge_sort_part<false,decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(ns_to_s.toKernel()),0><<<ite.wthr,ite.thr>>>(v_pos.toKernel(),v_prp.toKernel(),
v_pos_out.toKernel(),v_prp_out.toKernel(),
ns_to_s.toKernel());
v_prp_out.template deviceToHost<0,1,2>();
bool match = true;
for (int i = 0 ; i < 10000 ; i++) // <------ particle id
{
match &= v_prp_out.template get<0>(10000-i-1)[0] == v_prp.template get<0>(i)[0];
match &= v_prp_out.template get<0>(10000-i-1)[1] == v_prp.template get<0>(i)[1];
match &= v_prp_out.template get<0>(10000-i-1)[2] == v_prp.template get<0>(i)[2];
match &= v_prp_out.template get<1>(10000-i-1)[0] == 0;
match &= v_prp_out.template get<1>(10000-i-1)[1] == 0;
match &= v_prp_out.template get<1>(10000-i-1)[2] == 0;
match &= v_prp_out.template get<2>(10000-i-1)[0] == 0;
match &= v_prp_out.template get<2>(10000-i-1)[1] == 0;
match &= v_prp_out.template get<2>(10000-i-1)[2] == 0;
}
BOOST_REQUIRE_EQUAL(match,true);
merge_sort_part<false,decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(ns_to_s.toKernel()),1,2><<<ite.wthr,ite.thr>>>(v_pos.toKernel(),v_prp.toKernel(),
v_pos_out.toKernel(),v_prp_out.toKernel(),
ns_to_s.toKernel());
v_prp_out.template deviceToHost<0,1,2>();
v_pos_out.template deviceToHost<0>();
for (int i = 0 ; i < 10000 ; i++) // <------ particle id
{
match &= v_prp_out.template get<0>(10000-i-1)[0] == v_prp.template get<0>(i)[0];
match &= v_prp_out.template get<0>(10000-i-1)[1] == v_prp.template get<0>(i)[1];
match &= v_prp_out.template get<0>(10000-i-1)[2] == v_prp.template get<0>(i)[2];
match &= v_prp_out.template get<1>(10000-i-1)[0] == v_prp.template get<1>(i)[0];
match &= v_prp_out.template get<1>(10000-i-1)[1] == v_prp.template get<1>(i)[1];
match &= v_prp_out.template get<1>(10000-i-1)[2] == v_prp.template get<1>(i)[2];
match &= v_prp_out.template get<2>(10000-i-1)[0] == v_prp.template get<2>(i)[0];
match &= v_prp_out.template get<2>(10000-i-1)[1] == v_prp.template get<2>(i)[1];
match &= v_prp_out.template get<2>(10000-i-1)[2] == v_prp.template get<2>(i)[2];
match &= v_pos_out.template get<0>(10000-i-1)[0] == 0;
match &= v_pos_out.template get<0>(10000-i-1)[1] == 0;
match &= v_pos_out.template get<0>(10000-i-1)[2] == 0;
}
BOOST_REQUIRE_EQUAL(match,true);
merge_sort_part<true,decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(ns_to_s.toKernel())><<<ite.wthr,ite.thr>>>(v_pos.toKernel(),v_prp.toKernel(),
v_pos_out.toKernel(),v_prp_out.toKernel(),
ns_to_s.toKernel());
v_prp_out.template deviceToHost<0,1,2>();
v_pos_out.template deviceToHost<0>();
for (int i = 0 ; i < 10000 ; i++) // <------ particle id
{
match &= v_prp_out.template get<0>(10000-i-1)[0] == v_prp.template get<0>(i)[0];
match &= v_prp_out.template get<0>(10000-i-1)[1] == v_prp.template get<0>(i)[1];
match &= v_prp_out.template get<0>(10000-i-1)[2] == v_prp.template get<0>(i)[2];
match &= v_prp_out.template get<1>(10000-i-1)[0] == v_prp.template get<1>(i)[0];
match &= v_prp_out.template get<1>(10000-i-1)[1] == v_prp.template get<1>(i)[1];
match &= v_prp_out.template get<1>(10000-i-1)[2] == v_prp.template get<1>(i)[2];
match &= v_prp_out.template get<2>(10000-i-1)[0] == v_prp.template get<2>(i)[0];
match &= v_prp_out.template get<2>(10000-i-1)[1] == v_prp.template get<2>(i)[1];
match &= v_prp_out.template get<2>(10000-i-1)[2] == v_prp.template get<2>(i)[2];
match &= v_pos_out.template get<0>(10000-i-1)[0] == v_pos.template get<0>(i)[0];
match &= v_pos_out.template get<0>(10000-i-1)[1] == v_pos.template get<0>(i)[1];
match &= v_pos_out.template get<0>(10000-i-1)[2] == v_pos.template get<0>(i)[2];
}
BOOST_REQUIRE_EQUAL(match,true);
}
BOOST_AUTO_TEST_CASE(vector_dist_gpu_map_fill_send_buffer_test)
{
openfpm::vector_gpu<aggregate<int,int>> m_opart;
......
......@@ -48,7 +48,7 @@ __global__ void apply_bc_each_part(Box<dim,St> domain, periodicity_int<dim> bc,
applyPointBC_no_dec(domain,bc,parts.get(p));
}
template<typename vector_pos_type, typename vector_prp_type, typename stns_type, unsigned int ... prp>
template<bool merge_pos, typename vector_pos_type, typename vector_prp_type, typename stns_type, unsigned int ... prp>
__global__ void merge_sort_part(vector_pos_type vd_pos, vector_prp_type vd_prp,
vector_pos_type v_pos_ord, vector_prp_type vd_prp_ord,
stns_type nss)
......@@ -57,6 +57,11 @@ __global__ void merge_sort_part(vector_pos_type vd_pos, vector_prp_type vd_prp,
if (p >= vd_pos.size()) return;
if (merge_pos == true)
{
vd_pos.template set<0>(p,v_pos_ord,nss.template get<0>(p));
}
vd_prp.template set<prp...>(p,vd_prp_ord,nss.template get<0>(p));
}
......
......@@ -90,9 +90,11 @@ __global__ void calculate_force(vector_dist_ker<3, float, aggregate<float, floa
template<typename CellList_type>
__global__ void calculate_force_full_sort(vector_dist_ker<3, float, aggregate<float, float[3], float [3]>> vd,
CellList_type cl)
CellList_type cl, int rank)
{
auto p = GET_PARTICLE(vd);
auto p = GET_PARTICLE_SORT(cl);
unsigned int ns_id = cl.getSortToNonSort().template get<0>(p);
Point<3,float> xp = vd.getPos(p);
......@@ -266,16 +268,32 @@ void check_cell_list_cpu_and_gpu(vector_type & vd, CellList_type & NN, CellList_
bool test = check_force(NN_cpu,vd);
BOOST_REQUIRE_EQUAL(test,true);
// We reset the property 1 on device
auto rst = vd.getDomainIterator();
while (rst.isNext())
{
auto p = rst.get();
vd.template getProp<1>(p)[0] = 0.0;
vd.template getProp<1>(p)[1] = 0.0;
vd.template getProp<1>(p)[2] = 0.0;
++rst;
}
vd.template hostToDeviceProp<1>();
// We do exactly the same test as before, but now we completely use the sorted version
calculate_force_full_sort<decltype(NN.toKernel())><<<it5.wthr,it5.thr>>>(vd.toKernel_sorted(),NN.toKernel());
calculate_force_full_sort<decltype(NN.toKernel())><<<it5.wthr,it5.thr>>>(vd.toKernel_sorted(),NN.toKernel(),create_vcluster().rank());
vd.template merge_sort<1>(NN);
vd.template deviceToHostProp<1>();
test = check_force(NN_cpu,vd);
BOOST_REQUIRE_EQUAL(test,true);
vd.template merge_sort<1>(NN);
}
BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
......
......@@ -1188,7 +1188,7 @@ public:
v_prp_out.resize(v_pos.size());
v_pos_out.resize(v_pos.size());
cell_list.template construct<decltype(v_pos),decltype(v_prp)>(v_pos,v_pos_out,v_prp,v_prp_out);
cell_list.template construct<decltype(v_pos),decltype(v_prp)>(v_pos,v_pos_out,v_prp,v_prp_out,v_cl.getmgpuContext(),g_m);
cell_list.set_ndec(getDecomposition().get_ndec());
......@@ -1251,7 +1251,7 @@ public:
if (to_reconstruct == false)
{
populate_cell_list(v_pos,v_pos_out,v_prp,v_prp_out,cell_list,g_m,CL_NON_SYMMETRIC);
populate_cell_list(v_pos,v_pos_out,v_prp,v_prp_out,cell_list,v_cl.getmgpuContext(),g_m,CL_NON_SYMMETRIC);
cell_list.set_gm(g_m);
}
......@@ -1283,7 +1283,7 @@ public:
if (to_reconstruct == false)
{
populate_cell_list(v_pos,v_pos_out,v_prp,v_prp_out,cell_list,g_m,CL_SYMMETRIC);
populate_cell_list(v_pos,v_pos_out,v_prp,v_prp_out,cell_list,v_cl.getmgpuContext(),g_m,CL_SYMMETRIC);
cell_list.set_gm(g_m);
}
......@@ -1854,7 +1854,90 @@ public:
auto ite = v_pos.getGPUIteratorTo(g_m,n_thr);
merge_sort_part<decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(cl.getNonSortedToSorted().toKernel()),prp...>
merge_sort_part<false,decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(cl.getNonSortToSort().toKernel()),prp...>
<<<ite.wthr,ite.thr>>>
(v_pos.toKernel(),v_prp.toKernel(),v_pos_out.toKernel(),v_prp_out.toKernel(),cl.getNonSortToSort().toKernel());
#endif
}
/*! \brief print a vector type property
*
* \param print_sorted (Print the sorted version)
*
* \tparam property
*
*/
template<unsigned int prp>
void debugPrintVector(bool print_sorted = false)
{
if (print_sorted == false)
{this->v_prp.template deviceToHost<prp>();}
else
{this->v_prp_out.template deviceToHost<prp>();}
auto it = this->getDomainIterator();
while(it.isNext())
{
auto p = it.get();
for (size_t i = 0 ; i < std::extent<typename boost::mpl::at<typename prop::type,boost::mpl::int_<prp>>::type>::value ; i++)
{
if (print_sorted == false)
{std::cout << v_prp.template get<prp>(p.getKey())[i] << " ";}
else
{std::cout << v_prp_out.template get<prp>(p.getKey())[i] << " ";}
}
std::cout << std::endl;
++it;
}
}
/*! \brief print a scalar type property
*
* \param print_sorted (Print the sorted version)
*
* \tparam property
*
*/
template<unsigned int prp>
void debugPrintScalar(bool print_sorted = false)
{
if (print_sorted == false)
{this->v_prp.template deviceToHost<prp>();}
else
{this->v_prp_out.template deviceToHost<prp>();}
auto it = this->getDomainIterator();
while(it.isNext())
{
auto p = it.get();
if (print_sorted == false)
{std::cout << v_prp_out.template get<prp>(p.getKey()) << " " << std::endl;}
else
{std::cout << v_prp_out.template get<prp>(p.getKey()) << " " << std::endl;}
++it;
}
}
/*! \brief Merge the properties calculated on the sorted vector on the original vector
*
* \parameter Cell-list from which has been constructed the sorted vector
*
*/
template<unsigned int ... prp> void merge_sort_with_pos(CellList_gpu<dim,St,CudaMemory,shift_only<dim, St>> & cl, size_t n_thr = 1024)
{
#if defined(__NVCC__)
auto ite = v_pos.getGPUIteratorTo(g_m,n_thr);
merge_sort_part<true,decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(cl.getNonSortedToSorted().toKernel()),prp...>
<<<ite.wthr,ite.thr>>>
(v_pos.toKernel(),v_prp.toKernel(),v_pos_out.toKernel(),v_prp_out.toKernel(),cl.getNonSortedToSorted().toKernel());
......
......@@ -1414,6 +1414,10 @@ public:
size_t & g_m,
size_t opt = WITH_POSITION)
{
#ifdef PROFILE_SCOREP
SCOREP_USER_REGION("ghost_get",SCOREP_USER_REGION_TYPE_FUNCTION)
#endif
// Sending property object
typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object;
......@@ -1436,34 +1440,32 @@ public:
{labelParticlesGhost(v_pos,v_prp,prc_g_opart,prc_sz,prc_offset,g_m,opt);}
// Send and receive ghost particle information
{
openfpm::vector<send_vector> g_send_prp;
fill_send_ghost_prp_buf<send_vector, prp_object, prp...>(v_prp,prc_sz,g_send_prp,opt);
openfpm::vector<send_vector> g_send_prp;
fill_send_ghost_prp_buf<send_vector, prp_object, prp...>(v_prp,prc_sz,g_send_prp,opt);
#if defined(CUDA_GPU) && defined(__NVCC__)
cudaDeviceSynchronize();
cudaDeviceSynchronize();
#endif
// if there are no properties skip
// SSendRecvP send everything when we do not give properties
// if there are no properties skip
// SSendRecvP send everything when we do not give properties
if (sizeof...(prp) != 0)
if (sizeof...(prp) != 0)
{
size_t opt_ = compute_options(opt);
if (opt & SKIP_LABELLING)
{
size_t opt_ = compute_options(opt);
if (opt & SKIP_LABELLING)
{
op_ssend_gg_recv_merge opm(g_m);
v_cl.template SSendRecvP_op<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
}
else
{v_cl.template SSendRecvP<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte,opt_);}
// fill g_opart_sz
g_opart_sz.resize(prc_g_opart.size());
for (size_t i = 0 ; i < prc_g_opart.size() ; i++)
g_opart_sz.get(i) = g_send_prp.get(i).size();
op_ssend_gg_recv_merge opm(g_m);
v_cl.template SSendRecvP_op<op_ssend_gg_recv_merge,send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,opm,prc_recv_get,recv_sz_get,opt_);
}
else
{v_cl.template SSendRecvP<send_vector,decltype(v_prp),layout_base,prp...>(g_send_prp,v_prp,prc_g_opart,prc_recv_get,recv_sz_get,recv_sz_get_byte,opt_);}
// fill g_opart_sz
g_opart_sz.resize(prc_g_opart.size());
for (size_t i = 0 ; i < prc_g_opart.size() ; i++)
g_opart_sz.get(i) = g_send_prp.get(i).size();
}
if (!(opt & NO_POSITION))
......@@ -1599,6 +1601,10 @@ public:
openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp, size_t & g_m,
size_t opt)
{
#ifdef PROFILE_SCOREP
SCOREP_USER_REGION("map",SCOREP_USER_REGION_TYPE_FUNCTION)
#endif
prc_sz.resize(v_cl.getProcessingUnits());
// map completely reset the ghost part
......
......@@ -13,6 +13,7 @@
#define POS_PROP -1
#define GET_PARTICLE(vd) blockDim.x*blockIdx.x + threadIdx.x; if (blockDim.x*blockIdx.x + threadIdx.x > vd.size()) {return;};
#define GET_PARTICLE_SORT(NN) NN.getDomainSortIds().template get<0>(blockDim.x*blockIdx.x + threadIdx.x); if (blockDim.x*blockIdx.x + threadIdx.x > NN.getDomainSortIds().size()) {return;};
template<unsigned int dim,
typename St,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment