Commit 1974b8bc authored by incardon

Fixing GPU tests for pdata

parent 11ddfc33
Pipeline #856 failed with stages in 23 seconds
cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
project(openfpm_pdata LANGUAGES C CXX)
enable_testing()
list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake_modules/)
......
......@@ -215,6 +215,7 @@ do
case $ac_useropt in
debug)
conf_options="$conf_options -DCMAKE_BUILD_TYPE=Debug"
debug_mode=1
;;
se_class1)
conf_options="$conf_options -DSE_CLASS1=ON"
......@@ -524,6 +525,10 @@ Try \`$0 --help' for more information"
esac
done
if [ x"$debug_mode" != x"1" ]; then
conf_options+="$conf_options -DCMAKE_BUILD_TYPE=Release"
fi
cd build
## remove everything
......
cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
########################### Executables
if(CUDA_FOUND)
......@@ -16,6 +17,9 @@ endif()
add_library(ofpm_pdata STATIC lib/pdata.cpp)
add_test(NAME pdata_3_proc COMMAND mpirun -np 3 ./pdata)
add_test(NAME pdata_4_proc COMMAND mpirun -np 4 ./pdata)
###########################
if (CUDA_FOUND)
......
......@@ -105,7 +105,7 @@ public:
}
CartDecomposition_gpu(const CartDecomposition_gpu<dim,T,Memory,layout_base> & dec)
:ie_ghost_gpu<dim,T,Memory,layout_base>(dec),clk(dec.clk),domain(dec.domain)
:ie_ghost_gpu<dim,T,Memory,layout_base>(dec),clk(dec.clk),domain(dec.domain),sub_domains_global(dec.sub_domains_global)
{
for (int s = 0 ; s < dim ; s++)
{this->bc[s] = dec.bc[s];}
......
......@@ -41,7 +41,7 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
// Vcluster
Vcluster<> & vcl = create_vcluster();
CartDecomposition<3, double> dec(vcl);
CartDecomposition<3, double, CudaMemory,memory_traits_inte> dec(vcl);
size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};
......@@ -87,6 +87,8 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
mem2.allocate(2*sizeof(unsigned int));
test_ghost_n<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer());
mem2.deviceToHost();
unsigned int tot = ((unsigned int *)mem2.getPointer())[0] + ((unsigned int *)mem2.getPointer())[1];
openfpm::vector_gpu<aggregate<int,int>> vd;
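// Reading note (assumption from context, not part of the commit): tot is the
// number of ghost replicas generated by the two test points together,
// presumably one counter per point written by test_ghost_n; vd is later
// resized to tot and filled with one entry per replica.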
......@@ -119,12 +121,16 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
test_proc_idbc<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem.getDevicePointer());
mem.deviceToHost();
BOOST_REQUIRE(((unsigned int *)mem.getPointer())[0] < vcl.size());
BOOST_REQUIRE(((unsigned int *)mem.getPointer())[1] < vcl.size());
mem2.allocate(2*sizeof(unsigned int));
test_ghost_n<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer());
mem2.deviceToHost();
tot = ((unsigned int *)mem2.getPointer())[0] + ((unsigned int *)mem2.getPointer())[1];
vd.resize(tot);
......@@ -132,7 +138,7 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
if (((unsigned int *)mem.getPointer())[0] != ((unsigned int *)mem.getPointer())[1])
{
if (vcl.rank() == ((unsigned int *)mem.getPointer())[2])
if (vcl.rank() == ((unsigned int *)mem.getPointer())[1])
{
BOOST_REQUIRE(((unsigned int *)mem2.getPointer())[1] != 0);
BOOST_REQUIRE(((unsigned int *)mem2.getPointer())[0] == 0);
......
......@@ -448,12 +448,17 @@ protected:
reorder_geo_cell();
}
/*! \brief in this function we reorder the cells by processors
/*! \brief in this function we reorder the list in each cell by processor id
 *
 * In practice every processor in the list is ordered. the geo_cell gives
 * suppose in one cell we have 7 boxes, each box contains a processor id
 *
 * 7 boxes: the first 2 boxes are related to processor 0 and the next 2 to processor 4; the other 3 must be related
 * to another processor different from 0 and 4. This simplifies the procedure to get a unique list of processor ids
 * 1,5,9,5,1,1,6
 *
 * after reorder we have the following sequence
 *
 * 1,1,1,5,5,6,9
 *
 * This simplifies the procedure to get a unique list of processor ids
 * indicating on which processor a particle must be replicated as ghost
 *
 */
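// A minimal host-side sketch of the reorder-then-unique idea described in the
// comment above (illustrative only, not the OpenFPM API): sorting makes equal
// processor ids adjacent, so a single std::unique pass extracts the unique list.
#include <algorithm>
#include <vector>

std::vector<int> unique_procs(std::vector<int> procs)  // e.g. {1,5,9,5,1,1,6}
{
	std::sort(procs.begin(), procs.end());             // {1,1,1,5,5,6,9}
	procs.erase(std::unique(procs.begin(), procs.end()), procs.end());
	return procs;                                      // {1,5,6,9}
}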
......
......@@ -108,29 +108,39 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,sca
mem.fill(0);
prc_offset.resize(v_cl.size());
ite = g_opart_device.getGPUIterator();
// Find the buffer bases
find_buffer_offsets<0,decltype(g_opart_device.toKernel()),decltype(prc_offset.toKernel())><<<ite.wthr,ite.thr>>>
(g_opart_device.toKernel(),(int *)mem.getDevicePointer(),prc_offset.toKernel());
// Transfer the number of offsets to the CPU
mem.deviceToHost();
prc_offset.template deviceToHost<0,1>();
if (g_opart_device.size() != 0)
{g_opart_device.template deviceToHost<0>(g_opart_device.size()-1,g_opart_device.size()-1);}
int noff = *(int *)mem.getPointer();
// In this case we do not have communications at all
if (g_opart_device.size() == 0)
{noff = -1;}
// create the terminal (closing) element of prc_offset
prc_offset.resize(noff+1,DATA_ON_DEVICE);
prc_offset.resize(noff+1);
// Move the last processor index (id) to the device
if (g_opart_device.size() != 0)
{g_opart_device.template deviceToHost<0>(g_opart_device.size()-1,g_opart_device.size()-1);}
prc_offset.template get<0>(prc_offset.size()-1) = g_opart_device.size();
if (g_opart_device.size() != 0)
{prc_offset.template get<1>(prc_offset.size()-1) = g_opart_device.template get<0>(g_opart_device.size()-1);}
else
{prc_offset.template get<1>(prc_offset.size()-1) = 0;}
prc_offset.template hostToDevice<0,1>(prc_offset.size()-1,prc_offset.size()-1);
// Here we reorder the offsets in ascending order
mergesort((int *)prc_offset.template getDeviceBuffer<0>(),(int *)prc_offset.template getDeviceBuffer<1>(), prc_offset.size(), mgpu::less_t<int>(), v_cl.getmgpuContext());
prc_offset.template deviceToHost<0,1>();
// In this case we do not have communications at all
if (g_opart_device.size() == 0)
{noff = -1;}
prc.resize(noff+1);
prc_sz.resize(noff+1);
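// CPU sketch of the (offset, processor) layout built above, with hypothetical
// names, assuming the labels arrive grouped by target processor: one pair is
// emitted per segment boundary, plus a terminal pair closing the last segment,
// which is what the resize to noff+1 accounts for.
#include <utility>
#include <vector>

std::vector<std::pair<int,int>> segment_offsets(const std::vector<int> & lbl)
{
	std::vector<std::pair<int,int>> off;               // (segment end, proc id)
	for (size_t p = 0 ; p + 1 < lbl.size() ; p++)
	{
		if (lbl[p] != lbl[p+1]) {off.push_back({(int)p+1, lbl[p]});}
	}
	if (lbl.size() != 0)                               // terminal element
	{off.push_back({(int)lbl.size(), lbl.back()});}
	return off;   // {0,0,2,2,2,3} -> {(2,0),(5,2),(6,3)}
}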
......@@ -236,9 +246,9 @@ struct local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,true>
auto ite = v_pos.getGPUIteratorTo(g_m);
// label particle processor
num_shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())>
num_shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())>
<<<ite.wthr,ite.thr>>>
(box_f_dev.toKernel(),v_pos.toKernel(),o_part_loc.toKernel());
(box_f_dev.toKernel(),box_f_sv.toKernel(),v_pos.toKernel(),o_part_loc.toKernel(),g_m);
starts.resize(o_part_loc.size());
mgpu::scan((unsigned int *)o_part_loc.template getDeviceBuffer<0>(), o_part_loc.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());
......
......@@ -63,7 +63,7 @@ __global__ void merge_sort_part(vector_pos_type vd_pos, vector_prp_type vd_prp,
vd_pos.template set<0>(p,v_pos_ord,nss.template get<0>(p));
}
vd_prp.template set<prp...>(p,vd_prp_ord,nss.template get<0>(p));
vd_prp.template set<prp ...>(p,vd_prp_ord,nss.template get<0>(p));
}
template<unsigned int dim, typename St, typename cartdec_gpu, typename particles_type, typename vector_out, typename prc_sz_type>
......@@ -99,7 +99,7 @@ __global__ void find_buffer_offsets(vector_type vd, int * cnt, vector_type_offs
{
int i = atomicAdd(cnt, 1);
offs.template get<0>(i) = p+1;
offs.template get<1>(i) = vd.template get<1>(p);
offs.template get<1>(i) = vd.template get<prp_off>(p);
}
}
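// Note on the change above (editorial, not part of the commit): atomicAdd
// hands out slots in arbitrary order, so the (offset, processor) pairs come
// out unsorted and are mergesorted afterwards (see labelParticlesGhost_impl);
// using get<prp_off>(p) instead of the hard-coded get<1>(p) lets the caller
// choose which property carries the processor id.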
......@@ -184,12 +184,13 @@ __global__ void process_ghost_particles_local(vector_g_opart_type g_opart, vecto
v_prp.set(base+i,v_prp.get(pid));
}
template<unsigned int dim, typename St, typename vector_of_box, typename vector_type, typename output_type>
__global__ void num_shift_ghost_each_part(vector_of_box box_f, vector_type vd, output_type out)
template<unsigned int dim, typename St, typename vector_of_box, typename vector_of_shifts, typename vector_type, typename output_type>
__global__ void num_shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_f_sv, vector_type vd, output_type out, unsigned int g_m)
{
unsigned int old_shift = (unsigned int)-1;
int p = threadIdx.x + blockIdx.x * blockDim.x;
if (p >= vd.size()) return;
if (p >= g_m) return;
Point<dim,St> xp = vd.template get<0>(p);
......@@ -197,8 +198,14 @@ __global__ void num_shift_ghost_each_part(vector_of_box box_f, vector_type vd,
for (unsigned int i = 0 ; i < box_f.size() ; i++)
{
if (Box<dim,St>(box_f.get(i)).isInsideNP(xp) == true)
{n++;}
unsigned int shift_actual = box_f_sv.template get<0>(i);
bool sw = (old_shift == shift_actual);
if (Box<dim,St>(box_f.get(i)).isInsideNP(xp) == true && sw == false)
{
old_shift = shift_actual;
n++;
}
}
out.template get<0>(p) = n;
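// Why a single old_shift register is enough (editorial note): box_f/box_f_sv
// are now sorted by shift id (see the reord_shift change in vector_dist_comm),
// so boxes sharing a shift are adjacent and each shift is counted at most once
// per particle, avoiding duplicated ghost replicas.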
......@@ -217,6 +224,7 @@ __global__ void shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_
start_type start, shifts_type shifts,
output_type output, unsigned int offset)
{
unsigned int old_shift = (unsigned int)-1;
int p = threadIdx.x + blockIdx.x * blockDim.x;
if (p >= v_pos.size()) return;
......@@ -231,20 +239,23 @@ __global__ void shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_
for (unsigned int i = 0 ; i < box_f.size() ; i++)
{
if (Box<dim,St>(box_f.get(i)).isInsideNP(xp) == true)
unsigned int shift_actual = box_f_sv.template get<0>(i);
bool sw = (old_shift == shift_actual);
if (Box<dim,St>(box_f.get(i)).isInsideNP(xp) == true && sw == false)
{
unsigned int shift_id = box_f_sv.template get<0>(i);
#pragma unroll
for (unsigned int j = 0 ; j < dim ; j++)
{
v_pos.template get<0>(base+n)[j] = xp.get(j) - shifts.template get<0>(shift_id)[j];
v_pos.template get<0>(base+n)[j] = xp.get(j) - shifts.template get<0>(shift_actual)[j];
output.template get<0>(base_o+n) = p;
output.template get<1>(base_o+n) = shift_id;
output.template get<1>(base_o+n) = shift_actual;
}
v_prp.set(base+n,v_prp.get(p));
old_shift = shift_actual;
n++;
}
}
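// Editorial note: this is the same adjacent-duplicate suppression as in
// num_shift_ghost_each_part above; the two kernels must skip exactly the same
// boxes, otherwise the per-particle counts produced by the preceding scan
// would not match the number of replicas written here.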
......
......@@ -50,8 +50,6 @@ __global__ void calculate_force(vector_dist_ker<3, float, aggregate<float, floa
auto it = cl.getNNIterator(cl.getCell(xp));
auto cell = cl.getCell(xp);
Point<3,float> force1({0.0,0.0,0.0});
Point<3,float> force2({0.0,0.0,0.0});
......@@ -95,14 +93,10 @@ __global__ void calculate_force_full_sort(vector_dist_ker<3, float, aggregate<f
unsigned int p;
GET_PARTICLE_SORT(p,cl);
unsigned int ns_id = cl.getSortToNonSort().template get<0>(p);
Point<3,float> xp = vd.getPos(p);
auto it = cl.getNNIterator(cl.getCell(xp));
auto cell = cl.getCell(xp);
Point<3,float> force1({0.0,0.0,0.0});
while (it.isNext())
......@@ -159,6 +153,12 @@ bool check_force(CellList_type & NN_cpu, vector_type & vd)
// Normalize
if (r2.norm() == 0)
{
int debug = 0;
debug++;
}
r2 /= r2.norm();
force += vd.template getProp<0>(q)*r2;
......@@ -314,15 +314,23 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
vector_dist_gpu<3,float,aggregate<float,float[3],float[3]>> vd(10000,domain,bc,g);
srand(55067*create_vcluster().rank());
auto it = vd.getDomainIterator();
while (it.isNext())
{
auto p = it.get();
vd.getPos(p)[0] = (float)rand() / RAND_MAX;
vd.getPos(p)[1] = (float)rand() / RAND_MAX;
vd.getPos(p)[2] = (float)rand() / RAND_MAX;
int x = rand();
int y = rand();
int z = rand();
vd.getPos(p)[0] = (float)x / RAND_MAX;
vd.getPos(p)[1] = (float)y / RAND_MAX;
vd.getPos(p)[2] = (float)z / RAND_MAX;
Point<3,float> xp = vd.getPos(p);
++it;
}
......@@ -358,8 +366,6 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
BOOST_REQUIRE_EQUAL(noOut,true);
BOOST_REQUIRE_EQUAL(cnt,vd.size_local());
vd.write("test_out_gpu");
// now we offload all the properties
auto it3 = vd.getDomainIteratorGPU();
......@@ -426,6 +432,7 @@ void vdist_calc_gpu_test()
vector_dist_gpu<3,St,aggregate<St,St[3],St[3]>> vd(1000,domain,bc,g);
srand(v_cl.rank()*10000);
auto it = vd.getDomainIterator();
while (it.isNext())
......@@ -459,8 +466,6 @@ void vdist_calc_gpu_test()
vd.deviceToHostPos();
vd.template deviceToHostProp<0,1,2>();
vd.write("write_start");
// Reset the host part
auto it3 = vd.getDomainIterator();
......@@ -542,9 +547,6 @@ void vdist_calc_gpu_test()
vd.deviceToHostPos();
vd.template deviceToHostProp<0,1,2>();
vd.write_frame("write_ggg",i);
// To test we copy on a cpu distributed vector and we do a map
vector_dist<3,St,aggregate<St,St[3],St[3]>> vd_cpu(vd.getDecomposition().template duplicate_convert<HeapMemory,memory_traits_lin>(),0);
......@@ -620,6 +622,8 @@ void vdist_calc_gpu_test()
cpu_sort.resize(vd_cpu.size_local_with_ghost() - vd_cpu.size_local());
gpu_sort.resize(vd.size_local_with_ghost() - vd.size_local());
BOOST_REQUIRE_EQUAL(cpu_sort.size(),gpu_sort.size());
size_t cnt = 0;
auto itc2 = vd.getGhostIterator();
......
......@@ -1192,6 +1192,7 @@ public:
cell_list.template construct<decltype(v_pos),decltype(v_prp)>(v_pos,v_pos_out,v_prp,v_prp_out,v_cl.getmgpuContext(),g_m);
cell_list.set_ndec(getDecomposition().get_ndec());
cell_list.set_gm(g_m);
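// Editorial note: g_m is the number of domain (non-ghost) particles; set_gm
// presumably stores it so GET_PARTICLE_SORT (see the vector_dist macros
// below) can return early for threads mapped past the domain particles.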
return cell_list;
}
......
......@@ -8,7 +8,7 @@
#ifndef SRC_VECTOR_VECTOR_DIST_COMM_HPP_
#define SRC_VECTOR_VECTOR_DIST_COMM_HPP_
//#define TEST1
#define TEST1
#if defined(CUDA_GPU) && defined(__NVCC__)
#include "util/cuda/moderngpu/kernel_mergesort.hxx"
......@@ -285,6 +285,21 @@ class vector_dist_comm
if (shift_box_ndec == (long int)dec.get_ndec())
return;
struct sh_box
{
size_t shift_id;
unsigned int box_f_sv;
Box<dim,St> box_f_dev;
bool operator<(const sh_box & tmp) const
{
return shift_id < tmp.shift_id;
}
};
openfpm::vector<struct sh_box> reord_shift;
// Add local particles coming from periodic boundary, the only boxes that count are the ones
// touching the border
for (size_t i = 0; i < dec.getNLocalSub(); i++)
......@@ -307,23 +322,32 @@ class vector_dist_comm
box_f.last().add(dec.getLocalIGhostBox(i, j));
box_cmb.add(dec.getLocalIGhostPos(i, j));
map_cmb[dec.getLocalIGhostPos(i, j).lin()] = box_f.size() - 1;
box_f_dev.add(dec.getLocalIGhostBox(i, j));
box_f_sv.add();
box_f_sv.template get<0>(box_f_sv.size()-1) = dec.convertShift(dec.getLocalIGhostPos(i, j));
}
else
{
// we have it
box_f.get(it->second).add(dec.getLocalIGhostBox(i, j));
box_f_dev.add(dec.getLocalIGhostBox(i, j));
box_f_sv.template get<0>(box_f_sv.size()-1) = dec.convertShift(dec.getLocalIGhostPos(i, j));
}
reord_shift.add();
reord_shift.last().shift_id = dec.getLocalIGhostPos(i, j).lin();
reord_shift.last().box_f_dev = dec.getLocalIGhostBox(i, j);
reord_shift.last().box_f_sv = dec.convertShift(dec.getLocalIGhostPos(i, j));
}
}
// now we sort box_f by shift_id; the reason is that we have to avoid duplicated particles
reord_shift.sort();
box_f_dev.resize(reord_shift.size());
box_f_sv.resize(reord_shift.size());
for (size_t i = 0 ; i < reord_shift.size() ; i++)
{
box_f_dev.get(i) = reord_shift.get(i).box_f_dev;
box_f_sv.template get<0>(i) = reord_shift.get(i).box_f_sv;
}
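// Design note (editorial): sorting by shift_id groups all boxes that share a
// shift vector, so a kernel walking the list can deduplicate with a single
// "previous shift" register instead of a per-particle set of visited shifts;
// reord_shift.sort() relies on the sh_box::operator< defined above.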
#ifdef CUDA_GPU
// move box_f_dev and box_f_sv to device
......@@ -547,7 +571,7 @@ class vector_dist_comm
const openfpm::vector<Point<dim,St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & shifts = dec.getShiftVectors();
// create a number of send buffers equal to the near processors
g_pos_send.resize(g_opart.size());
g_pos_send.resize(prc_sz.size());
resize_retained_buffer(hsmem,g_pos_send.size());
......@@ -894,7 +918,7 @@ class vector_dist_comm
size_t offset = prc_sz.template get<0>(0);
// Fill the sending fuffers
// Fill the sending buffers
for (size_t i = 0 ; i < m_pos.size() ; i++)
{
auto ite = m_pos.get(i).getGPUIterator();
......@@ -1074,6 +1098,7 @@ class vector_dist_comm
<<<ite.wthr,ite.thr>>>
(dec.toKernel(),v_pos.toKernel(),lbl_p.toKernel(),prc_sz.toKernel(),v_cl.rank());
#ifndef TEST1
// sort particles
......@@ -1123,9 +1148,10 @@ class vector_dist_comm
{
// reset lbl_p
lbl_p.clear();
prc_sz_gg.clear();
o_part_loc.clear();
g_opart.clear();
g_opart.resize(dec.getNNProcessors());
prc_g_opart.clear();
// resize the label buffer
prc_sz.template fill<0>(0);
......
......@@ -12,7 +12,7 @@
#define POS_PROP -1
#define GET_PARTICLE(vd) blockDim.x*blockIdx.x + threadIdx.x; if (blockDim.x*blockIdx.x + threadIdx.x > vd.size()) {return;};
#define GET_PARTICLE(vd) blockDim.x*blockIdx.x + threadIdx.x; if (blockDim.x*blockIdx.x + threadIdx.x >= vd.size()) {return;};
#define GET_PARTICLE_SORT(p,NN) if (blockDim.x*blockIdx.x + threadIdx.x >= NN.get_g_m()) {return;}\
else{p = NN.getDomainSortIds().template get<0>(blockDim.x*blockIdx.x + threadIdx.x);}
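// Editorial note: the > to >= change above fixes an off-by-one. For
// vd.size() == N, thread N previously passed the guard (N > N is false) and
// indexed one past the end. Sketch of typical use in a kernel (hypothetical):
//
//   auto p = GET_PARTICLE(vd);   // p in [0, vd.size()); out-of-range threads return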
......