Commit 1974b8bc authored by incardon's avatar incardon
Browse files

Fixing GPU tests for pdata

parent 11ddfc33
cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
project(openfpm_pdata LANGUAGES C CXX)
enable_testing()
list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake_modules/)
......
......@@ -215,6 +215,7 @@ do
case $ac_useropt in
debug)
conf_options="$conf_options -DCMAKE_BUILD_TYPE=Debug"
debug_mode=1
;;
se_class1)
conf_options="$conf_options -DSE_CLASS1=ON"
......@@ -524,6 +525,10 @@ Try \`$0 --help' for more information"
esac
done
if [ x"$debug_mode" != x"1" ]; then
conf_options+="$conf_options -DCMAKE_BUILD_TYPE=Release"
fi
cd build
## remove enerything
......
cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
########################### Executables
if(CUDA_FOUND)
......@@ -16,6 +17,9 @@ endif()
add_library(ofpm_pdata STATIC lib/pdata.cpp)
add_test(NAME pdata_3_proc COMMAND mpirun -np 3 ./pdata)
add_test(NAME pdata_4_proc COMMAND mpirun -np 4 ./pdata)
###########################
if (CUDA_FOUND)
......
......@@ -105,7 +105,7 @@ public:
}
CartDecomposition_gpu(const CartDecomposition_gpu<dim,T,Memory,layout_base> & dec)
:ie_ghost_gpu<dim,T,Memory,layout_base>(dec),clk(dec.clk),domain(dec.domain)
:ie_ghost_gpu<dim,T,Memory,layout_base>(dec),clk(dec.clk),domain(dec.domain),sub_domains_global(dec.sub_domains_global)
{
for (int s = 0 ; s < dim ; s++)
{this->bc[s] = dec.bc[s];}
......
......@@ -41,7 +41,7 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
// Vcluster
Vcluster<> & vcl = create_vcluster();
CartDecomposition<3, double> dec(vcl);
CartDecomposition<3, double, CudaMemory,memory_traits_inte> dec(vcl);
size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};
......@@ -87,6 +87,8 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
mem2.allocate(2*sizeof(unsigned int));
test_ghost_n<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer());
mem2.deviceToHost();
unsigned int tot = ((unsigned int *)mem2.getPointer())[0] + ((unsigned int *)mem2.getPointer())[1];
openfpm::vector_gpu<aggregate<int,int>> vd;
......@@ -119,12 +121,16 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
test_proc_idbc<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem.getDevicePointer());
mem.deviceToHost();
BOOST_REQUIRE(((unsigned int *)mem.getPointer())[0] < vcl.size());
BOOST_REQUIRE(((unsigned int *)mem.getPointer())[1] < vcl.size());
mem2.allocate(2*sizeof(unsigned int));
test_ghost_n<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer());
mem2.deviceToHost();
tot = ((unsigned int *)mem2.getPointer())[0] + ((unsigned int *)mem2.getPointer())[1];
vd.resize(tot);
......@@ -132,7 +138,7 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
if (((unsigned int *)mem.getPointer())[0] != ((unsigned int *)mem.getPointer())[1])
{
if (vcl.rank() == ((unsigned int *)mem.getPointer())[2])
if (vcl.rank() == ((unsigned int *)mem.getPointer())[1])
{
BOOST_REQUIRE(((unsigned int *)mem2.getPointer())[1] != 0);
BOOST_REQUIRE(((unsigned int *)mem2.getPointer())[0] == 0);
......
......@@ -448,12 +448,17 @@ protected:
reorder_geo_cell();
}
/*! \brief in this function we reorder the cells by processors
/*! \brief in this function we reorder the list in each cells by processor id
*
* In practice every processor in the list is ordered. the geo_cell give
* suppose in one cell we have 7 boxes each box contain the processor id
*
* 7 boxes the first 2 boxes are related to processor 0 and the next 2 to processor 4, the other 3 must me related
* to another processor different from 0 and 4. This simplify the procedure to get a unique list of processor ids
* 1,5,9,5,1,1,6
*
* after reorder we have the following sequence
*
* 1,1,1,5,5,6,9
*
* This simplify the procedure to get a unique list of processor ids
* indicating on which processor a particle must be replicated as ghost
*
*/
......
......@@ -108,29 +108,39 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,sca
mem.fill(0);
prc_offset.resize(v_cl.size());
ite = g_opart_device.getGPUIterator();
// Find the buffer bases
find_buffer_offsets<0,decltype(g_opart_device.toKernel()),decltype(prc_offset.toKernel())><<<ite.wthr,ite.thr>>>
(g_opart_device.toKernel(),(int *)mem.getDevicePointer(),prc_offset.toKernel());
// Trasfer the number of offsets on CPU
mem.deviceToHost();
prc_offset.template deviceToHost<0,1>();
if (g_opart_device.size() != 0)
{g_opart_device.template deviceToHost<0>(g_opart_device.size()-1,g_opart_device.size()-1);}
int noff = *(int *)mem.getPointer();
// In this case we do not have communications at all
if (g_opart_device.size() == 0)
{noff = -1;}
// create the terminal of prc_offset
prc_offset.resize(noff+1,DATA_ON_DEVICE);
prc_offset.resize(noff+1);
// Move the last processor index on device (id)
if (g_opart_device.size() != 0)
{g_opart_device.template deviceToHost<0>(g_opart_device.size()-1,g_opart_device.size()-1);}
prc_offset.template get<0>(prc_offset.size()-1) = g_opart_device.size();
if (g_opart_device.size() != 0)
{prc_offset.template get<1>(prc_offset.size()-1) = g_opart_device.template get<0>(g_opart_device.size()-1);}
else
{prc_offset.template get<1>(prc_offset.size()-1) = 0;}
prc_offset.template hostToDevice<0,1>(prc_offset.size()-1,prc_offset.size()-1);
// Here we reorder the offsets in ascending order
mergesort((int *)prc_offset.template getDeviceBuffer<0>(),(int *)prc_offset.template getDeviceBuffer<1>(), prc_offset.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());
prc_offset.template deviceToHost<0,1>();
// In this case we do not have communications at all
if (g_opart_device.size() == 0)
{noff = -1;}
prc.resize(noff+1);
prc_sz.resize(noff+1);
......@@ -236,9 +246,9 @@ struct local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,true>
auto ite = v_pos.getGPUIteratorTo(g_m);
// label particle processor
num_shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())>
num_shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())>
<<<ite.wthr,ite.thr>>>
(box_f_dev.toKernel(),v_pos.toKernel(),o_part_loc.toKernel());
(box_f_dev.toKernel(),box_f_sv.toKernel(),v_pos.toKernel(),o_part_loc.toKernel(),g_m);
starts.resize(o_part_loc.size());
mgpu::scan((unsigned int *)o_part_loc.template getDeviceBuffer<0>(), o_part_loc.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());
......
......@@ -98,9 +98,9 @@ BOOST_AUTO_TEST_CASE( vector_ghost_process_local_particles )
v_prp.hostToDevice<0,1,2>();
// label particle processor
num_shift_ghost_each_part<3,float,decltype(box_f_dev.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())>
num_shift_ghost_each_part<3,float,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())>
<<<ite.wthr,ite.thr>>>
(box_f_dev.toKernel(),v_pos.toKernel(),o_part_loc.toKernel());
(box_f_dev.toKernel(),box_f_sv.toKernel(),v_pos.toKernel(),o_part_loc.toKernel(),v_pos.size());
o_part_loc.deviceToHost<0>();
......@@ -734,11 +734,16 @@ BOOST_AUTO_TEST_CASE( decomposition_to_gpu_test_use )
proc_id_out.resize(vg.size());
openfpm::vector_gpu<aggregate<int,int,int>> dev_counter;
dev_counter.resize(10);
dev_counter.fill<0>(0);
dev_counter.fill<1>(0);
dev_counter.fill<2>(0);
process_id_proc_each_part<3,float,decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(proc_id_out.toKernel()),decltype(dev_counter.toKernel())>
<<<ite.wthr,ite.thr>>>
(dec.toKernel(),vg.toKernel(),proc_id_out.toKernel(),dev_counter.toKernel(),v_cl.rank());
proc_id_out.deviceToHost<0>();
bool match = true;
......@@ -852,33 +857,33 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort)
for (int i = 0 ; i < 10000 ; i++) // <------ particle id
{
v_pos.template get<0>(i)[0] = i;
v_pos.template get<0>(i)[1] = i+10000;
v_pos.template get<0>(i)[2] = i+20000;
v_pos_out.template get<0>(i)[0] = i;
v_pos_out.template get<0>(i)[1] = i+10000;
v_pos_out.template get<0>(i)[2] = i+20000;
v_prp.template get<0>(i)[0] = i+60123;
v_prp.template get<0>(i)[1] = i+73543;
v_prp.template get<0>(i)[2] = i+82432;
v_prp_out.template get<0>(i)[0] = i+60123;
v_prp_out.template get<0>(i)[1] = i+73543;
v_prp_out.template get<0>(i)[2] = i+82432;
v_prp.template get<1>(i)[0] = i+80123;
v_prp.template get<1>(i)[1] = i+93543;
v_prp.template get<1>(i)[2] = i+102432;
v_prp_out.template get<1>(i)[0] = i+80123;
v_prp_out.template get<1>(i)[1] = i+93543;
v_prp_out.template get<1>(i)[2] = i+102432;
v_prp.template get<2>(i)[0] = i+110123;
v_prp.template get<2>(i)[1] = i+123543;
v_prp.template get<2>(i)[2] = i+132432;
v_prp_out.template get<2>(i)[0] = i+110123;
v_prp_out.template get<2>(i)[1] = i+123543;
v_prp_out.template get<2>(i)[2] = i+132432;
v_prp_out.template get<0>(i)[0] = 0;
v_prp_out.template get<0>(i)[1] = 0;
v_prp_out.template get<0>(i)[2] = 0;
v_prp.template get<0>(i)[0] = 0;
v_prp.template get<0>(i)[1] = 0;
v_prp.template get<0>(i)[2] = 0;
v_prp_out.template get<1>(i)[0] = 0;
v_prp_out.template get<1>(i)[1] = 0;
v_prp_out.template get<1>(i)[2] = 0;
v_prp.template get<1>(i)[0] = 0;
v_prp.template get<1>(i)[1] = 0;
v_prp.template get<1>(i)[2] = 0;
v_prp_out.template get<2>(i)[0] = 0;
v_prp_out.template get<2>(i)[1] = 0;
v_prp_out.template get<2>(i)[2] = 0;
v_prp.template get<2>(i)[0] = 0;
v_prp.template get<2>(i)[1] = 0;
v_prp.template get<2>(i)[2] = 0;
ns_to_s.template get<0>(i) = 10000-i-1;
}
......@@ -895,7 +900,7 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort)
v_pos_out.toKernel(),v_prp_out.toKernel(),
ns_to_s.toKernel());
v_prp_out.template deviceToHost<0,1,2>();
v_prp.template deviceToHost<0,1,2>();
bool match = true;
for (int i = 0 ; i < 10000 ; i++) // <------ particle id
......@@ -904,13 +909,13 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort)
match &= v_prp_out.template get<0>(10000-i-1)[1] == v_prp.template get<0>(i)[1];
match &= v_prp_out.template get<0>(10000-i-1)[2] == v_prp.template get<0>(i)[2];
match &= v_prp_out.template get<1>(10000-i-1)[0] == 0;
match &= v_prp_out.template get<1>(10000-i-1)[1] == 0;
match &= v_prp_out.template get<1>(10000-i-1)[2] == 0;
match &= v_prp.template get<1>(10000-i-1)[0] == 0;
match &= v_prp.template get<1>(10000-i-1)[1] == 0;
match &= v_prp.template get<1>(10000-i-1)[2] == 0;
match &= v_prp_out.template get<2>(10000-i-1)[0] == 0;
match &= v_prp_out.template get<2>(10000-i-1)[1] == 0;
match &= v_prp_out.template get<2>(10000-i-1)[2] == 0;
match &= v_prp.template get<2>(10000-i-1)[0] == 0;
match &= v_prp.template get<2>(10000-i-1)[1] == 0;
match &= v_prp.template get<2>(10000-i-1)[2] == 0;
}
BOOST_REQUIRE_EQUAL(match,true);
......@@ -919,8 +924,8 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort)
v_pos_out.toKernel(),v_prp_out.toKernel(),
ns_to_s.toKernel());
v_prp_out.template deviceToHost<0,1,2>();
v_pos_out.template deviceToHost<0>();
v_prp.template deviceToHost<0,1,2>();
v_pos.template deviceToHost<0>();
for (int i = 0 ; i < 10000 ; i++) // <------ particle id
{
......@@ -937,9 +942,9 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort)
match &= v_prp_out.template get<2>(10000-i-1)[2] == v_prp.template get<2>(i)[2];
match &= v_pos_out.template get<0>(10000-i-1)[0] == 0;
match &= v_pos_out.template get<0>(10000-i-1)[1] == 0;
match &= v_pos_out.template get<0>(10000-i-1)[2] == 0;
match &= v_pos.template get<0>(10000-i-1)[0] == 0;
match &= v_pos.template get<0>(10000-i-1)[1] == 0;
match &= v_pos.template get<0>(10000-i-1)[2] == 0;
}
BOOST_REQUIRE_EQUAL(match,true);
......@@ -948,11 +953,13 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort)
v_pos_out.toKernel(),v_prp_out.toKernel(),
ns_to_s.toKernel());
v_prp_out.template deviceToHost<0,1,2>();
v_pos_out.template deviceToHost<0>();
v_prp.template deviceToHost<0,1,2>();
v_pos.template deviceToHost<0>();
for (int i = 0 ; i < 10000 ; i++) // <------ particle id
{
match &= v_prp_out.template get<0>(10000-i-1)[0] == v_prp.template get<0>(i)[0];
match &= v_prp_out.template get<0>(10000-i-1)[1] == v_prp.template get<0>(i)[1];
match &= v_prp_out.template get<0>(10000-i-1)[2] == v_prp.template get<0>(i)[2];
......@@ -1010,8 +1017,8 @@ BOOST_AUTO_TEST_CASE(vector_dist_gpu_map_fill_send_buffer_test)
v_prp.template get<2>(i)[2][2] = 340.0 + (float)rand()/RAND_MAX;
int seg = i / 10000;
m_opart.template get<0>(i) = seg;
m_opart.template get<1>(i) = (9999 - i%10000) + seg * 10000;
m_opart.template get<1>(i) = seg;
m_opart.template get<0>(i) = (9999 - i%10000) + seg * 10000;
}
m_pos.resize(10);
......@@ -1045,24 +1052,24 @@ BOOST_AUTO_TEST_CASE(vector_dist_gpu_map_fill_send_buffer_test)
for (size_t j = 0 ; j < m_pos.get(i).size() ; j++)
{
match &= (m_pos.get(i).template get<0>(j)[0] == v_pos.template get<0>(m_opart.template get<1>(offset+j))[0]);
match &= (m_pos.get(i).template get<0>(j)[1] == v_pos.template get<0>(m_opart.template get<1>(offset+j))[1]);
match &= (m_pos.get(i).template get<0>(j)[2] == v_pos.template get<0>(m_opart.template get<1>(offset+j))[2]);
match &= (m_prp.get(i).template get<0>(j) == v_prp.template get<0>(m_opart.template get<1>(offset+j)));
match &= (m_prp.get(i).template get<1>(j)[0] == v_prp.template get<1>(m_opart.template get<1>(offset+j))[0]);
match &= (m_prp.get(i).template get<1>(j)[1] == v_prp.template get<1>(m_opart.template get<1>(offset+j))[1]);
match &= (m_prp.get(i).template get<2>(j)[0][0] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[0][0]);
match &= (m_prp.get(i).template get<2>(j)[0][1] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[0][1]);
match &= (m_prp.get(i).template get<2>(j)[0][2] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[0][2]);
match &= (m_prp.get(i).template get<2>(j)[1][0] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[1][0]);
match &= (m_prp.get(i).template get<2>(j)[1][1] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[1][1]);
match &= (m_prp.get(i).template get<2>(j)[1][2] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[1][2]);
match &= (m_prp.get(i).template get<2>(j)[2][0] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[2][0]);
match &= (m_prp.get(i).template get<2>(j)[2][1] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[2][1]);
match &= (m_prp.get(i).template get<2>(j)[2][2] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[2][2]);
match &= (m_pos.get(i).template get<0>(j)[0] == v_pos.template get<0>(m_opart.template get<0>(offset+j))[0]);
match &= (m_pos.get(i).template get<0>(j)[1] == v_pos.template get<0>(m_opart.template get<0>(offset+j))[1]);
match &= (m_pos.get(i).template get<0>(j)[2] == v_pos.template get<0>(m_opart.template get<0>(offset+j))[2]);
match &= (m_prp.get(i).template get<0>(j) == v_prp.template get<0>(m_opart.template get<0>(offset+j)));
match &= (m_prp.get(i).template get<1>(j)[0] == v_prp.template get<1>(m_opart.template get<0>(offset+j))[0]);
match &= (m_prp.get(i).template get<1>(j)[1] == v_prp.template get<1>(m_opart.template get<0>(offset+j))[1]);
match &= (m_prp.get(i).template get<2>(j)[0][0] == v_prp.template get<2>(m_opart.template get<0>(offset+j))[0][0]);
match &= (m_prp.get(i).template get<2>(j)[0][1] == v_prp.template get<2>(m_opart.template get<0>(offset+j))[0][1]);
match &= (m_prp.get(i).template get<2>(j)[0][2] == v_prp.template get<2>(m_opart.template get<0>(offset+j))[0][2]);
match &= (m_prp.get(i).template get<2>(j)[1][0] == v_prp.template get<2>(m_opart.template get<0>(offset+j))[1][0]);
match &= (m_prp.get(i).template get<2>(j)[1][1] == v_prp.template get<2>(m_opart.template get<0>(offset+j))[1][1]);
match &= (m_prp.get(i).template get<2>(j)[1][2] == v_prp.template get<2>(m_opart.template get<0>(offset+j))[1][2]);
match &= (m_prp.get(i).template get<2>(j)[2][0] == v_prp.template get<2>(m_opart.template get<0>(offset+j))[2][0]);
match &= (m_prp.get(i).template get<2>(j)[2][1] == v_prp.template get<2>(m_opart.template get<0>(offset+j))[2][1]);
match &= (m_prp.get(i).template get<2>(j)[2][2] == v_prp.template get<2>(m_opart.template get<0>(offset+j))[2][2]);
}
BOOST_REQUIRE_EQUAL(match,true);
......
......@@ -63,7 +63,7 @@ __global__ void merge_sort_part(vector_pos_type vd_pos, vector_prp_type vd_prp,
vd_pos.template set<0>(p,v_pos_ord,nss.template get<0>(p));
}
vd_prp.template set<prp...>(p,vd_prp_ord,nss.template get<0>(p));
vd_prp.template set<prp ...>(p,vd_prp_ord,nss.template get<0>(p));
}
template<unsigned int dim, typename St, typename cartdec_gpu, typename particles_type, typename vector_out, typename prc_sz_type>
......@@ -99,7 +99,7 @@ __global__ void find_buffer_offsets(vector_type vd, int * cnt, vector_type_offs
{
int i = atomicAdd(cnt, 1);
offs.template get<0>(i) = p+1;
offs.template get<1>(i) = vd.template get<1>(p);
offs.template get<1>(i) = vd.template get<prp_off>(p);
}
}
......@@ -184,12 +184,13 @@ __global__ void process_ghost_particles_local(vector_g_opart_type g_opart, vecto
v_prp.set(base+i,v_prp.get(pid));
}
template<unsigned int dim, typename St, typename vector_of_box, typename vector_type, typename output_type>
__global__ void num_shift_ghost_each_part(vector_of_box box_f, vector_type vd, output_type out)
template<unsigned int dim, typename St, typename vector_of_box, typename vector_of_shifts, typename vector_type, typename output_type>
__global__ void num_shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_f_sv, vector_type vd, output_type out, unsigned int g_m)
{
unsigned int old_shift = (unsigned int)-1;
int p = threadIdx.x + blockIdx.x * blockDim.x;
if (p >= vd.size()) return;
if (p >= g_m) return;
Point<dim,St> xp = vd.template get<0>(p);
......@@ -197,8 +198,14 @@ __global__ void num_shift_ghost_each_part(vector_of_box box_f, vector_type vd,
for (unsigned int i = 0 ; i < box_f.size() ; i++)
{
if (Box<dim,St>(box_f.get(i)).isInsideNP(xp) == true)
{n++;}
unsigned int shift_actual = box_f_sv.template get<0>(i);
bool sw = (old_shift == shift_actual)?true:false;
if (Box<dim,St>(box_f.get(i)).isInsideNP(xp) == true && sw == false)
{
old_shift = shift_actual;
n++;
}
}
out.template get<0>(p) = n;
......@@ -217,6 +224,7 @@ __global__ void shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_
start_type start, shifts_type shifts,
output_type output, unsigned int offset)
{
unsigned int old_shift = (unsigned int)-1;
int p = threadIdx.x + blockIdx.x * blockDim.x;
if (p >= v_pos.size()) return;
......@@ -231,20 +239,23 @@ __global__ void shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_
for (unsigned int i = 0 ; i < box_f.size() ; i++)
{
if (Box<dim,St>(box_f.get(i)).isInsideNP(xp) == true)
unsigned int shift_actual = box_f_sv.template get<0>(i);
bool sw = (old_shift == shift_actual)?true:false;
if (Box<dim,St>(box_f.get(i)).isInsideNP(xp) == true && sw == false)
{
unsigned int shift_id = box_f_sv.template get<0>(i);
#pragma unroll
for (unsigned int j = 0 ; j < dim ; j++)
{
v_pos.template get<0>(base+n)[j] = xp.get(j) - shifts.template get<0>(shift_id)[j];
v_pos.template get<0>(base+n)[j] = xp.get(j) - shifts.template get<0>(shift_actual)[j];
output.template get<0>(base_o+n) = p;
output.template get<1>(base_o+n) = shift_id;
output.template get<1>(base_o+n) = shift_actual;
}
v_prp.set(base+n,v_prp.get(p));
old_shift = shift_actual;
n++;
}
}
......
......@@ -50,8 +50,6 @@ __global__ void calculate_force(vector_dist_ker<3, float, aggregate<float, floa
auto it = cl.getNNIterator(cl.getCell(xp));
auto cell = cl.getCell(xp);
Point<3,float> force1({0.0,0.0,0.0});
Point<3,float> force2({0.0,0.0,0.0});
......@@ -95,14 +93,10 @@ __global__ void calculate_force_full_sort(vector_dist_ker<3, float, aggregate<f
unsigned int p;
GET_PARTICLE_SORT(p,cl);
unsigned int ns_id = cl.getSortToNonSort().template get<0>(p);
Point<3,float> xp = vd.getPos(p);
auto it = cl.getNNIterator(cl.getCell(xp));
auto cell = cl.getCell(xp);
Point<3,float> force1({0.0,0.0,0.0});
while (it.isNext())
......@@ -159,6 +153,12 @@ bool check_force(CellList_type & NN_cpu, vector_type & vd)
// Normalize
if (r2.norm() == 0)
{
int debug = 0;
debug++;
}
r2 /= r2.norm();
force += vd.template getProp<0>(q)*r2;
......@@ -314,15 +314,23 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
vector_dist_gpu<3,float,aggregate<float,float[3],float[3]>> vd(10000,domain,bc,g);
srand(55067*create_vcluster().rank());
auto it = vd.getDomainIterator();
while (it.isNext())
{
auto p = it.get();
vd.getPos(p)[0] = (float)rand() / RAND_MAX;
vd.getPos(p)[1] = (float)rand() / RAND_MAX;
vd.getPos(p)[2] = (float)rand() / RAND_MAX;
int x = rand();
int y = rand();
int z = rand();
vd.getPos(p)[0] = (float)x / RAND_MAX;
vd.getPos(p)[1] = (float)y / RAND_MAX;
vd.getPos(p)[2] = (float)z / RAND_MAX;
Point<3,float> xp = vd.getPos(p);
++it;
}
......@@ -358,8 +366,6 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
BOOST_REQUIRE_EQUAL(noOut,true);
BOOST_REQUIRE_EQUAL(cnt,vd.size_local());
vd.write("test_out_gpu");
// now we offload all the properties
auto it3 = vd.getDomainIteratorGPU();
......@@ -426,6 +432,7 @@ void vdist_calc_gpu_test()
vector_dist_gpu<3,St,aggregate<St,St[3],St[3]>> vd(1000,domain,bc,g);
srand(v_cl.rank()*10000);
auto it = vd.getDomainIterator();
while (it.isNext())
......@@ -459,8 +466,6 @@ void vdist_calc_gpu_test()
vd.deviceToHostPos();
vd.template deviceToHostProp<0,1,2>();
vd.write("write_start");