From 1974b8bcb89eeb0d4b006eded77dfa712f312ec9 Mon Sep 17 00:00:00 2001 From: Pietro Incardona <incardon@mpi-cbg.de> Date: Mon, 12 Nov 2018 19:00:43 +0100 Subject: [PATCH] Fixing GPU tests for pdata --- CMakeLists.txt | 1 + configure | 5 + src/CMakeLists.txt | 4 + .../cuda/CartDecomposition_gpu.cuh | 2 +- .../cuda/decomposition_cuda_tests.cu | 10 +- src/Decomposition/ie_ghost.hpp | 13 +- .../cuda/vector_dist_comm_util_funcs.cuh | 30 +++-- src/Vector/cuda/vector_dist_cuda_func_test.cu | 121 +++++++++--------- src/Vector/cuda/vector_dist_cuda_funcs.cuh | 33 +++-- src/Vector/cuda/vector_dist_gpu_unit_tests.cu | 36 +++--- src/Vector/vector_dist.hpp | 1 + src/Vector/vector_dist_comm.hpp | 48 +++++-- src/Vector/vector_dist_kernel.hpp | 2 +- 13 files changed, 193 insertions(+), 113 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b6d6967f..0c1b0193 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,7 @@ cmake_minimum_required(VERSION 3.8 FATAL_ERROR) project(openfpm_pdata LANGUAGES C CXX) +enable_testing() list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake_modules/) diff --git a/configure b/configure index d04dd1f6..090620f0 100755 --- a/configure +++ b/configure @@ -215,6 +215,7 @@ do case $ac_useropt in debug) conf_options="$conf_options -DCMAKE_BUILD_TYPE=Debug" + debug_mode=1 ;; se_class1) conf_options="$conf_options -DSE_CLASS1=ON" @@ -524,6 +525,10 @@ Try \`$0 --help' for more information" esac done +if [ x"$debug_mode" != x"1" ]; then + conf_options+="$conf_options -DCMAKE_BUILD_TYPE=Release" +fi + cd build ## remove enerything diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 41f96ed9..31317c79 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,5 +1,6 @@ cmake_minimum_required(VERSION 3.8 FATAL_ERROR) + ########################### Executables if(CUDA_FOUND) @@ -16,6 +17,9 @@ endif() add_library(ofpm_pdata STATIC lib/pdata.cpp) +add_test(NAME pdata_3_proc COMMAND mpirun -np 3 ./pdata) +add_test(NAME pdata_4_proc COMMAND mpirun -np 4 ./pdata) + ########################### if (CUDA_FOUND) diff --git a/src/Decomposition/cuda/CartDecomposition_gpu.cuh b/src/Decomposition/cuda/CartDecomposition_gpu.cuh index d986d1d7..de38fe76 100644 --- a/src/Decomposition/cuda/CartDecomposition_gpu.cuh +++ b/src/Decomposition/cuda/CartDecomposition_gpu.cuh @@ -105,7 +105,7 @@ public: } CartDecomposition_gpu(const CartDecomposition_gpu<dim,T,Memory,layout_base> & dec) - :ie_ghost_gpu<dim,T,Memory,layout_base>(dec),clk(dec.clk),domain(dec.domain) + :ie_ghost_gpu<dim,T,Memory,layout_base>(dec),clk(dec.clk),domain(dec.domain),sub_domains_global(dec.sub_domains_global) { for (int s = 0 ; s < dim ; s++) {this->bc[s] = dec.bc[s];} diff --git a/src/Decomposition/cuda/decomposition_cuda_tests.cu b/src/Decomposition/cuda/decomposition_cuda_tests.cu index 2d40d636..81ae0b4a 100644 --- a/src/Decomposition/cuda/decomposition_cuda_tests.cu +++ b/src/Decomposition/cuda/decomposition_cuda_tests.cu @@ -41,7 +41,7 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb // Vcluster Vcluster<> & vcl = create_vcluster(); - CartDecomposition<3, double> dec(vcl); + CartDecomposition<3, double, CudaMemory,memory_traits_inte> dec(vcl); size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC}; @@ -87,6 +87,8 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb mem2.allocate(2*sizeof(unsigned int)); test_ghost_n<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer()); + mem2.deviceToHost(); + unsigned int tot = ((unsigned int *)mem2.getPointer())[0] + ((unsigned int *)mem2.getPointer())[1]; openfpm::vector_gpu<aggregate<int,int>> vd; @@ -119,12 +121,16 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb test_proc_idbc<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem.getDevicePointer()); + mem.deviceToHost(); + BOOST_REQUIRE(((unsigned int *)mem.getPointer())[0] < vcl.size()); BOOST_REQUIRE(((unsigned int *)mem.getPointer())[1] < vcl.size()); mem2.allocate(2*sizeof(unsigned int)); test_ghost_n<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer()); + mem2.deviceToHost(); + tot = ((unsigned int *)mem2.getPointer())[0] + ((unsigned int *)mem2.getPointer())[1]; vd.resize(tot); @@ -132,7 +138,7 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb if (((unsigned int *)mem.getPointer())[0] != ((unsigned int *)mem.getPointer())[1]) { - if (vcl.rank() == ((unsigned int *)mem.getPointer())[2]) + if (vcl.rank() == ((unsigned int *)mem.getPointer())[1]) { BOOST_REQUIRE(((unsigned int *)mem2.getPointer())[1] != 0); BOOST_REQUIRE(((unsigned int *)mem2.getPointer())[0] == 0); diff --git a/src/Decomposition/ie_ghost.hpp b/src/Decomposition/ie_ghost.hpp index a6df7964..f78b40c8 100755 --- a/src/Decomposition/ie_ghost.hpp +++ b/src/Decomposition/ie_ghost.hpp @@ -448,12 +448,17 @@ protected: reorder_geo_cell(); } - /*! \brief in this function we reorder the cells by processors + /*! \brief in this function we reorder the list in each cells by processor id * - * In practice every processor in the list is ordered. the geo_cell give + * suppose in one cell we have 7 boxes each box contain the processor id * - * 7 boxes the first 2 boxes are related to processor 0 and the next 2 to processor 4, the other 3 must me related - * to another processor different from 0 and 4. This simplify the procedure to get a unique list of processor ids + * 1,5,9,5,1,1,6 + * + * after reorder we have the following sequence + * + * 1,1,1,5,5,6,9 + * + * This simplify the procedure to get a unique list of processor ids * indicating on which processor a particle must be replicated as ghost * */ diff --git a/src/Vector/cuda/vector_dist_comm_util_funcs.cuh b/src/Vector/cuda/vector_dist_comm_util_funcs.cuh index a5988379..14b81d7b 100644 --- a/src/Vector/cuda/vector_dist_comm_util_funcs.cuh +++ b/src/Vector/cuda/vector_dist_comm_util_funcs.cuh @@ -108,29 +108,39 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,sca mem.fill(0); prc_offset.resize(v_cl.size()); + ite = g_opart_device.getGPUIterator(); + // Find the buffer bases find_buffer_offsets<0,decltype(g_opart_device.toKernel()),decltype(prc_offset.toKernel())><<<ite.wthr,ite.thr>>> (g_opart_device.toKernel(),(int *)mem.getDevicePointer(),prc_offset.toKernel()); // Trasfer the number of offsets on CPU mem.deviceToHost(); - prc_offset.template deviceToHost<0,1>(); - if (g_opart_device.size() != 0) - {g_opart_device.template deviceToHost<0>(g_opart_device.size()-1,g_opart_device.size()-1);} - int noff = *(int *)mem.getPointer(); - // In this case we do not have communications at all - if (g_opart_device.size() == 0) - {noff = -1;} + // create the terminal of prc_offset + prc_offset.resize(noff+1,DATA_ON_DEVICE); - prc_offset.resize(noff+1); + // Move the last processor index on device (id) + if (g_opart_device.size() != 0) + {g_opart_device.template deviceToHost<0>(g_opart_device.size()-1,g_opart_device.size()-1);} prc_offset.template get<0>(prc_offset.size()-1) = g_opart_device.size(); if (g_opart_device.size() != 0) {prc_offset.template get<1>(prc_offset.size()-1) = g_opart_device.template get<0>(g_opart_device.size()-1);} else {prc_offset.template get<1>(prc_offset.size()-1) = 0;} + prc_offset.template hostToDevice<0,1>(prc_offset.size()-1,prc_offset.size()-1); + + // Here we reorder the offsets in ascending order + mergesort((int *)prc_offset.template getDeviceBuffer<0>(),(int *)prc_offset.template getDeviceBuffer<1>(), prc_offset.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext()); + + prc_offset.template deviceToHost<0,1>(); + + // In this case we do not have communications at all + if (g_opart_device.size() == 0) + {noff = -1;} + prc.resize(noff+1); prc_sz.resize(noff+1); @@ -236,9 +246,9 @@ struct local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,true> auto ite = v_pos.getGPUIteratorTo(g_m); // label particle processor - num_shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())> + num_shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())> <<<ite.wthr,ite.thr>>> - (box_f_dev.toKernel(),v_pos.toKernel(),o_part_loc.toKernel()); + (box_f_dev.toKernel(),box_f_sv.toKernel(),v_pos.toKernel(),o_part_loc.toKernel(),g_m); starts.resize(o_part_loc.size()); mgpu::scan((unsigned int *)o_part_loc.template getDeviceBuffer<0>(), o_part_loc.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext()); diff --git a/src/Vector/cuda/vector_dist_cuda_func_test.cu b/src/Vector/cuda/vector_dist_cuda_func_test.cu index 243d6fd0..b866e63e 100644 --- a/src/Vector/cuda/vector_dist_cuda_func_test.cu +++ b/src/Vector/cuda/vector_dist_cuda_func_test.cu @@ -98,9 +98,9 @@ BOOST_AUTO_TEST_CASE( vector_ghost_process_local_particles ) v_prp.hostToDevice<0,1,2>(); // label particle processor - num_shift_ghost_each_part<3,float,decltype(box_f_dev.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())> + num_shift_ghost_each_part<3,float,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())> <<<ite.wthr,ite.thr>>> - (box_f_dev.toKernel(),v_pos.toKernel(),o_part_loc.toKernel()); + (box_f_dev.toKernel(),box_f_sv.toKernel(),v_pos.toKernel(),o_part_loc.toKernel(),v_pos.size()); o_part_loc.deviceToHost<0>(); @@ -734,11 +734,16 @@ BOOST_AUTO_TEST_CASE( decomposition_to_gpu_test_use ) proc_id_out.resize(vg.size()); openfpm::vector_gpu<aggregate<int,int,int>> dev_counter; + dev_counter.resize(10); + dev_counter.fill<0>(0); + dev_counter.fill<1>(0); + dev_counter.fill<2>(0); process_id_proc_each_part<3,float,decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(proc_id_out.toKernel()),decltype(dev_counter.toKernel())> <<<ite.wthr,ite.thr>>> (dec.toKernel(),vg.toKernel(),proc_id_out.toKernel(),dev_counter.toKernel(),v_cl.rank()); + proc_id_out.deviceToHost<0>(); bool match = true; @@ -852,33 +857,33 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort) for (int i = 0 ; i < 10000 ; i++) // <------ particle id { - v_pos.template get<0>(i)[0] = i; - v_pos.template get<0>(i)[1] = i+10000; - v_pos.template get<0>(i)[2] = i+20000; + v_pos_out.template get<0>(i)[0] = i; + v_pos_out.template get<0>(i)[1] = i+10000; + v_pos_out.template get<0>(i)[2] = i+20000; - v_prp.template get<0>(i)[0] = i+60123; - v_prp.template get<0>(i)[1] = i+73543; - v_prp.template get<0>(i)[2] = i+82432; + v_prp_out.template get<0>(i)[0] = i+60123; + v_prp_out.template get<0>(i)[1] = i+73543; + v_prp_out.template get<0>(i)[2] = i+82432; - v_prp.template get<1>(i)[0] = i+80123; - v_prp.template get<1>(i)[1] = i+93543; - v_prp.template get<1>(i)[2] = i+102432; + v_prp_out.template get<1>(i)[0] = i+80123; + v_prp_out.template get<1>(i)[1] = i+93543; + v_prp_out.template get<1>(i)[2] = i+102432; - v_prp.template get<2>(i)[0] = i+110123; - v_prp.template get<2>(i)[1] = i+123543; - v_prp.template get<2>(i)[2] = i+132432; + v_prp_out.template get<2>(i)[0] = i+110123; + v_prp_out.template get<2>(i)[1] = i+123543; + v_prp_out.template get<2>(i)[2] = i+132432; - v_prp_out.template get<0>(i)[0] = 0; - v_prp_out.template get<0>(i)[1] = 0; - v_prp_out.template get<0>(i)[2] = 0; + v_prp.template get<0>(i)[0] = 0; + v_prp.template get<0>(i)[1] = 0; + v_prp.template get<0>(i)[2] = 0; - v_prp_out.template get<1>(i)[0] = 0; - v_prp_out.template get<1>(i)[1] = 0; - v_prp_out.template get<1>(i)[2] = 0; + v_prp.template get<1>(i)[0] = 0; + v_prp.template get<1>(i)[1] = 0; + v_prp.template get<1>(i)[2] = 0; - v_prp_out.template get<2>(i)[0] = 0; - v_prp_out.template get<2>(i)[1] = 0; - v_prp_out.template get<2>(i)[2] = 0; + v_prp.template get<2>(i)[0] = 0; + v_prp.template get<2>(i)[1] = 0; + v_prp.template get<2>(i)[2] = 0; ns_to_s.template get<0>(i) = 10000-i-1; } @@ -895,7 +900,7 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort) v_pos_out.toKernel(),v_prp_out.toKernel(), ns_to_s.toKernel()); - v_prp_out.template deviceToHost<0,1,2>(); + v_prp.template deviceToHost<0,1,2>(); bool match = true; for (int i = 0 ; i < 10000 ; i++) // <------ particle id @@ -904,13 +909,13 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort) match &= v_prp_out.template get<0>(10000-i-1)[1] == v_prp.template get<0>(i)[1]; match &= v_prp_out.template get<0>(10000-i-1)[2] == v_prp.template get<0>(i)[2]; - match &= v_prp_out.template get<1>(10000-i-1)[0] == 0; - match &= v_prp_out.template get<1>(10000-i-1)[1] == 0; - match &= v_prp_out.template get<1>(10000-i-1)[2] == 0; + match &= v_prp.template get<1>(10000-i-1)[0] == 0; + match &= v_prp.template get<1>(10000-i-1)[1] == 0; + match &= v_prp.template get<1>(10000-i-1)[2] == 0; - match &= v_prp_out.template get<2>(10000-i-1)[0] == 0; - match &= v_prp_out.template get<2>(10000-i-1)[1] == 0; - match &= v_prp_out.template get<2>(10000-i-1)[2] == 0; + match &= v_prp.template get<2>(10000-i-1)[0] == 0; + match &= v_prp.template get<2>(10000-i-1)[1] == 0; + match &= v_prp.template get<2>(10000-i-1)[2] == 0; } BOOST_REQUIRE_EQUAL(match,true); @@ -919,8 +924,8 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort) v_pos_out.toKernel(),v_prp_out.toKernel(), ns_to_s.toKernel()); - v_prp_out.template deviceToHost<0,1,2>(); - v_pos_out.template deviceToHost<0>(); + v_prp.template deviceToHost<0,1,2>(); + v_pos.template deviceToHost<0>(); for (int i = 0 ; i < 10000 ; i++) // <------ particle id { @@ -937,9 +942,9 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort) match &= v_prp_out.template get<2>(10000-i-1)[2] == v_prp.template get<2>(i)[2]; - match &= v_pos_out.template get<0>(10000-i-1)[0] == 0; - match &= v_pos_out.template get<0>(10000-i-1)[1] == 0; - match &= v_pos_out.template get<0>(10000-i-1)[2] == 0; + match &= v_pos.template get<0>(10000-i-1)[0] == 0; + match &= v_pos.template get<0>(10000-i-1)[1] == 0; + match &= v_pos.template get<0>(10000-i-1)[2] == 0; } BOOST_REQUIRE_EQUAL(match,true); @@ -948,11 +953,13 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort) v_pos_out.toKernel(),v_prp_out.toKernel(), ns_to_s.toKernel()); - v_prp_out.template deviceToHost<0,1,2>(); - v_pos_out.template deviceToHost<0>(); + v_prp.template deviceToHost<0,1,2>(); + v_pos.template deviceToHost<0>(); for (int i = 0 ; i < 10000 ; i++) // <------ particle id { + + match &= v_prp_out.template get<0>(10000-i-1)[0] == v_prp.template get<0>(i)[0]; match &= v_prp_out.template get<0>(10000-i-1)[1] == v_prp.template get<0>(i)[1]; match &= v_prp_out.template get<0>(10000-i-1)[2] == v_prp.template get<0>(i)[2]; @@ -1010,8 +1017,8 @@ BOOST_AUTO_TEST_CASE(vector_dist_gpu_map_fill_send_buffer_test) v_prp.template get<2>(i)[2][2] = 340.0 + (float)rand()/RAND_MAX; int seg = i / 10000; - m_opart.template get<0>(i) = seg; - m_opart.template get<1>(i) = (9999 - i%10000) + seg * 10000; + m_opart.template get<1>(i) = seg; + m_opart.template get<0>(i) = (9999 - i%10000) + seg * 10000; } m_pos.resize(10); @@ -1045,24 +1052,24 @@ BOOST_AUTO_TEST_CASE(vector_dist_gpu_map_fill_send_buffer_test) for (size_t j = 0 ; j < m_pos.get(i).size() ; j++) { - match &= (m_pos.get(i).template get<0>(j)[0] == v_pos.template get<0>(m_opart.template get<1>(offset+j))[0]); - match &= (m_pos.get(i).template get<0>(j)[1] == v_pos.template get<0>(m_opart.template get<1>(offset+j))[1]); - match &= (m_pos.get(i).template get<0>(j)[2] == v_pos.template get<0>(m_opart.template get<1>(offset+j))[2]); - - match &= (m_prp.get(i).template get<0>(j) == v_prp.template get<0>(m_opart.template get<1>(offset+j))); - - match &= (m_prp.get(i).template get<1>(j)[0] == v_prp.template get<1>(m_opart.template get<1>(offset+j))[0]); - match &= (m_prp.get(i).template get<1>(j)[1] == v_prp.template get<1>(m_opart.template get<1>(offset+j))[1]); - - match &= (m_prp.get(i).template get<2>(j)[0][0] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[0][0]); - match &= (m_prp.get(i).template get<2>(j)[0][1] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[0][1]); - match &= (m_prp.get(i).template get<2>(j)[0][2] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[0][2]); - match &= (m_prp.get(i).template get<2>(j)[1][0] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[1][0]); - match &= (m_prp.get(i).template get<2>(j)[1][1] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[1][1]); - match &= (m_prp.get(i).template get<2>(j)[1][2] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[1][2]); - match &= (m_prp.get(i).template get<2>(j)[2][0] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[2][0]); - match &= (m_prp.get(i).template get<2>(j)[2][1] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[2][1]); - match &= (m_prp.get(i).template get<2>(j)[2][2] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[2][2]); + match &= (m_pos.get(i).template get<0>(j)[0] == v_pos.template get<0>(m_opart.template get<0>(offset+j))[0]); + match &= (m_pos.get(i).template get<0>(j)[1] == v_pos.template get<0>(m_opart.template get<0>(offset+j))[1]); + match &= (m_pos.get(i).template get<0>(j)[2] == v_pos.template get<0>(m_opart.template get<0>(offset+j))[2]); + + match &= (m_prp.get(i).template get<0>(j) == v_prp.template get<0>(m_opart.template get<0>(offset+j))); + + match &= (m_prp.get(i).template get<1>(j)[0] == v_prp.template get<1>(m_opart.template get<0>(offset+j))[0]); + match &= (m_prp.get(i).template get<1>(j)[1] == v_prp.template get<1>(m_opart.template get<0>(offset+j))[1]); + + match &= (m_prp.get(i).template get<2>(j)[0][0] == v_prp.template get<2>(m_opart.template get<0>(offset+j))[0][0]); + match &= (m_prp.get(i).template get<2>(j)[0][1] == v_prp.template get<2>(m_opart.template get<0>(offset+j))[0][1]); + match &= (m_prp.get(i).template get<2>(j)[0][2] == v_prp.template get<2>(m_opart.template get<0>(offset+j))[0][2]); + match &= (m_prp.get(i).template get<2>(j)[1][0] == v_prp.template get<2>(m_opart.template get<0>(offset+j))[1][0]); + match &= (m_prp.get(i).template get<2>(j)[1][1] == v_prp.template get<2>(m_opart.template get<0>(offset+j))[1][1]); + match &= (m_prp.get(i).template get<2>(j)[1][2] == v_prp.template get<2>(m_opart.template get<0>(offset+j))[1][2]); + match &= (m_prp.get(i).template get<2>(j)[2][0] == v_prp.template get<2>(m_opart.template get<0>(offset+j))[2][0]); + match &= (m_prp.get(i).template get<2>(j)[2][1] == v_prp.template get<2>(m_opart.template get<0>(offset+j))[2][1]); + match &= (m_prp.get(i).template get<2>(j)[2][2] == v_prp.template get<2>(m_opart.template get<0>(offset+j))[2][2]); } BOOST_REQUIRE_EQUAL(match,true); diff --git a/src/Vector/cuda/vector_dist_cuda_funcs.cuh b/src/Vector/cuda/vector_dist_cuda_funcs.cuh index 1e5a0f45..658f6002 100644 --- a/src/Vector/cuda/vector_dist_cuda_funcs.cuh +++ b/src/Vector/cuda/vector_dist_cuda_funcs.cuh @@ -63,7 +63,7 @@ __global__ void merge_sort_part(vector_pos_type vd_pos, vector_prp_type vd_prp, vd_pos.template set<0>(p,v_pos_ord,nss.template get<0>(p)); } - vd_prp.template set<prp...>(p,vd_prp_ord,nss.template get<0>(p)); + vd_prp.template set<prp ...>(p,vd_prp_ord,nss.template get<0>(p)); } template<unsigned int dim, typename St, typename cartdec_gpu, typename particles_type, typename vector_out, typename prc_sz_type> @@ -99,7 +99,7 @@ __global__ void find_buffer_offsets(vector_type vd, int * cnt, vector_type_offs { int i = atomicAdd(cnt, 1); offs.template get<0>(i) = p+1; - offs.template get<1>(i) = vd.template get<1>(p); + offs.template get<1>(i) = vd.template get<prp_off>(p); } } @@ -184,12 +184,13 @@ __global__ void process_ghost_particles_local(vector_g_opart_type g_opart, vecto v_prp.set(base+i,v_prp.get(pid)); } -template<unsigned int dim, typename St, typename vector_of_box, typename vector_type, typename output_type> -__global__ void num_shift_ghost_each_part(vector_of_box box_f, vector_type vd, output_type out) +template<unsigned int dim, typename St, typename vector_of_box, typename vector_of_shifts, typename vector_type, typename output_type> +__global__ void num_shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_f_sv, vector_type vd, output_type out, unsigned int g_m) { + unsigned int old_shift = (unsigned int)-1; int p = threadIdx.x + blockIdx.x * blockDim.x; - if (p >= vd.size()) return; + if (p >= g_m) return; Point<dim,St> xp = vd.template get<0>(p); @@ -197,8 +198,14 @@ __global__ void num_shift_ghost_each_part(vector_of_box box_f, vector_type vd, for (unsigned int i = 0 ; i < box_f.size() ; i++) { - if (Box<dim,St>(box_f.get(i)).isInsideNP(xp) == true) - {n++;} + unsigned int shift_actual = box_f_sv.template get<0>(i); + bool sw = (old_shift == shift_actual)?true:false; + + if (Box<dim,St>(box_f.get(i)).isInsideNP(xp) == true && sw == false) + { + old_shift = shift_actual; + n++; + } } out.template get<0>(p) = n; @@ -217,6 +224,7 @@ __global__ void shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_ start_type start, shifts_type shifts, output_type output, unsigned int offset) { + unsigned int old_shift = (unsigned int)-1; int p = threadIdx.x + blockIdx.x * blockDim.x; if (p >= v_pos.size()) return; @@ -231,20 +239,23 @@ __global__ void shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_ for (unsigned int i = 0 ; i < box_f.size() ; i++) { - if (Box<dim,St>(box_f.get(i)).isInsideNP(xp) == true) + unsigned int shift_actual = box_f_sv.template get<0>(i); + bool sw = (old_shift == shift_actual)?true:false; + + if (Box<dim,St>(box_f.get(i)).isInsideNP(xp) == true && sw == false) { - unsigned int shift_id = box_f_sv.template get<0>(i); #pragma unroll for (unsigned int j = 0 ; j < dim ; j++) { - v_pos.template get<0>(base+n)[j] = xp.get(j) - shifts.template get<0>(shift_id)[j]; + v_pos.template get<0>(base+n)[j] = xp.get(j) - shifts.template get<0>(shift_actual)[j]; output.template get<0>(base_o+n) = p; - output.template get<1>(base_o+n) = shift_id; + output.template get<1>(base_o+n) = shift_actual; } v_prp.set(base+n,v_prp.get(p)); + old_shift = shift_actual; n++; } } diff --git a/src/Vector/cuda/vector_dist_gpu_unit_tests.cu b/src/Vector/cuda/vector_dist_gpu_unit_tests.cu index 03150957..988c735d 100644 --- a/src/Vector/cuda/vector_dist_gpu_unit_tests.cu +++ b/src/Vector/cuda/vector_dist_gpu_unit_tests.cu @@ -50,8 +50,6 @@ __global__ void calculate_force(vector_dist_ker<3, float, aggregate<float, floa auto it = cl.getNNIterator(cl.getCell(xp)); - auto cell = cl.getCell(xp); - Point<3,float> force1({0.0,0.0,0.0}); Point<3,float> force2({0.0,0.0,0.0}); @@ -95,14 +93,10 @@ __global__ void calculate_force_full_sort(vector_dist_ker<3, float, aggregate<f unsigned int p; GET_PARTICLE_SORT(p,cl); - unsigned int ns_id = cl.getSortToNonSort().template get<0>(p); - Point<3,float> xp = vd.getPos(p); auto it = cl.getNNIterator(cl.getCell(xp)); - auto cell = cl.getCell(xp); - Point<3,float> force1({0.0,0.0,0.0}); while (it.isNext()) @@ -159,6 +153,12 @@ bool check_force(CellList_type & NN_cpu, vector_type & vd) // Normalize + if (r2.norm() == 0) + { + int debug = 0; + debug++; + } + r2 /= r2.norm(); force += vd.template getProp<0>(q)*r2; @@ -314,15 +314,23 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_test) vector_dist_gpu<3,float,aggregate<float,float[3],float[3]>> vd(10000,domain,bc,g); + srand(55067*create_vcluster().rank()); + auto it = vd.getDomainIterator(); while (it.isNext()) { auto p = it.get(); - vd.getPos(p)[0] = (float)rand() / RAND_MAX; - vd.getPos(p)[1] = (float)rand() / RAND_MAX; - vd.getPos(p)[2] = (float)rand() / RAND_MAX; + int x = rand(); + int y = rand(); + int z = rand(); + + vd.getPos(p)[0] = (float)x / RAND_MAX; + vd.getPos(p)[1] = (float)y / RAND_MAX; + vd.getPos(p)[2] = (float)z / RAND_MAX; + + Point<3,float> xp = vd.getPos(p); ++it; } @@ -358,8 +366,6 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_test) BOOST_REQUIRE_EQUAL(noOut,true); BOOST_REQUIRE_EQUAL(cnt,vd.size_local()); - vd.write("test_out_gpu"); - // now we offload all the properties auto it3 = vd.getDomainIteratorGPU(); @@ -426,6 +432,7 @@ void vdist_calc_gpu_test() vector_dist_gpu<3,St,aggregate<St,St[3],St[3]>> vd(1000,domain,bc,g); + srand(v_cl.rank()*10000); auto it = vd.getDomainIterator(); while (it.isNext()) @@ -459,8 +466,6 @@ void vdist_calc_gpu_test() vd.deviceToHostPos(); vd.template deviceToHostProp<0,1,2>(); - vd.write("write_start"); - // Reset the host part auto it3 = vd.getDomainIterator(); @@ -542,9 +547,6 @@ void vdist_calc_gpu_test() vd.deviceToHostPos(); vd.template deviceToHostProp<0,1,2>(); - vd.write_frame("write_ggg",i); - - // To test we copy on a cpu distributed vector and we do a map vector_dist<3,St,aggregate<St,St[3],St[3]>> vd_cpu(vd.getDecomposition().template duplicate_convert<HeapMemory,memory_traits_lin>(),0); @@ -620,6 +622,8 @@ void vdist_calc_gpu_test() cpu_sort.resize(vd_cpu.size_local_with_ghost() - vd_cpu.size_local()); gpu_sort.resize(vd.size_local_with_ghost() - vd.size_local()); + BOOST_REQUIRE_EQUAL(cpu_sort.size(),gpu_sort.size()); + size_t cnt = 0; auto itc2 = vd.getGhostIterator(); diff --git a/src/Vector/vector_dist.hpp b/src/Vector/vector_dist.hpp index a3287a83..7efbcfa5 100644 --- a/src/Vector/vector_dist.hpp +++ b/src/Vector/vector_dist.hpp @@ -1192,6 +1192,7 @@ public: cell_list.template construct<decltype(v_pos),decltype(v_prp)>(v_pos,v_pos_out,v_prp,v_prp_out,v_cl.getmgpuContext(),g_m); cell_list.set_ndec(getDecomposition().get_ndec()); + cell_list.set_gm(g_m); return cell_list; } diff --git a/src/Vector/vector_dist_comm.hpp b/src/Vector/vector_dist_comm.hpp index 3acb334f..5f9dd10c 100644 --- a/src/Vector/vector_dist_comm.hpp +++ b/src/Vector/vector_dist_comm.hpp @@ -8,7 +8,7 @@ #ifndef SRC_VECTOR_VECTOR_DIST_COMM_HPP_ #define SRC_VECTOR_VECTOR_DIST_COMM_HPP_ -//#define TEST1 +#define TEST1 #if defined(CUDA_GPU) && defined(__NVCC__) #include "util/cuda/moderngpu/kernel_mergesort.hxx" @@ -285,6 +285,21 @@ class vector_dist_comm if (shift_box_ndec == (long int)dec.get_ndec()) return; + struct sh_box + { + size_t shift_id; + + unsigned int box_f_sv; + Box<dim,St> box_f_dev; + + bool operator<(const sh_box & tmp) + { + return shift_id < tmp.shift_id; + } + + }; + openfpm::vector<struct sh_box> reord_shift; + // Add local particles coming from periodic boundary, the only boxes that count are the one // touching the border for (size_t i = 0; i < dec.getNLocalSub(); i++) @@ -307,23 +322,32 @@ class vector_dist_comm box_f.last().add(dec.getLocalIGhostBox(i, j)); box_cmb.add(dec.getLocalIGhostPos(i, j)); map_cmb[dec.getLocalIGhostPos(i, j).lin()] = box_f.size() - 1; - - box_f_dev.add(dec.getLocalIGhostBox(i, j)); - box_f_sv.add(); - box_f_sv.template get<0>(box_f_sv.size()-1) = dec.convertShift(dec.getLocalIGhostPos(i, j)); } else { // we have it box_f.get(it->second).add(dec.getLocalIGhostBox(i, j)); - - box_f_dev.add(dec.getLocalIGhostBox(i, j)); - box_f_sv.template get<0>(box_f_sv.size()-1) = dec.convertShift(dec.getLocalIGhostPos(i, j)); } + reord_shift.add(); + reord_shift.last().shift_id = dec.getLocalIGhostPos(i, j).lin(); + reord_shift.last().box_f_dev = dec.getLocalIGhostBox(i, j); + reord_shift.last().box_f_sv = dec.convertShift(dec.getLocalIGhostPos(i, j)); } } + // now we sort box_f by shift_id, the reason is that we have to avoid duplicated particles + reord_shift.sort(); + + box_f_dev.resize(reord_shift.size()); + box_f_sv.resize(reord_shift.size()); + + for (size_t i = 0 ; i < reord_shift.size() ; i++) + { + box_f_dev.get(i) = reord_shift.get(i).box_f_dev; + box_f_sv.template get<0>(i) = reord_shift.get(i).box_f_sv; + } + #ifdef CUDA_GPU // move box_f_dev and box_f_sv to device @@ -547,7 +571,7 @@ class vector_dist_comm const openfpm::vector<Point<dim,St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & shifts = dec.getShiftVectors(); // create a number of send buffers equal to the near processors - g_pos_send.resize(g_opart.size()); + g_pos_send.resize(prc_sz.size()); resize_retained_buffer(hsmem,g_pos_send.size()); @@ -894,7 +918,7 @@ class vector_dist_comm size_t offset = prc_sz.template get<0>(0); - // Fill the sending fuffers + // Fill the sending buffers for (size_t i = 0 ; i < m_pos.size() ; i++) { auto ite = m_pos.get(i).getGPUIterator(); @@ -1074,6 +1098,7 @@ class vector_dist_comm <<<ite.wthr,ite.thr>>> (dec.toKernel(),v_pos.toKernel(),lbl_p.toKernel(),prc_sz.toKernel(),v_cl.rank()); + #ifndef TEST1 // sort particles @@ -1123,9 +1148,10 @@ class vector_dist_comm { // reset lbl_p lbl_p.clear(); + prc_sz_gg.clear(); o_part_loc.clear(); g_opart.clear(); - g_opart.resize(dec.getNNProcessors()); + prc_g_opart.clear(); // resize the label buffer prc_sz.template fill<0>(0); diff --git a/src/Vector/vector_dist_kernel.hpp b/src/Vector/vector_dist_kernel.hpp index e3c332cf..d4f0a883 100644 --- a/src/Vector/vector_dist_kernel.hpp +++ b/src/Vector/vector_dist_kernel.hpp @@ -12,7 +12,7 @@ #define POS_PROP -1 -#define GET_PARTICLE(vd) blockDim.x*blockIdx.x + threadIdx.x; if (blockDim.x*blockIdx.x + threadIdx.x > vd.size()) {return;}; +#define GET_PARTICLE(vd) blockDim.x*blockIdx.x + threadIdx.x; if (blockDim.x*blockIdx.x + threadIdx.x >= vd.size()) {return;}; #define GET_PARTICLE_SORT(p,NN) if (blockDim.x*blockIdx.x + threadIdx.x >= NN.get_g_m()) {return;}\ else{p = NN.getDomainSortIds().template get<0>(blockDim.x*blockIdx.x + threadIdx.x);} -- GitLab