From 11a0f824c111e686ad2388cec485ffcaafab5560 Mon Sep 17 00:00:00 2001
From: Pietro Incardona
Date: Fri, 24 Aug 2018 03:11:40 +0200
Subject: [PATCH] Changes to make it work with GPU Direct

---
 src/VCluster/VCluster.cpp                     |   3 +-
 src/VCluster/VCluster.hpp                     |  75 +++++++----
 src/VCluster/VCluster_base.hpp                |   6 +
 src/VCluster/VCluster_meta_function.hpp       |  45 +++----
 src/VCluster/VCluster_semantic_unit_tests.hpp |   6 -
 .../cuda/VCluster_semantic_unit_cuda_tests.cu | 117 +-----------------
 .../VCluster_semantic_unit_tests_funcs.hpp    | 116 ++++++++++++++++-
 7 files changed, 203 insertions(+), 165 deletions(-)

diff --git a/src/VCluster/VCluster.cpp b/src/VCluster/VCluster.cpp
index 3521fbc..bba955a 100644
--- a/src/VCluster/VCluster.cpp
+++ b/src/VCluster/VCluster.cpp
@@ -8,7 +8,8 @@
 #include "util/print_stack.hpp"
 #include "util/math_util_complex.hpp"
 
-Vcluster<> * global_v_cluster_private = NULL;
+Vcluster<> * global_v_cluster_private_heap = NULL;
+Vcluster<CudaMemory> * global_v_cluster_private_cuda = NULL;
 
 // std::vector sieve_spf;
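The patch keeps two global cluster singletons, one per internal memory backend. A minimal usage sketch (assuming the HeapMemory default template argument introduced later in this patch, and openfpm's usual openfpm_init/openfpm_finalize entry points):

	// Both singletons are created by the init path; create_vcluster()
	// (see VCluster.hpp below) returns one or the other by memory type.
	openfpm_init(&argc,&argv);

	Vcluster<>           & v_cl_cpu = create_vcluster<>();           // heap-backed instance
	Vcluster<CudaMemory> & v_cl_gpu = create_vcluster<CudaMemory>(); // CUDA-backed instance

	openfpm_finalize();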
diff --git a/src/VCluster/VCluster.hpp b/src/VCluster/VCluster.hpp
index 7915d93..b5a74b4 100644
--- a/src/VCluster/VCluster.hpp
+++ b/src/VCluster/VCluster.hpp
@@ -55,6 +55,14 @@ class Vcluster: public Vcluster_base
 	inline static void process_recv(Vcluster & vcl, S & recv, openfpm::vector<size_t> * sz_recv, openfpm::vector<size_t> * sz_recv_byte, op & op_param,size_t opt)
 	{
+		if (opt == MPI_GPU_DIRECT && !std::is_same<InternalMemory,CudaMemory>::value)
+		{
+			// In order to have this option activated InternalMemory must be CudaMemory
+
+			std::cout << __FILE__ << ":" << __LINE__ << " error: in order to use MPI_GPU_DIRECT, VCluster must use CudaMemory internally; the most probable" <<
+			             " cause of this problem is that you are using the MPI_GPU_DIRECT option with a non-GPU data-structure" << std::endl;
+		}
+
 		vcl.process_receive_buffer_with_prp(recv,sz_recv,sz_recv_byte,op_param,opt);
 	}
 };
@@ -131,7 +139,7 @@ class Vcluster: public Vcluster_base
 		self_base::tags.clear();
 
 		// receive information
-		base_info bi(&this->recv_buf,prc_recv,sz_recv_byte,this->tags,opt);
+		base_info<InternalMemory> bi(&this->recv_buf,prc_recv,sz_recv_byte,this->tags,opt);
 
 		// Send and recv multiple messages
 		if (opt & RECEIVE_KNOWN)
@@ -191,10 +199,11 @@ class Vcluster: public Vcluster_base
 	 * \param size of the received data
 	 *
 	 */
+	template<typename Memory>
 	struct base_info
 	{
 		//! Receive buffer
-		openfpm::vector<BHeapMemory> * recv_buf;
+		openfpm::vector<BMemory<Memory>> * recv_buf;
 		//! receiving processor list
 		openfpm::vector<size_t> & prc;
 		//! size of each message
 		openfpm::vector<size_t> & sz;
 		//! tags
 		openfpm::vector<size_t> & tags;
 		//! options
 		size_t opt;
 
 		//! constructor
-		base_info(openfpm::vector<BHeapMemory> * recv_buf, openfpm::vector<size_t> & prc, openfpm::vector<size_t> & sz, openfpm::vector<size_t> & tags,size_t opt)
+		base_info(openfpm::vector<BMemory<Memory>> * recv_buf, openfpm::vector<size_t> & prc, openfpm::vector<size_t> & sz, openfpm::vector<size_t> & tags,size_t opt)
 		:recv_buf(recv_buf),prc(prc),sz(sz),tags(tags),opt(opt)
 		{}
 	};
@@ -226,7 +235,7 @@ class Vcluster: public Vcluster_base
 	 */
 	static void * msg_alloc(size_t msg_i ,size_t total_msg, size_t total_p, size_t i, size_t ri, size_t tag, void * ptr)
 	{
-		base_info & rinfo = *(base_info *)ptr;
+		base_info<InternalMemory> & rinfo = *(base_info<InternalMemory> *)ptr;
 
 		if (rinfo.recv_buf == NULL)
 		{
@@ -249,7 +258,7 @@ class Vcluster: public Vcluster_base
 		if (rinfo.opt & MPI_GPU_DIRECT)
 		{
 #if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
-			return rinfo.recv_buf->last().getDevicePointer();
+			return rinfo.recv_buf->last().getDevicePointerNoCopy();
 #else
 			return rinfo.recv_buf->last().getPointer();
 #endif
@@ -274,7 +283,7 @@ class Vcluster: public Vcluster_base
 	 */
 	static void * msg_alloc_known(size_t msg_i ,size_t total_msg, size_t total_p, size_t i, size_t ri, size_t tag, void * ptr)
 	{
-		base_info & rinfo = *(base_info *)ptr;
+		base_info<InternalMemory> & rinfo = *(base_info<InternalMemory> *)ptr;
 
 		if (rinfo.recv_buf == NULL)
 		{
@@ -425,7 +434,7 @@ class Vcluster: public Vcluster_base
 		self_base::tags.clear();
 
 		// receive information
-		base_info bi(&this->recv_buf,prc,sz,this->tags,0);
+		base_info<InternalMemory> bi(&this->recv_buf,prc,sz,this->tags,0);
 
 		// Send and recv multiple messages
 		self_base::sendrecvMultipleMessagesNBX(send_req.size(),NULL,NULL,NULL,msg_alloc,&bi);
@@ -479,7 +488,7 @@ class Vcluster: public Vcluster_base
 		self_base::tags.clear();
 
 		// receive information
-		base_info bi(NULL,prc,sz,self_base::tags,0);
+		base_info<InternalMemory> bi(NULL,prc,sz,self_base::tags,0);
 
 		// Send and recv multiple messages
 		self_base::sendrecvMultipleMessagesNBX(send_prc_.size(),(size_t *)sz.getPointer(),(size_t *)send_prc_.getPointer(),(void **)send_buf.getPointer(),msg_alloc,(void *)&bi,NONE);
@@ -544,7 +553,7 @@ class Vcluster: public Vcluster_base
 		self_base::tags.clear();
 
 		// receive information
-		base_info bi(&this->recv_buf,prc,sz,this->tags,0);
+		base_info<InternalMemory> bi(&this->recv_buf,prc,sz,this->tags,0);
 
 		// Send and recv multiple messages
 		self_base::sendrecvMultipleMessagesNBX(prc.size(),(size_t *)sz_byte.getPointer(),(size_t *)prc.getPointer(),(void **)send_buf.getPointer(),msg_alloc,(void *)&bi);
@@ -565,7 +574,7 @@ class Vcluster: public Vcluster_base
 		self_base::tags.clear();
 
 		// receive information
-		base_info bi(&this->recv_buf,prc,sz,this->tags,0);
+		base_info<InternalMemory> bi(&this->recv_buf,prc,sz,this->tags,0);
 
 		// Send and recv multiple messages
 		self_base::sendrecvMultipleMessagesNBX(send_req.size(),NULL,NULL,NULL,msg_alloc,&bi);
@@ -630,7 +639,7 @@ class Vcluster: public Vcluster_base
 		// we sort based on processor
 		rcv.sort();
 
-		openfpm::vector<BHeapMemory> recv_ord;
+		openfpm::vector<BMemory<InternalMemory>> recv_ord;
 		recv_ord.resize(rcv.size());
 
 		openfpm::vector<size_t> prc_ord;
@@ -747,7 +756,7 @@ class Vcluster: public Vcluster_base
 		op_ssend_recv_add<void> opa;
 
 		// process the received information
-		process_receive_buffer_with_prp<op_ssend_recv_add<void>,T,S,layout_base,prp...>(recv,&sz_recv,&sz_recv_byte,opa);
+		process_receive_buffer_with_prp<op_ssend_recv_add<void>,T,S,layout_base,prp...>(recv,&sz_recv,&sz_recv_byte,opa,opt);
 
 		return true;
 	}
@@ -851,7 +860,7 @@ class Vcluster: public Vcluster_base
 		prepare_send_buffer<op,T,S,layout_base>(send,recv,prc_send,prc_recv,recv_sz,opt);
 
 		// process the received information
-		process_receive_buffer_with_prp<op,T,S,layout_base,prp...>(recv,NULL,NULL,op_param);
+		process_receive_buffer_with_prp<op,T,S,layout_base,prp...>(recv,NULL,NULL,op_param,opt);
 
 		return true;
 	}
@@ -862,7 +871,8 @@ class Vcluster: public Vcluster_base
 // Function to initialize the global VCluster //
 
-extern Vcluster<> * global_v_cluster_private;
+extern Vcluster<> * global_v_cluster_private_heap;
+extern Vcluster<CudaMemory> * global_v_cluster_private_cuda;
 
 /*! \brief Initialize a global instance of Runtime Virtual Cluster Machine
  *
@@ -872,25 +882,44 @@ extern Vcluster<> * global_v_cluster_private;
 
 static inline void init_global_v_cluster_private(int *argc, char ***argv)
 {
-	if (global_v_cluster_private == NULL)
-	{global_v_cluster_private = new Vcluster<>(argc,argv);}
+	if (global_v_cluster_private_heap == NULL)
+	{global_v_cluster_private_heap = new Vcluster<>(argc,argv);}
+
+	if (global_v_cluster_private_cuda == NULL)
+	{global_v_cluster_private_cuda = new Vcluster<CudaMemory>(argc,argv);}
 }
 
 static inline void delete_global_v_cluster_private()
 {
-	delete global_v_cluster_private;
+	delete global_v_cluster_private_heap;
+	delete global_v_cluster_private_cuda;
 }
 
-static inline Vcluster<> & create_vcluster()
+template<typename Memory>
+struct get_vcl
 {
-#ifdef SE_CLASS1
+	static Vcluster<Memory> & get()
+	{
+		return *global_v_cluster_private_heap;
+	}
+};
 
-	if (global_v_cluster_private == NULL)
-		std::cerr << __FILE__ << ":" << __LINE__ << " Error you must call openfpm_init before using any distributed data structures";
+template<>
+struct get_vcl<CudaMemory>
+{
+	static Vcluster<CudaMemory> & get()
+	{
+		return *global_v_cluster_private_cuda;
+	}
+};
 
-#endif
+template<typename Memory = HeapMemory>
+static inline Vcluster<Memory> & create_vcluster()
+{
+	if (global_v_cluster_private_heap == NULL)
+	{std::cerr << __FILE__ << ":" << __LINE__ << " Error: you must call openfpm_init before using any distributed data structures";}
 
-	return *global_v_cluster_private;
+	return get_vcl<Memory>::get();
 }
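The msg_alloc path above hands MPI a device pointer only when the build detects a CUDA-aware MPI; OpenMPI exposes the MPIX_CUDA_AWARE_SUPPORT macro through mpi-ext.h. A hedged sketch of how an application can double-check this, both at compile time and at run time (MPIX_Query_cuda_support() is an OpenMPI extension; other MPI implementations may not provide it):

	#include <mpi.h>
	#if defined(OPEN_MPI) && OPEN_MPI
	#include <mpi-ext.h>  // defines MPIX_CUDA_AWARE_SUPPORT on OpenMPI
	#endif

	static bool mpi_is_cuda_aware()
	{
	#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
		return MPIX_Query_cuda_support() == 1;  // runtime confirmation
	#else
		return false;                           // no compile-time support detected
	#endif
	}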
diff --git a/src/VCluster/VCluster_base.hpp b/src/VCluster/VCluster_base.hpp
index 96fd48d..b9f54cc 100644
--- a/src/VCluster/VCluster_base.hpp
+++ b/src/VCluster/VCluster_base.hpp
@@ -353,6 +353,12 @@ public:
 	 */
 	mgpu::standard_context_t & getmgpuContext()
 	{
+		if (context == NULL)
+		{
+			std::cout << __FILE__ << ":" << __LINE__ << " error: it seems that the modern gpu context is not initialized."
+					" Either a compatible working cuda device has not been found, or openfpm_init has been called in a file that was not compiled with NVCC" << std::endl;
+		}
+
 		return *context;
 	}
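Note that the new check only prints a diagnostic; *context is still dereferenced on the next line, so a missing context still crashes, just with a readable message first. Callers are expected to request the context only on CUDA-enabled paths, e.g. (sketch, assuming this patch):

	auto & v_cl = create_vcluster<CudaMemory>();
	// diagnostic fires if no CUDA device was found, or if openfpm_init
	// ran in a translation unit that was not compiled with NVCC
	mgpu::standard_context_t & ctx = v_cl.getmgpuContext();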
diff --git a/src/VCluster/VCluster_meta_function.hpp b/src/VCluster/VCluster_meta_function.hpp
index 04f0b42..80baa15 100644
--- a/src/VCluster/VCluster_meta_function.hpp
+++ b/src/VCluster/VCluster_meta_function.hpp
@@ -11,13 +11,13 @@
 #include "memory/BHeapMemory.hpp"
 #include "Packer_Unpacker/has_max_prop.hpp"
 
-template<bool result, typename T, typename S, template<typename> class layout_base>
+template<bool result, typename T, typename S, template<typename> class layout_base, typename Memory>
 struct unpack_selector_with_prp
 {
 	template<typename op, int ... prp>
 	static void call_unpack(S & recv,
-			                openfpm::vector<BHeapMemory> & recv_buf,
+			                openfpm::vector<BMemory<Memory>> & recv_buf,
 			                openfpm::vector<size_t> * sz,
 			                openfpm::vector<size_t> * sz_byte,
 			                op & op_param,
@@ -134,14 +134,14 @@ struct unpack_each_prop_buffer
 *
 */
-template<typename sT, template<typename> class layout_base>
+template<typename sT, template<typename> class layout_base,typename Memory>
 struct process_receive_mem_traits_inte
 {
 	//! set of pointers
 	size_t i;
 
 	//! Receive buffer
-	openfpm::vector<BHeapMemory> & recv_buf;
+	openfpm::vector<BMemory<Memory>> & recv_buf;
 
 	//! Fake vector that maps over received memory
 	openfpm::vector<typename sT::value_type,PtrMemory,typename layout_base<typename sT::value_type>::type,layout_base,openfpm::grow_policy_identity> & v2;
 
@@ -157,7 +157,7 @@ struct process_receive_mem_traits_inte
 	 *
 	 */
 	inline process_receive_mem_traits_inte(openfpm::vector<typename sT::value_type,PtrMemory,typename layout_base<typename sT::value_type>::type,layout_base,openfpm::grow_policy_identity> & v2,
-			                               openfpm::vector<BHeapMemory> & recv_buf,
+			                               openfpm::vector<BMemory<Memory>> & recv_buf,
 			                               size_t i,
 			                               size_t opt)
 	:i(i),recv_buf(recv_buf),v2(v2),opt(opt)
@@ -178,7 +178,7 @@ struct process_receive_mem_traits_inte
 		{
 #if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
 			// add the received particles to the vector
-			ptr1 = new PtrMemory(recv_buf.get(i).getDevicePointer(),recv_buf.get(i).size());
+			ptr1 = new PtrMemory(recv_buf.get(i).getDevicePointerNoCopy(),recv_buf.get(i).size());
 #else
 			// add the received particles to the vector
 			ptr1 = new PtrMemory(recv_buf.get(i).getPointer(),recv_buf.get(i).size());
 #endif
@@ -196,11 +196,11 @@ struct process_receive_mem_traits_inte
 	}
 };
 
-template<bool inte_or_lin, typename T, typename S, template<typename> class layout_base>
+template<bool inte_or_lin, typename T, typename S, template<typename> class layout_base,typename Memory>
 struct unpack_selector_with_prp_lin
 {
 	template<typename op, unsigned int ... prp> static int call_unpack_impl(S & recv,
-			                                                               openfpm::vector<BHeapMemory> & recv_buf,
+			                                                               openfpm::vector<BMemory<Memory>> & recv_buf,
 			                                                               openfpm::vector<size_t> * sz,
 			                                                               openfpm::vector<size_t> * sz_byte,
 			                                                               op & op_param,
@@ -210,7 +210,7 @@ struct unpack_selector_with_prp_lin
 		// create vector representation to a piece of memory already allocated
 		openfpm::vector<typename T::value_type,PtrMemory,typename layout_base<typename T::value_type>::type,layout_base,openfpm::grow_policy_identity> v2;
 
-		process_receive_mem_traits_inte<T,layout_base> prmti(v2,recv_buf,i,opt);
+		process_receive_mem_traits_inte<T,layout_base,Memory> prmti(v2,recv_buf,i,opt);
 
 		boost::mpl::for_each_ref<boost::mpl::range_c<int,0,T::value_type::max_prop>>(prmti);
@@ -233,11 +233,11 @@ struct unpack_selector_with_prp_lin
 	}
 };
 
-template<typename T, typename S, template<typename> class layout_base>
-struct unpack_selector_with_prp_lin<false,T,S,layout_base>
+template<typename T, typename S, template<typename> class layout_base, typename Memory>
+struct unpack_selector_with_prp_lin<false,T,S,layout_base,Memory>
 {
 	template<typename op, unsigned int ... prp> static int call_unpack_impl(S & recv,
-			                                                               openfpm::vector<BHeapMemory> & recv_buf,
+			                                                               openfpm::vector<BMemory<Memory>> & recv_buf,
 			                                                               openfpm::vector<size_t> * sz,
 			                                                               openfpm::vector<size_t> * sz_byte,
 			                                                               op & op_param,
@@ -278,11 +278,11 @@ struct unpack_selector_with_prp_lin
 
 typedef aggregate<int> dummy_type;
 
-template<typename T, typename S, template<typename> class layout_base>
-struct unpack_selector_with_prp<true,T,S,layout_base>
+template<typename T, typename S, template<typename> class layout_base, typename Memory>
+struct unpack_selector_with_prp<true,T,S,layout_base,Memory>
 {
 	template<typename op, unsigned int ... prp>
 	static void call_unpack(S & recv,
-			                openfpm::vector<BHeapMemory> & recv_buf,
+			                openfpm::vector<BMemory<Memory>> & recv_buf,
 			                openfpm::vector<size_t> * sz,
 			                openfpm::vector<size_t> * sz_byte,
 			                op & op_param,
@@ -293,7 +293,7 @@ struct unpack_selector_with_prp
 
 		for (size_t i = 0 ; i < recv_buf.size() ; )
 		{
-			i += unpack_selector_with_prp_lin<is_layout_mlin<layout_base<dummy_type>>::value,T,S,layout_base>::template call_unpack_impl<op,prp...>(recv,recv_buf,sz,sz_byte,op_param,i,opt);
+			i += unpack_selector_with_prp_lin<is_layout_mlin<layout_base<dummy_type>>::value,T,S,layout_base,Memory>::template call_unpack_impl<op,prp...>(recv,recv_buf,sz,sz_byte,op_param,i,opt);
 		}
 	}
 };
@@ -315,9 +315,9 @@ struct call_serialize_variadic<index_tuple<prp...>>
 		Packer<T,HeapMemory>::template pack<prp...>(mem,send,sts);
 	}
 
-	template<typename op, typename T, typename S, template<typename> class layout_base>
+	template<typename op, typename T, typename S, template<typename> class layout_base, typename Memory>
 	inline static void call_unpack(S & recv,
-			                       openfpm::vector<BHeapMemory> & recv_buf,
+			                       openfpm::vector<BMemory<Memory>> & recv_buf,
 			                       openfpm::vector<size_t> * sz,
 			                       openfpm::vector<size_t> * sz_byte,
 			                       op & op_param,
@@ -325,7 +325,7 @@ struct call_serialize_variadic<index_tuple<prp...>>
 	{
 		const bool result = has_pack_gen<typename T::value_type>::value == false && is_vector<T>::value == true;
 
-		unpack_selector_with_prp<result,T,S,layout_base>::template call_unpack<op,prp...>(recv, recv_buf, sz, sz_byte, op_param,opt);
+		unpack_selector_with_prp<result,T,S,layout_base,Memory>::template call_unpack<op,prp...>(recv, recv_buf, sz, sz_byte, op_param,opt);
 	}
 };
@@ -505,8 +505,9 @@ struct pack_unpack_cond_with_prp
 		}
 	}
 
+	template<typename Memory>
 	static void unpacking(S & recv,
-			              openfpm::vector<BHeapMemory> & recv_buf,
+			              openfpm::vector<BMemory<Memory>> & recv_buf,
 			              openfpm::vector<size_t> * sz,
 			              openfpm::vector<size_t> * sz_byte,
 			              op & op_param,
@@ -673,7 +674,7 @@ struct op_ssend_recv_merge
 			 typename S,
 			 template <typename> class layout_base,
 			 int ... prp>
-	void execute(D & recv,S & v2,size_t i)
+	void execute(D & recv,S & v2,size_t i,size_t opt)
 	{
 		op_ssend_recv_merge_impl<sr,op>::template execute<T,D,S,layout_base,prp...>(recv,v2,i,opart);
 	}
@@ -739,7 +740,7 @@ struct op_ssend_gg_recv_merge
 	{}
 
 	//! execute the merge
-	template<typename T, typename D, typename S, template<typename> class layout_base, int ... prp> void execute(D & recv,S & v2,size_t i)
+	template<typename T, typename D, typename S, template<typename> class layout_base, int ... prp> void execute(D & recv,S & v2,size_t i,size_t opt)
 	{
 		op_ssend_gg_recv_merge_impl<sr>::template execute<T,D,S,layout_base,prp...>(recv,v2,i,start);
 	}
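The switch from getDevicePointer() to getDevicePointerNoCopy() in the hunks above is the functional core of the patch. My reading, hedged, based on openfpm's CudaMemory semantics: with GPU Direct, MPI has already written the message into device memory, so the unpack path must not trigger any host-to-device synchronization that would clobber those bytes with the stale host copy.

	// Contrast of the two accessors on a CudaMemory-backed receive buffer
	// (interpretation, not confirmed by this patch's text):
	void * d1 = recv_buf.get(i).getDevicePointer();       // may first sync the stale host
	                                                      // copy to the device, overwriting
	                                                      // what MPI already delivered there
	void * d2 = recv_buf.get(i).getDevicePointerNoCopy(); // just returns the device address

PtrMemory then wraps that address so a fake vector can be mapped over the received region without any copy.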
diff --git a/src/VCluster/VCluster_semantic_unit_tests.hpp b/src/VCluster/VCluster_semantic_unit_tests.hpp
index 0ad6843..8e3e6bd 100644
--- a/src/VCluster/VCluster_semantic_unit_tests.hpp
+++ b/src/VCluster/VCluster_semantic_unit_tests.hpp
@@ -1633,12 +1633,6 @@ BOOST_AUTO_TEST_CASE (Vcluster_semantic_sendrecv_6)
 }
 
-BOOST_AUTO_TEST_CASE( Vcluster_semantic_ssend_recv_layout_switch )
-{
-	test_ssend_recv_layout_switch(0);
-}
-
-
 BOOST_AUTO_TEST_SUITE_END()
 
 #endif /* OPENFPM_VCLUSTER_SRC_VCLUSTER_SEMANTIC_UNIT_TESTS_HPP_ */
diff --git a/src/VCluster/cuda/VCluster_semantic_unit_cuda_tests.cu b/src/VCluster/cuda/VCluster_semantic_unit_cuda_tests.cu
index b6f4a88..c73353d 100644
--- a/src/VCluster/cuda/VCluster_semantic_unit_cuda_tests.cu
+++ b/src/VCluster/cuda/VCluster_semantic_unit_cuda_tests.cu
@@ -4,125 +4,18 @@
 #include "VCluster/VCluster.hpp"
 #include "VCluster/cuda/VCluster_semantic_unit_tests_funcs.hpp"
 
-void test_ssend_recv_layout_switch(size_t opt)
-{
-	auto & v_cl = create_vcluster();
-
-	if (v_cl.size() > 10) {return;}
-
-	openfpm::vector<openfpm::vector<aggregate<float,float[3]>>> vd;
-	openfpm::vector_gpu<aggregate<float,float[3]>> collect;
-	openfpm::vector_gpu<aggregate<float,float[3]>> collect2;
-	openfpm::vector<size_t> prc_send;
-	openfpm::vector<size_t> prc_recv;
-	openfpm::vector<size_t> sz_recv;
-
-	vd.resize(v_cl.size());
-
-	for (size_t i = 0 ; i < vd.size() ; i++)
-	{
-		vd.get(i).resize(100);
-
-		for (size_t j = 0 ; j < vd.get(i).size() ; j++)
-		{
-			vd.get(i).template get<0>(j) = 10000*i + v_cl.rank()*100 + j;
-
-			vd.get(i).template get<1>(j)[0] = 400000 + 10000*i + v_cl.rank()*100 + j;
-			vd.get(i).template get<1>(j)[1] = 400000 + 10000*i + v_cl.rank()*100 + j;
-			vd.get(i).template get<1>(j)[2] = 400000 + 10000*i + v_cl.rank()*100 + j;
-		}
-
-		prc_send.add(i);
-
-		if (opt & MPI_GPU_DIRECT)
-		{
-			vd.get(i).template hostToDevice<0,1>();
-
-			// Reset host
-
-			for (size_t j = 0 ; j < vd.get(i).size() ; j++)
-			{
-				vd.get(i).template get<0>(j) = 0.0;
-
-				vd.get(i).template get<1>(j)[0] = 0.0;
-				vd.get(i).template get<1>(j)[1] = 0.0;
-				vd.get(i).template get<1>(j)[2] = 0.0;
-			}
-		}
-	}
-
-	v_cl.SSendRecv<openfpm::vector<aggregate<float,float[3]>>,decltype(collect),memory_traits_inte>
-	              (vd,collect,prc_send, prc_recv,sz_recv,opt);
-
-	v_cl.SSendRecvP<openfpm::vector<aggregate<float,float[3]>>,decltype(collect),memory_traits_inte,0,1>
-	               (vd,collect2,prc_send, prc_recv,sz_recv,opt);
-
-	// collect must have 100 * v_cl.size()
-
-	BOOST_REQUIRE_EQUAL(collect.size(),100*v_cl.size());
-	BOOST_REQUIRE_EQUAL(collect2.size(),100*v_cl.size());
-
-	// we reset the host collected data if data must be on device
-	if (opt & MPI_GPU_DIRECT)
-	{
-		for (size_t j = 0 ; j < collect.size() ; j++)
-		{
-			collect.template get<0>(j) = 0.0;
-
-			collect.template get<1>(j)[0] = 0.0;
-			collect.template get<1>(j)[1] = 0.0;
-			collect.template get<1>(j)[2] = 0.0;
-
-			collect2.template get<0>(j) = 0.0;
-
-			collect2.template get<1>(j)[0] = 0.0;
-			collect2.template get<1>(j)[1] = 0.0;
-			collect2.template get<1>(j)[2] = 0.0;
-		}
-	}
-
-	// from device to host
-
-	if (opt & MPI_GPU_DIRECT)
-	{
-		collect.template deviceToHost<0,1>();
-		collect2.template deviceToHost<0,1>();
-	}
-
-	// now we check what we received
-
-	bool match = true;
-	for (size_t i = 0 ; i < v_cl.size() ; i++)
-	{
-		for (size_t j = 0 ; j < 100 ; j++)
-		{
-			match &= collect.template get<0>(i*100 +j) == v_cl.rank()*10000 + i*100 + j;
-
-			match &= collect.template get<1>(i*100 +j)[0] == 400000 + v_cl.rank()*10000 + i*100 + j;
-			match &= collect.template get<1>(i*100 +j)[1] == 400000 + v_cl.rank()*10000 + i*100 + j;
-			match &= collect.template get<1>(i*100 +j)[2] == 400000 + v_cl.rank()*10000 + i*100 + j;
-
-			match &= collect2.template get<0>(i*100 +j) == v_cl.rank()*10000 + i*100 + j;
-
-			match &= collect2.template get<1>(i*100 +j)[0] == 400000 + v_cl.rank()*10000 + i*100 + j;
-			match &= collect2.template get<1>(i*100 +j)[1] == 400000 + v_cl.rank()*10000 + i*100 + j;
-			match &= collect2.template get<1>(i*100 +j)[2] == 400000 + v_cl.rank()*10000 + i*100 + j;
-		}
-
-		if (match == false){break;}
-	}
+BOOST_AUTO_TEST_SUITE( VCluster_cuda_tests )
 
-	BOOST_REQUIRE_EQUAL(match,true);
+BOOST_AUTO_TEST_CASE( Vcluster_semantic_ssend_recv_layout_switch )
+{
+	test_ssend_recv_layout_switch<HeapMemory>(0);
 }
 
-BOOST_AUTO_TEST_SUITE( VCluster_cuda_tests )
-
 BOOST_AUTO_TEST_CASE( Vcluster_semantic_gpu_direct )
 {
-	test_ssend_recv_layout_switch(MPI_GPU_DIRECT);
+	test_ssend_recv_layout_switch<CudaMemory>(MPI_GPU_DIRECT);
 }
 
 BOOST_AUTO_TEST_SUITE_END()
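Moving the body into the templated test_ssend_recv_layout_switch (added in the next file) lets one test body drive both memory backends; the two cases above reduce to these two instantiations (template arguments as reconstructed here):

	test_ssend_recv_layout_switch<HeapMemory>(0);               // host-staged exchange
	test_ssend_recv_layout_switch<CudaMemory>(MPI_GPU_DIRECT);  // GPU-direct exchange

It also removes the test from VCluster_semantic_unit_tests.hpp, which is compiled without NVCC, so the non-CUDA suite no longer references a symbol defined only in the .cu file.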
diff --git a/src/VCluster/cuda/VCluster_semantic_unit_tests_funcs.hpp b/src/VCluster/cuda/VCluster_semantic_unit_tests_funcs.hpp
index a0a4733..abba80f 100644
--- a/src/VCluster/cuda/VCluster_semantic_unit_tests_funcs.hpp
+++ b/src/VCluster/cuda/VCluster_semantic_unit_tests_funcs.hpp
@@ -8,7 +8,121 @@
 #ifndef VCLUSTER_SEMANTIC_UNIT_TESTS_FUNCS_HPP_
 #define VCLUSTER_SEMANTIC_UNIT_TESTS_FUNCS_HPP_
 
-void test_ssend_recv_layout_switch(size_t opt);
+template<typename Memory>
+void test_ssend_recv_layout_switch(size_t opt)
+{
+	auto & v_cl = create_vcluster<Memory>();
+
+	if (v_cl.size() > 10) {return;}
+
+	openfpm::vector<openfpm::vector<aggregate<float,float[3]>>> vd;
+	openfpm::vector_gpu<aggregate<float,float[3]>> collect;
+	openfpm::vector_gpu<aggregate<float,float[3]>> collect2;
+	openfpm::vector<size_t> prc_send;
+	openfpm::vector<size_t> prc_recv;
+	openfpm::vector<size_t> sz_recv;
+
+	vd.resize(v_cl.size());
+
+	for (size_t i = 0 ; i < vd.size() ; i++)
+	{
+		vd.get(i).resize(100);
+
+		for (size_t j = 0 ; j < vd.get(i).size() ; j++)
+		{
+			vd.get(i).template get<0>(j) = 10000*i + v_cl.rank()*100 + j;
+
+			vd.get(i).template get<1>(j)[0] = 400000 + 10000*i + v_cl.rank()*100 + j;
+			vd.get(i).template get<1>(j)[1] = 400000 + 10000*i + v_cl.rank()*100 + j;
+			vd.get(i).template get<1>(j)[2] = 400000 + 10000*i + v_cl.rank()*100 + j;
+		}
+
+		prc_send.add(i);
+
+		if (opt & MPI_GPU_DIRECT)
+		{
+			vd.get(i).template hostToDevice<0,1>();
+
+			// Reset host
+
+			for (size_t j = 0 ; j < vd.get(i).size() ; j++)
+			{
+				vd.get(i).template get<0>(j) = 0.0;
+
+				vd.get(i).template get<1>(j)[0] = 0.0;
+				vd.get(i).template get<1>(j)[1] = 0.0;
+				vd.get(i).template get<1>(j)[2] = 0.0;
+			}
+		}
+	}
+
+	v_cl.template SSendRecv<openfpm::vector<aggregate<float,float[3]>>,decltype(collect),memory_traits_inte>
+	                       (vd,collect,prc_send, prc_recv,sz_recv,opt);
+
+	v_cl.template SSendRecvP<openfpm::vector<aggregate<float,float[3]>>,decltype(collect),memory_traits_inte,0,1>
+	                        (vd,collect2,prc_send, prc_recv,sz_recv,opt);
+
+	// collect must have 100 * v_cl.size()
+
+	BOOST_REQUIRE_EQUAL(collect.size(),100*v_cl.size());
+	BOOST_REQUIRE_EQUAL(collect2.size(),100*v_cl.size());
+
+	// we reset the host collected data if data must be on device
+
+	if (opt & MPI_GPU_DIRECT)
+	{
+		for (size_t j = 0 ; j < collect.size() ; j++)
+		{
+			collect.template get<0>(j) = 0.0;
+
+			collect.template get<1>(j)[0] = 0.0;
+			collect.template get<1>(j)[1] = 0.0;
+			collect.template get<1>(j)[2] = 0.0;
+
+			collect2.template get<0>(j) = 0.0;
+
+			collect2.template get<1>(j)[0] = 0.0;
+			collect2.template get<1>(j)[1] = 0.0;
+			collect2.template get<1>(j)[2] = 0.0;
+		}
+	}
+
+	// from device to host
+
+	if (opt & MPI_GPU_DIRECT)
+	{
+		collect.template deviceToHost<0,1>();
+		collect2.template deviceToHost<0,1>();
+	}
+
+	// now we check what we received
+
+	bool match = true;
+	for (size_t i = 0 ; i < v_cl.size() ; i++)
+	{
+		for (size_t j = 0 ; j < 100 ; j++)
+		{
+			match &= collect.template get<0>(i*100 +j) == v_cl.rank()*10000 + i*100 + j;
+
+			match &= collect.template get<1>(i*100 +j)[0] == 400000 + v_cl.rank()*10000 + i*100 + j;
+			match &= collect.template get<1>(i*100 +j)[1] == 400000 + v_cl.rank()*10000 + i*100 + j;
+			match &= collect.template get<1>(i*100 +j)[2] == 400000 + v_cl.rank()*10000 + i*100 + j;
+
+			match &= collect2.template get<0>(i*100 +j) == v_cl.rank()*10000 + i*100 + j;
+
+			match &= collect2.template get<1>(i*100 +j)[0] == 400000 + v_cl.rank()*10000 + i*100 + j;
+			match &= collect2.template get<1>(i*100 +j)[1] == 400000 + v_cl.rank()*10000 + i*100 + j;
+			match &= collect2.template get<1>(i*100 +j)[2] == 400000 + v_cl.rank()*10000 + i*100 + j;
+		}
+
+		if (match == false){break;}
+	}
+
+	BOOST_REQUIRE_EQUAL(match,true);
+}
 
 #endif /* VCLUSTER_SEMANTIC_UNIT_TESTS_FUNCS_HPP_ */
-- 
GitLab
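The test's device-residency check works by scrubbing every host-side mirror to zero and copying back from the device only after the exchange; a real GPU-direct transfer is then the only way the expected non-zero values can reappear. In miniature, reusing lines from the test above:

	vd.get(i).template hostToDevice<0,1>();   // stage the payload on the device
	vd.get(i).template get<0>(j) = 0.0;       // ...then scrub the host copy
	// exchange with opt = MPI_GPU_DIRECT via SSendRecv / SSendRecvP
	collect.template deviceToHost<0,1>();     // results must originate on the device
	// the host-side match checks now prove the device path was actually used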