Commit 11a0f824 authored by incardon

Changes to make it work with GPU Direct

parent ec95a342
......@@ -8,7 +8,8 @@
#include "util/print_stack.hpp"
#include "util/math_util_complex.hpp"
Vcluster<> * global_v_cluster_private = NULL;
Vcluster<> * global_v_cluster_private_heap = NULL;
Vcluster<CudaMemory> * global_v_cluster_private_cuda = NULL;
//
std::vector<int> sieve_spf;
......
......@@ -55,6 +55,14 @@ class Vcluster: public Vcluster_base<InternalMemory>
inline static void process_recv(Vcluster & vcl, S & recv, openfpm::vector<size_t> * sz_recv,
openfpm::vector<size_t> * sz_recv_byte, op & op_param,size_t opt)
{
if (opt == MPI_GPU_DIRECT && !std::is_same<InternalMemory,CudaMemory>::value)
{
// In order to have this option activated, InternalMemory must be CudaMemory
std::cout << __FILE__ << ":" << __LINE__ << " error: in order to use MPI_GPU_DIRECT, VCluster must use CudaMemory internally; the most probable" <<
" cause of this problem is that you are using the MPI_GPU_DIRECT option with a non-GPU data structure" << std::endl;
}
vcl.process_receive_buffer_with_prp<op,T,S,layout_base,prp...>(recv,sz_recv,sz_recv_byte,op_param,opt);
}
};
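For context, a minimal sketch (not part of the commit) of how the GPU Direct path is meant to be requested; it reuses the types from the unit test at the end of this diff and the CudaMemory-backed create_vcluster added below:

void gpu_direct_exchange_sketch()
{
	// Only a Vcluster<CudaMemory> passes the InternalMemory check above
	auto & v_cl = create_vcluster<CudaMemory>();

	openfpm::vector<openfpm::vector_gpu_single<aggregate<float,float[3]>>> send;
	openfpm::vector_gpu<aggregate<float,float[3]>> recv;
	openfpm::vector<size_t> prc_send, prc_recv, sz_recv;

	// ... fill send and prc_send, then move the payload to the device with hostToDevice<...>() ...

	// MPI_GPU_DIRECT asks for the receive buffers to be exposed as device pointers
	v_cl.template SSendRecv<openfpm::vector_gpu_single<aggregate<float,float[3]>>,
	                        decltype(recv),memory_traits_inte>
	              (send,recv,prc_send,prc_recv,sz_recv,MPI_GPU_DIRECT);
}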
......@@ -131,7 +139,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
self_base::tags.clear();
// receive information
base_info bi(&this->recv_buf,prc_recv,sz_recv_byte,this->tags,opt);
base_info<InternalMemory> bi(&this->recv_buf,prc_recv,sz_recv_byte,this->tags,opt);
// Send and recv multiple messages
if (opt & RECEIVE_KNOWN)
......@@ -191,10 +199,11 @@ class Vcluster: public Vcluster_base<InternalMemory>
* \param size of the received data
*
*/
template<typename Memory>
struct base_info
{
//! Receive buffer
openfpm::vector<BMemory<HeapMemory>> * recv_buf;
openfpm::vector<BMemory<Memory>> * recv_buf;
//! receiving processor list
openfpm::vector<size_t> & prc;
//! size of each message
......@@ -206,7 +215,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
size_t opt;
//! constructor
base_info(openfpm::vector<BMemory<HeapMemory>> * recv_buf, openfpm::vector<size_t> & prc, openfpm::vector<size_t> & sz, openfpm::vector<size_t> & tags,size_t opt)
base_info(openfpm::vector<BMemory<Memory>> * recv_buf, openfpm::vector<size_t> & prc, openfpm::vector<size_t> & sz, openfpm::vector<size_t> & tags,size_t opt)
:recv_buf(recv_buf),prc(prc),sz(sz),tags(tags),opt(opt)
{}
};
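Illustrative only: the new Memory parameter decides where the receive buffers referenced by base_info live (base_info itself is an internal helper of Vcluster, so the buffer types below are the only user-visible part):

openfpm::vector<BMemory<HeapMemory>> recv_host;    // buffers allocated in host memory (previous behaviour)
openfpm::vector<BMemory<CudaMemory>> recv_device;  // buffers allocated on the device, usable with MPI_GPU_DIRECT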
......@@ -226,7 +235,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
*/
static void * msg_alloc(size_t msg_i ,size_t total_msg, size_t total_p, size_t i, size_t ri, size_t tag, void * ptr)
{
base_info & rinfo = *(base_info *)ptr;
base_info<InternalMemory> & rinfo = *(base_info<InternalMemory> *)ptr;
if (rinfo.recv_buf == NULL)
{
......@@ -249,7 +258,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
if (rinfo.opt & MPI_GPU_DIRECT)
{
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
return rinfo.recv_buf->last().getDevicePointer();
return rinfo.recv_buf->last().getDevicePointerNoCopy();
#else
return rinfo.recv_buf->last().getPointer();
#endif
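A sketch (not part of the commit) of how the compile-time guard above is typically paired with Open MPI's runtime query; MPIX_Query_cuda_support() is an Open MPI extension and may be absent in other MPI implementations:

#include <mpi.h>
#if defined(OPEN_MPI)
#include <mpi-ext.h>        // defines MPIX_CUDA_AWARE_SUPPORT for Open MPI
#endif

static bool cuda_aware_mpi_available()
{
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
	return MPIX_Query_cuda_support() == 1;   // built with CUDA support, confirm it at runtime
#else
	return false;                            // no CUDA-aware support compiled in, fall back to host pointers
#endif
}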
......@@ -274,7 +283,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
*/
static void * msg_alloc_known(size_t msg_i ,size_t total_msg, size_t total_p, size_t i, size_t ri, size_t tag, void * ptr)
{
base_info & rinfo = *(base_info *)ptr;
base_info<InternalMemory> & rinfo = *(base_info<InternalMemory> *)ptr;
if (rinfo.recv_buf == NULL)
{
......@@ -425,7 +434,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
self_base::tags.clear();
// receive information
base_info bi(&this->recv_buf,prc,sz,this->tags,0);
base_info<InternalMemory> bi(&this->recv_buf,prc,sz,this->tags,0);
// Send and recv multiple messages
self_base::sendrecvMultipleMessagesNBX(send_req.size(),NULL,NULL,NULL,msg_alloc,&bi);
......@@ -479,7 +488,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
self_base::tags.clear();
// receive information
base_info bi(NULL,prc,sz,self_base::tags,0);
base_info<InternalMemory> bi(NULL,prc,sz,self_base::tags,0);
// Send and recv multiple messages
self_base::sendrecvMultipleMessagesNBX(send_prc_.size(),(size_t *)sz.getPointer(),(size_t *)send_prc_.getPointer(),(void **)send_buf.getPointer(),msg_alloc,(void *)&bi,NONE);
......@@ -544,7 +553,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
self_base::tags.clear();
// receive information
base_info bi(&this->recv_buf,prc,sz,this->tags,0);
base_info<InternalMemory> bi(&this->recv_buf,prc,sz,this->tags,0);
// Send and recv multiple messages
self_base::sendrecvMultipleMessagesNBX(prc.size(),(size_t *)sz_byte.getPointer(),(size_t *)prc.getPointer(),(void **)send_buf.getPointer(),msg_alloc,(void *)&bi);
......@@ -565,7 +574,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
self_base::tags.clear();
// receive information
base_info bi(&this->recv_buf,prc,sz,this->tags,0);
base_info<InternalMemory> bi(&this->recv_buf,prc,sz,this->tags,0);
// Send and recv multiple messages
self_base::sendrecvMultipleMessagesNBX(send_req.size(),NULL,NULL,NULL,msg_alloc,&bi);
......@@ -630,7 +639,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
// we sort based on processor
rcv.sort();
openfpm::vector<BMemory<HeapMemory>> recv_ord;
openfpm::vector<BMemory<InternalMemory>> recv_ord;
recv_ord.resize(rcv.size());
openfpm::vector<size_t> prc_ord;
......@@ -747,7 +756,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
op_ssend_recv_add<void> opa;
// process the received information
process_receive_buffer_with_prp<op_ssend_recv_add<void>,T,S,layout_base,prp...>(recv,&sz_recv,&sz_recv_byte,opa);
process_receive_buffer_with_prp<op_ssend_recv_add<void>,T,S,layout_base,prp...>(recv,&sz_recv,&sz_recv_byte,opa,opt);
return true;
}
......@@ -851,7 +860,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
prepare_send_buffer<op,T,S,layout_base>(send,recv,prc_send,prc_recv,recv_sz,opt);
// process the received information
process_receive_buffer_with_prp<op,T,S,layout_base,prp...>(recv,NULL,NULL,op_param);
process_receive_buffer_with_prp<op,T,S,layout_base,prp...>(recv,NULL,NULL,op_param,opt);
return true;
}
......@@ -862,7 +871,8 @@ class Vcluster: public Vcluster_base<InternalMemory>
// Function to initialize the global VCluster //
extern Vcluster<> * global_v_cluster_private;
extern Vcluster<> * global_v_cluster_private_heap;
extern Vcluster<CudaMemory> * global_v_cluster_private_cuda;
/*! \brief Initialize a global instance of Runtime Virtual Cluster Machine
*
......@@ -872,25 +882,44 @@ extern Vcluster<> * global_v_cluster_private;
static inline void init_global_v_cluster_private(int *argc, char ***argv)
{
if (global_v_cluster_private == NULL)
{global_v_cluster_private = new Vcluster<>(argc,argv);}
if (global_v_cluster_private_heap == NULL)
{global_v_cluster_private_heap = new Vcluster<>(argc,argv);}
if (global_v_cluster_private_cuda == NULL)
{global_v_cluster_private_cuda = new Vcluster<CudaMemory>(argc,argv);}
}
static inline void delete_global_v_cluster_private()
{
delete global_v_cluster_private;
delete global_v_cluster_private_heap;
delete global_v_cluster_private_cuda;
}
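Typical lifecycle, as a sketch, assuming (as the error messages in this file suggest) that openfpm_init wraps init_global_v_cluster_private and that openfpm_finalize calls delete_global_v_cluster_private:

int main(int argc, char* argv[])
{
	openfpm_init(&argc,&argv);            // creates both the HeapMemory and the CudaMemory global clusters

	auto & v_cl = create_vcluster<CudaMemory>();
	// ... communication ...

	openfpm_finalize();                   // deletes both global clusters
	return 0;
}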
static inline Vcluster<> & create_vcluster()
template<typename Memory>
struct get_vcl
{
#ifdef SE_CLASS1
static Vcluster<Memory> & get()
{
return *global_v_cluster_private_heap;
}
};
if (global_v_cluster_private == NULL)
std::cerr << __FILE__ << ":" << __LINE__ << " Error you must call openfpm_init before using any distributed data structures";
template<>
struct get_vcl<CudaMemory>
{
static Vcluster<CudaMemory> & get()
{
return *global_v_cluster_private_cuda;
}
};
#endif
template<typename Memory = HeapMemory>
static inline Vcluster<Memory> & create_vcluster()
{
if (global_v_cluster_private_heap == NULL)
{std::cerr << __FILE__ << ":" << __LINE__ << " Error: you must call openfpm_init before using any distributed data structures";}
return *global_v_cluster_private;
return get_vcl<Memory>::get();
}
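Usage of the new memory-templated factory (sketch):

auto & v_cl_host = create_vcluster();               // defaults to HeapMemory, same behaviour as before
auto & v_cl_gpu  = create_vcluster<CudaMemory>();   // CudaMemory-backed cluster, required for MPI_GPU_DIRECT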
......
......@@ -353,6 +353,12 @@ public:
*/
mgpu::standard_context_t & getmgpuContext()
{
if (context == NULL)
{
std::cout << __FILE__ << ":" << __LINE__ << " error: it seems that the modern gpu context is not initialized. "
"Either a compatible working cuda device has not been found, or openfpm_init has been called in a file that was not compiled with NVCC" << std::endl;
}
return *context;
}
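Sketch of the expected use: the context is only valid when a CUDA device was found and openfpm_init ran in a translation unit compiled with NVCC.

auto & v_cl = create_vcluster<CudaMemory>();
mgpu::standard_context_t & ctx = v_cl.getmgpuContext();   // prints the warning above if no context was created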
......
......@@ -11,13 +11,13 @@
#include "memory/BHeapMemory.hpp"
#include "Packer_Unpacker/has_max_prop.hpp"
template<bool result, typename T, typename S, template<typename> class layout_base>
template<bool result, typename T, typename S, template<typename> class layout_base, typename Memory>
struct unpack_selector_with_prp
{
template<typename op,
int ... prp>
static void call_unpack(S & recv,
openfpm::vector<BMemory<HeapMemory>> & recv_buf,
openfpm::vector<BMemory<Memory>> & recv_buf,
openfpm::vector<size_t> * sz,
openfpm::vector<size_t> * sz_byte,
op & op_param,
......@@ -134,14 +134,14 @@ struct unpack_each_prop_buffer
*
*/
template<typename sT, template<typename> class layout_base>
template<typename sT, template<typename> class layout_base,typename Memory>
struct process_receive_mem_traits_inte
{
//! set of pointers
size_t i;
//! Receive buffer
openfpm::vector<BMemory<HeapMemory>> & recv_buf;
openfpm::vector<BMemory<Memory>> & recv_buf;
//! Fake vector that map over received memory
openfpm::vector<typename sT::value_type,PtrMemory,typename layout_base<typename sT::value_type>::type,layout_base,openfpm::grow_policy_identity> & v2;
......@@ -157,7 +157,7 @@ struct process_receive_mem_traits_inte
*
*/
inline process_receive_mem_traits_inte(openfpm::vector<typename sT::value_type,PtrMemory,typename layout_base<typename sT::value_type>::type,layout_base,openfpm::grow_policy_identity> & v2,
openfpm::vector<BMemory<HeapMemory>> & recv_buf,
openfpm::vector<BMemory<Memory>> & recv_buf,
size_t i,
size_t opt)
:i(i),recv_buf(recv_buf),v2(v2),opt(opt)
......@@ -178,7 +178,7 @@ struct process_receive_mem_traits_inte
{
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
// add the received particles to the vector
ptr1 = new PtrMemory(recv_buf.get(i).getDevicePointer(),recv_buf.get(i).size());
ptr1 = new PtrMemory(recv_buf.get(i).getDevicePointerNoCopy(),recv_buf.get(i).size());
#else
// add the received particles to the vector
ptr1 = new PtrMemory(recv_buf.get(i).getPointer(),recv_buf.get(i).size());
......@@ -196,11 +196,11 @@ struct process_receive_mem_traits_inte
}
};
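Conceptually, the CUDA-aware branch above maps the fake vector straight onto the received device memory; switching to getDevicePointerNoCopy() presumably avoids the synchronizing host-to-device copy that getDevicePointer() performs, which would be wasted work since CUDA-aware MPI already wrote the message to the device. As a sketch:

// recv_buf.get(i) already holds the message in device memory (CUDA-aware MPI wrote it there),
// so wrap the device pointer without triggering any implicit copy
PtrMemory * ptr1 = new PtrMemory(recv_buf.get(i).getDevicePointerNoCopy(),
                                 recv_buf.get(i).size());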
template<bool inte_or_lin,typename T, typename S, template<typename> class layout_base>
template<bool inte_or_lin,typename T, typename S, template<typename> class layout_base,typename Memory>
struct unpack_selector_with_prp_lin
{
template<typename op, unsigned int ... prp> static int call_unpack_impl(S & recv,
openfpm::vector<BMemory<HeapMemory>> & recv_buf,
openfpm::vector<BMemory<Memory>> & recv_buf,
openfpm::vector<size_t> * sz,
openfpm::vector<size_t> * sz_byte,
op & op_param,
......@@ -210,7 +210,7 @@ struct unpack_selector_with_prp_lin
// create vector representation to a piece of memory already allocated
openfpm::vector<typename T::value_type,PtrMemory,typename layout_base<typename T::value_type>::type,layout_base,openfpm::grow_policy_identity> v2;
process_receive_mem_traits_inte<T,layout_base> prmti(v2,recv_buf,i,opt);
process_receive_mem_traits_inte<T,layout_base,Memory> prmti(v2,recv_buf,i,opt);
boost::mpl::for_each_ref<boost::mpl::range_c<int,0,T::value_type::max_prop>>(prmti);
......@@ -233,11 +233,11 @@ struct unpack_selector_with_prp_lin
}
};
template<typename T, typename S, template<typename> class layout_base>
struct unpack_selector_with_prp_lin<true,T,S,layout_base>
template<typename T, typename S, template<typename> class layout_base, typename Memory>
struct unpack_selector_with_prp_lin<true,T,S,layout_base,Memory>
{
template<typename op, unsigned int ... prp> static int call_unpack_impl(S & recv,
openfpm::vector<BMemory<HeapMemory>> & recv_buf,
openfpm::vector<BMemory<Memory>> & recv_buf,
openfpm::vector<size_t> * sz,
openfpm::vector<size_t> * sz_byte,
op & op_param,
......@@ -278,11 +278,11 @@ struct unpack_selector_with_prp_lin<true,T,S,layout_base>
typedef aggregate<int,int> dummy_type;
//
template<typename T, typename S, template<typename> class layout_base>
struct unpack_selector_with_prp<true,T,S,layout_base>
template<typename T, typename S, template<typename> class layout_base, typename Memory>
struct unpack_selector_with_prp<true,T,S,layout_base,Memory>
{
template<typename op, unsigned int ... prp> static void call_unpack(S & recv,
openfpm::vector<BMemory<HeapMemory>> & recv_buf,
openfpm::vector<BMemory<Memory>> & recv_buf,
openfpm::vector<size_t> * sz,
openfpm::vector<size_t> * sz_byte,
op & op_param,
......@@ -293,7 +293,7 @@ struct unpack_selector_with_prp<true,T,S,layout_base>
for (size_t i = 0 ; i < recv_buf.size() ; )
{
i += unpack_selector_with_prp_lin<is_layout_mlin<layout_base<dummy_type>>::value,T,S,layout_base>::template call_unpack_impl<op,prp...>(recv,recv_buf,sz,sz_byte,op_param,i,opt);
i += unpack_selector_with_prp_lin<is_layout_mlin<layout_base<dummy_type>>::value,T,S,layout_base,Memory>::template call_unpack_impl<op,prp...>(recv,recv_buf,sz,sz_byte,op_param,i,opt);
}
}
};
......@@ -315,9 +315,9 @@ struct call_serialize_variadic<index_tuple<prp...>>
Packer<T,HeapMemory>::template pack<prp...>(mem,send,sts);
}
template<typename op, typename T, typename S, template<typename> class layout_base>
template<typename op, typename T, typename S, template<typename> class layout_base, typename Memory>
inline static void call_unpack(S & recv,
openfpm::vector<BMemory<HeapMemory>> & recv_buf,
openfpm::vector<BMemory<Memory>> & recv_buf,
openfpm::vector<size_t> * sz,
openfpm::vector<size_t> * sz_byte,
op & op_param,
......@@ -325,7 +325,7 @@ struct call_serialize_variadic<index_tuple<prp...>>
{
const bool result = has_pack_gen<typename T::value_type>::value == false && is_vector<T>::value == true;
unpack_selector_with_prp<result, T, S,layout_base>::template call_unpack<op,prp...>(recv, recv_buf, sz, sz_byte, op_param,opt);
unpack_selector_with_prp<result, T, S,layout_base,Memory>::template call_unpack<op,prp...>(recv, recv_buf, sz, sz_byte, op_param,opt);
}
};
......@@ -505,8 +505,9 @@ struct pack_unpack_cond_with_prp
}
}
template<typename Memory>
static void unpacking(S & recv,
openfpm::vector<BMemory<HeapMemory>> & recv_buf,
openfpm::vector<BMemory<Memory>> & recv_buf,
openfpm::vector<size_t> * sz,
openfpm::vector<size_t> * sz_byte,
op & op_param,
......@@ -673,7 +674,7 @@ struct op_ssend_recv_merge
typename S,
template <typename> class layout_base,
int ... prp>
void execute(D & recv,S & v2,size_t i)
void execute(D & recv,S & v2,size_t i,size_t opt)
{
op_ssend_recv_merge_impl<sr,op>::template execute<T,D,S,layout_base,prp...>(recv,v2,i,opart);
}
......@@ -739,7 +740,7 @@ struct op_ssend_gg_recv_merge
{}
//! execute the merge
template<bool sr, typename T, typename D, typename S, template<typename> class layout_base, int ... prp> void execute(D & recv,S & v2,size_t i)
template<bool sr, typename T, typename D, typename S, template<typename> class layout_base, int ... prp> void execute(D & recv,S & v2,size_t i,size_t opt)
{
op_ssend_gg_recv_merge_impl<sr>::template execute<T,D,S,layout_base,prp...>(recv,v2,i,start);
}
......
......@@ -1633,12 +1633,6 @@ BOOST_AUTO_TEST_CASE (Vcluster_semantic_sendrecv_6)
}
BOOST_AUTO_TEST_CASE( Vcluster_semantic_ssend_recv_layout_switch )
{
test_ssend_recv_layout_switch(0);
}
BOOST_AUTO_TEST_SUITE_END()
#endif /* OPENFPM_VCLUSTER_SRC_VCLUSTER_SEMANTIC_UNIT_TESTS_HPP_ */
......@@ -4,125 +4,18 @@
#include "VCluster/VCluster.hpp"
#include "VCluster/cuda/VCluster_semantic_unit_tests_funcs.hpp"
void test_ssend_recv_layout_switch(size_t opt)
{
auto & v_cl = create_vcluster();
if (v_cl.size() > 10) {return;}
openfpm::vector<openfpm::vector_gpu_single<aggregate<float,float[3]>>> vd;
openfpm::vector_gpu<aggregate<float,float[3]>> collect;
openfpm::vector_gpu<aggregate<float,float[3]>> collect2;
openfpm::vector<size_t> prc_send;
openfpm::vector<size_t> prc_recv;
openfpm::vector<size_t> sz_recv;
vd.resize(v_cl.size());
for (size_t i = 0 ; i < vd.size() ; i++)
{
vd.get(i).resize(100);
for (size_t j = 0 ; j < vd.get(i).size() ; j++)
{
vd.get(i).template get<0>(j) = 10000*i + v_cl.rank()*100 + j;
vd.get(i).template get<1>(j)[0] = 400000 + 10000*i + v_cl.rank()*100 + j;
vd.get(i).template get<1>(j)[1] = 400000 + 10000*i + v_cl.rank()*100 + j;
vd.get(i).template get<1>(j)[2] = 400000 + 10000*i + v_cl.rank()*100 + j;
}
prc_send.add(i);
if (opt & MPI_GPU_DIRECT)
{
vd.get(i).template hostToDevice<0,1>();
// Reset host
for (size_t j = 0 ; j < vd.get(i).size() ; j++)
{
vd.get(i).template get<0>(j) = 0.0;
vd.get(i).template get<1>(j)[0] = 0.0;
vd.get(i).template get<1>(j)[1] = 0.0;
vd.get(i).template get<1>(j)[2] = 0.0;
}
}
}
v_cl.SSendRecv<openfpm::vector_gpu_single<aggregate<float,float[3]>>,decltype(collect),memory_traits_inte>
(vd,collect,prc_send, prc_recv,sz_recv,opt);
v_cl.SSendRecvP<openfpm::vector_gpu_single<aggregate<float,float[3]>>,decltype(collect),memory_traits_inte,0,1>
(vd,collect2,prc_send, prc_recv,sz_recv,opt);
// collect must have 100 * v_cl.size()
BOOST_REQUIRE_EQUAL(collect.size(),100*v_cl.size());
BOOST_REQUIRE_EQUAL(collect2.size(),100*v_cl.size());
// we reset the host collected data if data must be on device
if (opt & MPI_GPU_DIRECT)
{
for (size_t j = 0 ; j < collect.size() ; j++)
{
collect.template get<0>(j) = 0.0;
collect.template get<1>(j)[0] = 0.0;
collect.template get<1>(j)[1] = 0.0;
collect.template get<1>(j)[2] = 0.0;
collect2.template get<0>(j) = 0.0;
collect2.template get<1>(j)[0] = 0.0;
collect2.template get<1>(j)[1] = 0.0;
collect2.template get<1>(j)[2] = 0.0;
}
}
// from device to host
if (opt & MPI_GPU_DIRECT)
{
collect.template deviceToHost<0,1>();
collect2.template deviceToHost<0,1>();
}
// now we check what we received
// check what we received
bool match = true;
for (size_t i = 0 ; i < v_cl.size() ; i++)
{
for (size_t j = 0 ; j < 100 ; j++)
{
match &= collect.template get<0>(i*100 +j) == v_cl.rank()*10000 + i*100 + j;
match &= collect.template get<1>(i*100 +j)[0] == 400000 + v_cl.rank()*10000 + i*100 + j;
match &= collect.template get<1>(i*100 +j)[1] == 400000 + v_cl.rank()*10000 + i*100 + j;
match &= collect.template get<1>(i*100 +j)[2] == 400000 + v_cl.rank()*10000 + i*100 + j;
match &= collect2.template get<0>(i*100 +j) == v_cl.rank()*10000 + i*100 + j;
match &= collect2.template get<1>(i*100 +j)[0] == 400000 + v_cl.rank()*10000 + i*100 + j;
match &= collect2.template get<1>(i*100 +j)[1] == 400000 + v_cl.rank()*10000 + i*100 + j;
match &= collect2.template get<1>(i*100 +j)[2] == 400000 + v_cl.rank()*10000 + i*100 + j;
}
if (match == false){break;}
}
BOOST_AUTO_TEST_SUITE( VCluster_cuda_tests )
BOOST_REQUIRE_EQUAL(match,true);
BOOST_AUTO_TEST_CASE( Vcluster_semantic_ssend_recv_layout_switch )
{
test_ssend_recv_layout_switch<HeapMemory>(0);
}
BOOST_AUTO_TEST_SUITE( VCluster_cuda_tests )
BOOST_AUTO_TEST_CASE( Vcluster_semantic_gpu_direct )
{
test_ssend_recv_layout_switch(MPI_GPU_DIRECT);
test_ssend_recv_layout_switch<CudaMemory>(MPI_GPU_DIRECT);
}
BOOST_AUTO_TEST_SUITE_END()
......@@ -8,7 +8,121 @@
#ifndef VCLUSTER_SEMANTIC_UNIT_TESTS_FUNCS_HPP_
#define VCLUSTER_SEMANTIC_UNIT_TESTS_FUNCS_HPP_
void test_ssend_recv_layout_switch(size_t opt);
template<typename Memory>
void test_ssend_recv_layout_switch(size_t opt)
{
auto & v_cl = create_vcluster<Memory>();
if (v_cl.size() > 10) {return;}
openfpm::vector<openfpm::vector_gpu_single<aggregate<float,float[3]>>> vd;
openfpm::vector_gpu<aggregate<float,float[3]>> collect;
openfpm::vector_gpu<aggregate<float,float[3]>> collect2;
openfpm::vector<size_t> prc_send;
openfpm::vector<size_t> prc_recv;
openfpm::vector<size_t> sz_recv;
vd.resize(v_cl.size());
for (size_t i = 0 ; i < vd.size() ; i++)
{
vd.get(i).resize(100);
for (size_t j = 0 ; j < vd.get(i).size() ; j++)
{
vd.get(i).template get<0>(j) = 10000*i + v_cl.rank()*100 + j;
vd.get(i).template get<1>(j)[0] = 400000 + 10000*i + v_cl.rank()*100 + j;
vd.get(i).template get<1>(j)[1] = 400000 + 10000*i + v_cl.rank()*100 + j;
vd.get(i).template get<1>(j)[2] = 400000 + 10000*i + v_cl.rank()*100 + j;
}
prc_send.add(i);
if (opt & MPI_GPU_DIRECT)
{
vd.get(i).template hostToDevice<0,1>();
// Reset host
for (size_t j = 0 ; j < vd.get(i).size() ; j++)
{
vd.get(i).template get<0>(j) = 0.0;
vd.get(i).template get<1>(j)[0] = 0.0;
vd.get(i).template get<1>(j)[1] = 0.0;
vd.get(i).template get<1>(j)[2] = 0.0;
}
}
}
v_cl.template SSendRecv<openfpm::vector_gpu_single<aggregate<float,float[3]>>,decltype(collect),memory_traits_inte>
(vd,collect,prc_send, prc_recv,sz_recv,opt);
v_cl.template SSendRecvP<openfpm::vector_gpu_single<aggregate<float,float[3]>>,decltype(collect),memory_traits_inte,0,1>
(vd,collect2,prc_send, prc_recv,sz_recv,opt);
// collect and collect2 must each have 100 * v_cl.size() elements
BOOST_REQUIRE_EQUAL(collect.size(),100*v_cl.size());
BOOST_REQUIRE_EQUAL(collect2.size(),100*v_cl.size());
// we reset the host collected data if data must be on device
if (opt & MPI_GPU_DIRECT)
{
for (size_t j = 0 ; j < collect.size() ; j++)
{
collect.template get<0>(j) = 0.0;
collect.template get<1>(j)[0] = 0.0;
collect.template get<1>(j)[1] = 0.0;
collect.template get<1>(j)[2] = 0.0;
collect2.template get<0>(j) = 0.0;
collect2.template get<1>(j)[0] = 0.0;
collect2.template get<1>(j)[1] = 0.0;
collect2.template get<1>(j)[2] = 0.0;
}
}
// from device to host
if (opt & MPI_GPU_DIRECT)
{
collect.template deviceToHost<0,1>();
collect2.template deviceToHost<0,1>();
}
// now we check what we received
bool match = true;
for (size_t i = 0 ; i < v_cl.size() ; i++)
{
for (size_t j = 0 ; j < 100 ; j++)
{
match &= collect.template get<0>(i*100 +j) == v_cl.rank()*10000 + i*100 + j;
std::cout << "COLLECT: " << collect.template get<0>(i*100 +j) << std::endl;
match &= collect.template get<1>(i*100 +j)[0] == 400000 + v_cl.rank()*10000 + i*100 + j;
match &= collect.template get<1>(i*100 +j)[1] == 400000 + v_cl.rank()*10000 + i*100 + j;
match &= collect.template get<1>(i*100 +j)[2] == 400000 + v_cl.rank()*10000 + i*100 + j;
match &= collect2.template get<0>(i*100 +j) == v_cl.rank()*10000 + i*100 + j;
match &= collect2.template get<1>(i*100 +j)[0] == 400000 + v_cl.rank()*10000 + i*100 + j;
match &= collect2.template get<1>(i*100 +j)[1] == 400000 + v_cl.rank()*10000 + i*100 + j;
match &= collect2.template get<1>(i*100 +j)[2] == 400000 + v_cl.rank()*10000 + i*100 + j;
}
if (match == false){break;}
}
BOOST_REQUIRE_EQUAL(match,true);
}
#endif /* VCLUSTER_SEMANTIC_UNIT_TESTS_FUNCS_HPP_ */