Commit 11a0f824 authored by incardon

Changes to make it work with GPU Direct

parent ec95a342
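For orientation, a minimal sketch of how the GPU Direct path introduced by this commit is exercised (illustrative only, adapted from the test changes further below; the aggregate type, property indices and container names are examples, not part of the diff):

	#include "VCluster/VCluster.hpp"

	int main(int argc, char* argv[])
	{
		openfpm_init(&argc,&argv);

		// CUDA-backed global VCluster (new in this commit), required for MPI_GPU_DIRECT
		Vcluster<CudaMemory> & v_cl = create_vcluster<CudaMemory>();

		openfpm::vector<openfpm::vector_gpu_single<aggregate<float,float[3]>>> send;
		openfpm::vector_gpu<aggregate<float,float[3]>> collect;
		openfpm::vector<size_t> prc_send, prc_recv, sz_recv;

		// ... fill send and prc_send, then move the data to the device,
		//     e.g. send.get(i).template hostToDevice<0,1>(); ...

		// MPI_GPU_DIRECT lets the NBX exchange hand device pointers straight to a CUDA-aware MPI
		v_cl.SSendRecv<openfpm::vector_gpu_single<aggregate<float,float[3]>>,decltype(collect),memory_traits_inte>
		              (send,collect,prc_send,prc_recv,sz_recv,MPI_GPU_DIRECT);

		// the received data lives on the device; copy back before inspecting it on the host
		collect.template deviceToHost<0,1>();

		openfpm_finalize();
	}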
@@ -8,7 +8,8 @@
 #include "util/print_stack.hpp"
 #include "util/math_util_complex.hpp"
-Vcluster<> * global_v_cluster_private = NULL;
+Vcluster<> * global_v_cluster_private_heap = NULL;
+Vcluster<CudaMemory> * global_v_cluster_private_cuda = NULL;
 //
 std::vector<int> sieve_spf;
...
@@ -55,6 +55,14 @@ class Vcluster: public Vcluster_base<InternalMemory>
 	inline static void process_recv(Vcluster & vcl, S & recv, openfpm::vector<size_t> * sz_recv,
 			openfpm::vector<size_t> * sz_recv_byte, op & op_param, size_t opt)
 	{
+		if (opt == MPI_GPU_DIRECT && !std::is_same<InternalMemory,CudaMemory>::value)
+		{
+			// In order to activate this option, InternalMemory must be CudaMemory
+			std::cout << __FILE__ << ":" << __LINE__ << " error: in order to use MPI_GPU_DIRECT, VCluster must use CudaMemory internally; the most probable" <<
+			             " cause of this problem is that the MPI_GPU_DIRECT option is being used with a non-GPU data-structure" << std::endl;
+		}
+
 		vcl.process_receive_buffer_with_prp<op,T,S,layout_base,prp...>(recv,sz_recv,sz_recv_byte,op_param,opt);
 	}
 };
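For context, the misuse this new check reports (illustrative, not part of the diff): requesting MPI_GPU_DIRECT from the default heap-backed VCluster, whose InternalMemory is HeapMemory rather than CudaMemory, now prints the error above instead of failing silently. Reusing the containers declared in the sketch after the commit header:

	// hypothetical misuse: InternalMemory is HeapMemory, so MPI_GPU_DIRECT cannot be honoured
	Vcluster<> & v_cl_heap = create_vcluster();
	v_cl_heap.SSendRecv<openfpm::vector_gpu_single<aggregate<float,float[3]>>,decltype(collect),memory_traits_inte>
	                   (send,collect,prc_send,prc_recv,sz_recv,MPI_GPU_DIRECT);   // triggers the warning above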
@@ -131,7 +139,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
 		self_base::tags.clear();
 		// receive information
-		base_info bi(&this->recv_buf,prc_recv,sz_recv_byte,this->tags,opt);
+		base_info<InternalMemory> bi(&this->recv_buf,prc_recv,sz_recv_byte,this->tags,opt);
 		// Send and recv multiple messages
 		if (opt & RECEIVE_KNOWN)
@@ -191,10 +199,11 @@ class Vcluster: public Vcluster_base<InternalMemory>
 	 * \param size of the received data
 	 *
 	 */
+	template<typename Memory>
 	struct base_info
 	{
 		//! Receive buffer
-		openfpm::vector<BMemory<HeapMemory>> * recv_buf;
+		openfpm::vector<BMemory<Memory>> * recv_buf;
 		//! receiving processor list
 		openfpm::vector<size_t> & prc;
 		//! size of each message
@@ -206,7 +215,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
 		size_t opt;
 		//! constructor
-		base_info(openfpm::vector<BMemory<HeapMemory>> * recv_buf, openfpm::vector<size_t> & prc, openfpm::vector<size_t> & sz, openfpm::vector<size_t> & tags,size_t opt)
+		base_info(openfpm::vector<BMemory<Memory>> * recv_buf, openfpm::vector<size_t> & prc, openfpm::vector<size_t> & sz, openfpm::vector<size_t> & tags,size_t opt)
 		:recv_buf(recv_buf),prc(prc),sz(sz),tags(tags),opt(opt)
 		{}
 	};
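Templating base_info on the memory backend means that, when InternalMemory is CudaMemory, the NBX receive bookkeeping points at device-backed BMemory buffers, so msg_alloc can hand MPI a pointer that already lives on the GPU. An illustrative instantiation (not part of the diff), mirroring the constructor above:

	// hypothetical example: receive bookkeeping over CUDA-backed buffers
	openfpm::vector<BMemory<CudaMemory>> recv_buf;
	openfpm::vector<size_t> prc, sz, tags;
	base_info<CudaMemory> bi(&recv_buf, prc, sz, tags, MPI_GPU_DIRECT);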
@@ -226,7 +235,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
 	 */
 	static void * msg_alloc(size_t msg_i ,size_t total_msg, size_t total_p, size_t i, size_t ri, size_t tag, void * ptr)
 	{
-		base_info & rinfo = *(base_info *)ptr;
+		base_info<InternalMemory> & rinfo = *(base_info<InternalMemory> *)ptr;
 		if (rinfo.recv_buf == NULL)
 		{
@@ -249,7 +258,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
 		if (rinfo.opt & MPI_GPU_DIRECT)
 		{
 #if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
-			return rinfo.recv_buf->last().getDevicePointer();
+			return rinfo.recv_buf->last().getDevicePointerNoCopy();
 #else
 			return rinfo.recv_buf->last().getPointer();
 #endif
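MPIX_CUDA_AWARE_SUPPORT is the compile-time flag exported by Open MPI's mpi-ext.h. A short, hedged sketch of how an application can check CUDA-awareness before requesting MPI_GPU_DIRECT (this check is not part of the diff, and MPIX_Query_cuda_support is an Open MPI extension that may be absent in other MPI implementations):

	#include <cstdio>
	#include <mpi.h>
	#include <mpi-ext.h>   // Open MPI extension header

	static void report_cuda_aware_mpi()
	{
	#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
		// compiled against a CUDA-aware Open MPI; also ask the runtime
		std::printf("CUDA-aware MPI: compile-time yes, run-time %d\n", MPIX_Query_cuda_support());
	#else
		std::printf("CUDA-aware MPI support not detected at compile time\n");
	#endif
	}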
@@ -274,7 +283,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
 	 */
 	static void * msg_alloc_known(size_t msg_i ,size_t total_msg, size_t total_p, size_t i, size_t ri, size_t tag, void * ptr)
 	{
-		base_info & rinfo = *(base_info *)ptr;
+		base_info<InternalMemory> & rinfo = *(base_info<InternalMemory> *)ptr;
 		if (rinfo.recv_buf == NULL)
 		{
@@ -425,7 +434,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
 		self_base::tags.clear();
 		// receive information
-		base_info bi(&this->recv_buf,prc,sz,this->tags,0);
+		base_info<InternalMemory> bi(&this->recv_buf,prc,sz,this->tags,0);
 		// Send and recv multiple messages
 		self_base::sendrecvMultipleMessagesNBX(send_req.size(),NULL,NULL,NULL,msg_alloc,&bi);
@@ -479,7 +488,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
 		self_base::tags.clear();
 		// receive information
-		base_info bi(NULL,prc,sz,self_base::tags,0);
+		base_info<InternalMemory> bi(NULL,prc,sz,self_base::tags,0);
 		// Send and recv multiple messages
 		self_base::sendrecvMultipleMessagesNBX(send_prc_.size(),(size_t *)sz.getPointer(),(size_t *)send_prc_.getPointer(),(void **)send_buf.getPointer(),msg_alloc,(void *)&bi,NONE);
@@ -544,7 +553,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
 		self_base::tags.clear();
 		// receive information
-		base_info bi(&this->recv_buf,prc,sz,this->tags,0);
+		base_info<InternalMemory> bi(&this->recv_buf,prc,sz,this->tags,0);
 		// Send and recv multiple messages
 		self_base::sendrecvMultipleMessagesNBX(prc.size(),(size_t *)sz_byte.getPointer(),(size_t *)prc.getPointer(),(void **)send_buf.getPointer(),msg_alloc,(void *)&bi);
@@ -565,7 +574,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
 		self_base::tags.clear();
 		// receive information
-		base_info bi(&this->recv_buf,prc,sz,this->tags,0);
+		base_info<InternalMemory> bi(&this->recv_buf,prc,sz,this->tags,0);
 		// Send and recv multiple messages
 		self_base::sendrecvMultipleMessagesNBX(send_req.size(),NULL,NULL,NULL,msg_alloc,&bi);
@@ -630,7 +639,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
 		// we sort based on processor
 		rcv.sort();
-		openfpm::vector<BMemory<HeapMemory>> recv_ord;
+		openfpm::vector<BMemory<InternalMemory>> recv_ord;
 		recv_ord.resize(rcv.size());
 		openfpm::vector<size_t> prc_ord;
@@ -747,7 +756,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
 		op_ssend_recv_add<void> opa;
 		// process the received information
-		process_receive_buffer_with_prp<op_ssend_recv_add<void>,T,S,layout_base,prp...>(recv,&sz_recv,&sz_recv_byte,opa);
+		process_receive_buffer_with_prp<op_ssend_recv_add<void>,T,S,layout_base,prp...>(recv,&sz_recv,&sz_recv_byte,opa,opt);
 		return true;
 	}
@@ -851,7 +860,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
 		prepare_send_buffer<op,T,S,layout_base>(send,recv,prc_send,prc_recv,recv_sz,opt);
 		// process the received information
-		process_receive_buffer_with_prp<op,T,S,layout_base,prp...>(recv,NULL,NULL,op_param);
+		process_receive_buffer_with_prp<op,T,S,layout_base,prp...>(recv,NULL,NULL,op_param,opt);
 		return true;
 	}
@@ -862,7 +871,8 @@ class Vcluster: public Vcluster_base<InternalMemory>
 // Function to initialize the global VCluster //
-extern Vcluster<> * global_v_cluster_private;
+extern Vcluster<> * global_v_cluster_private_heap;
+extern Vcluster<CudaMemory> * global_v_cluster_private_cuda;
 /*! \brief Initialize a global instance of Runtime Virtual Cluster Machine
  *
@@ -872,25 +882,44 @@ extern Vcluster<> * global_v_cluster_private;
 static inline void init_global_v_cluster_private(int *argc, char ***argv)
 {
-	if (global_v_cluster_private == NULL)
-	{global_v_cluster_private = new Vcluster<>(argc,argv);}
+	if (global_v_cluster_private_heap == NULL)
+	{global_v_cluster_private_heap = new Vcluster<>(argc,argv);}
+	if (global_v_cluster_private_cuda == NULL)
+	{global_v_cluster_private_cuda = new Vcluster<CudaMemory>(argc,argv);}
 }
 static inline void delete_global_v_cluster_private()
 {
-	delete global_v_cluster_private;
+	delete global_v_cluster_private_heap;
+	delete global_v_cluster_private_cuda;
 }
-static inline Vcluster<> & create_vcluster()
-{
-#ifdef SE_CLASS1
-	if (global_v_cluster_private == NULL)
-		std::cerr << __FILE__ << ":" << __LINE__ << " Error you must call openfpm_init before using any distributed data structures";
-#endif
-	return *global_v_cluster_private;
+template<typename Memory>
+struct get_vcl
+{
+	static Vcluster<Memory> & get()
+	{
+		return *global_v_cluster_private_heap;
+	}
+};
+template<>
+struct get_vcl<CudaMemory>
+{
+	static Vcluster<CudaMemory> & get()
+	{
+		return *global_v_cluster_private_cuda;
+	}
+};
+template<typename Memory = HeapMemory>
+static inline Vcluster<Memory> & create_vcluster()
+{
+	if (global_v_cluster_private_heap == NULL)
+	{std::cerr << __FILE__ << ":" << __LINE__ << " Error you must call openfpm_init before using any distributed data structures";}
+	return get_vcl<Memory>::get();
 }
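With the dispatch above, create_vcluster keeps its old spelling for heap-backed code while GPU code can request the CUDA-backed instance; note that the primary get_vcl template returns the heap instance for any Memory other than CudaMemory. A short usage sketch (illustrative, not part of the diff):

	// default: heap-backed global VCluster, existing call sites are unchanged
	Vcluster<> & v_cl = create_vcluster();

	// CUDA-backed global VCluster, required for MPI_GPU_DIRECT transfers
	Vcluster<CudaMemory> & v_cl_gpu = create_vcluster<CudaMemory>();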
...
@@ -353,6 +353,12 @@ public:
 	 */
 	mgpu::standard_context_t & getmgpuContext()
 	{
+		if (context == NULL)
+		{
+			std::cout << __FILE__ << ":" << __LINE__ << " error: it seems that the modern gpu context has not been initialized."
+			             " Either a compatible working cuda device has not been found, or openfpm_init has been called in a file that has not been compiled with NVCC" << std::endl;
+		}
+
 		return *context;
 	}
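For context, a hedged sketch of the kind of call that needs this context. It assumes a VCluster reference v_cl, a device array keys_device of n_keys ints, and the moderngpu 2.x API (mgpu::mergesort, mgpu::less_t) that standard_context_t belongs to; none of this is part of the diff.

	// hypothetical use: sort a device array with a moderngpu primitive
	mgpu::standard_context_t & ctx = v_cl.getmgpuContext();   // warns (as above) if no context exists
	mgpu::mergesort(keys_device, n_keys, mgpu::less_t<int>(), ctx);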
...
@@ -11,13 +11,13 @@
 #include "memory/BHeapMemory.hpp"
 #include "Packer_Unpacker/has_max_prop.hpp"
-template<bool result, typename T, typename S, template<typename> class layout_base>
+template<bool result, typename T, typename S, template<typename> class layout_base, typename Memory>
 struct unpack_selector_with_prp
 {
 	template<typename op,
 	         int ... prp>
 	static void call_unpack(S & recv,
-	                        openfpm::vector<BMemory<HeapMemory>> & recv_buf,
+	                        openfpm::vector<BMemory<Memory>> & recv_buf,
 	                        openfpm::vector<size_t> * sz,
 	                        openfpm::vector<size_t> * sz_byte,
 	                        op & op_param,
@@ -134,14 +134,14 @@ struct unpack_each_prop_buffer
  *
  */
-template<typename sT, template<typename> class layout_base>
+template<typename sT, template<typename> class layout_base, typename Memory>
 struct process_receive_mem_traits_inte
 {
 	//! set of pointers
 	size_t i;
 	//! Receive buffer
-	openfpm::vector<BMemory<HeapMemory>> & recv_buf;
+	openfpm::vector<BMemory<Memory>> & recv_buf;
 	//! Fake vector that map over received memory
 	openfpm::vector<typename sT::value_type,PtrMemory,typename layout_base<typename sT::value_type>::type,layout_base,openfpm::grow_policy_identity> & v2;
@@ -157,7 +157,7 @@ struct process_receive_mem_traits_inte
  *
  */
 	inline process_receive_mem_traits_inte(openfpm::vector<typename sT::value_type,PtrMemory,typename layout_base<typename sT::value_type>::type,layout_base,openfpm::grow_policy_identity> & v2,
-	                                       openfpm::vector<BMemory<HeapMemory>> & recv_buf,
+	                                       openfpm::vector<BMemory<Memory>> & recv_buf,
 	                                       size_t i,
 	                                       size_t opt)
 	:i(i),recv_buf(recv_buf),v2(v2),opt(opt)
@@ -178,7 +178,7 @@ struct process_receive_mem_traits_inte
 	{
 #if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
 		// add the received particles to the vector
-		ptr1 = new PtrMemory(recv_buf.get(i).getDevicePointer(),recv_buf.get(i).size());
+		ptr1 = new PtrMemory(recv_buf.get(i).getDevicePointerNoCopy(),recv_buf.get(i).size());
 #else
 		// add the received particles to the vector
 		ptr1 = new PtrMemory(recv_buf.get(i).getPointer(),recv_buf.get(i).size());
@@ -196,11 +196,11 @@ struct process_receive_mem_traits_inte
 	}
 };
-template<bool inte_or_lin,typename T, typename S, template<typename> class layout_base>
+template<bool inte_or_lin,typename T, typename S, template<typename> class layout_base, typename Memory>
 struct unpack_selector_with_prp_lin
 {
 	template<typename op, unsigned int ... prp> static int call_unpack_impl(S & recv,
-	                                                                        openfpm::vector<BMemory<HeapMemory>> & recv_buf,
+	                                                                        openfpm::vector<BMemory<Memory>> & recv_buf,
 	                                                                        openfpm::vector<size_t> * sz,
 	                                                                        openfpm::vector<size_t> * sz_byte,
 	                                                                        op & op_param,
@@ -210,7 +210,7 @@ struct unpack_selector_with_prp_lin
 		// create vector representation to a piece of memory already allocated
 		openfpm::vector<typename T::value_type,PtrMemory,typename layout_base<typename T::value_type>::type,layout_base,openfpm::grow_policy_identity> v2;
-		process_receive_mem_traits_inte<T,layout_base> prmti(v2,recv_buf,i,opt);
+		process_receive_mem_traits_inte<T,layout_base,Memory> prmti(v2,recv_buf,i,opt);
 		boost::mpl::for_each_ref<boost::mpl::range_c<int,0,T::value_type::max_prop>>(prmti);
@@ -233,11 +233,11 @@ struct unpack_selector_with_prp_lin
 	}
 };
-template<typename T, typename S, template<typename> class layout_base>
-struct unpack_selector_with_prp_lin<true,T,S,layout_base>
+template<typename T, typename S, template<typename> class layout_base, typename Memory>
+struct unpack_selector_with_prp_lin<true,T,S,layout_base,Memory>
 {
 	template<typename op, unsigned int ... prp> static int call_unpack_impl(S & recv,
-	                                                                        openfpm::vector<BMemory<HeapMemory>> & recv_buf,
+	                                                                        openfpm::vector<BMemory<Memory>> & recv_buf,
 	                                                                        openfpm::vector<size_t> * sz,
 	                                                                        openfpm::vector<size_t> * sz_byte,
 	                                                                        op & op_param,
@@ -278,11 +278,11 @@ struct unpack_selector_with_prp_lin<true,T,S,layout_base>
 typedef aggregate<int,int> dummy_type;
 //
-template<typename T, typename S, template<typename> class layout_base>
-struct unpack_selector_with_prp<true,T,S,layout_base>
+template<typename T, typename S, template<typename> class layout_base, typename Memory>
+struct unpack_selector_with_prp<true,T,S,layout_base,Memory>
 {
 	template<typename op, unsigned int ... prp> static void call_unpack(S & recv,
-	                                                                    openfpm::vector<BMemory<HeapMemory>> & recv_buf,
+	                                                                    openfpm::vector<BMemory<Memory>> & recv_buf,
 	                                                                    openfpm::vector<size_t> * sz,
 	                                                                    openfpm::vector<size_t> * sz_byte,
 	                                                                    op & op_param,
@@ -293,7 +293,7 @@ struct unpack_selector_with_prp<true,T,S,layout_base>
 		for (size_t i = 0 ; i < recv_buf.size() ; )
 		{
-			i += unpack_selector_with_prp_lin<is_layout_mlin<layout_base<dummy_type>>::value,T,S,layout_base>::template call_unpack_impl<op,prp...>(recv,recv_buf,sz,sz_byte,op_param,i,opt);
+			i += unpack_selector_with_prp_lin<is_layout_mlin<layout_base<dummy_type>>::value,T,S,layout_base,Memory>::template call_unpack_impl<op,prp...>(recv,recv_buf,sz,sz_byte,op_param,i,opt);
 		}
 	}
 };
@@ -315,9 +315,9 @@ struct call_serialize_variadic<index_tuple<prp...>>
 		Packer<T,HeapMemory>::template pack<prp...>(mem,send,sts);
 	}
-	template<typename op, typename T, typename S, template<typename> class layout_base>
+	template<typename op, typename T, typename S, template<typename> class layout_base, typename Memory>
 	inline static void call_unpack(S & recv,
-	                               openfpm::vector<BMemory<HeapMemory>> & recv_buf,
+	                               openfpm::vector<BMemory<Memory>> & recv_buf,
 	                               openfpm::vector<size_t> * sz,
 	                               openfpm::vector<size_t> * sz_byte,
 	                               op & op_param,
@@ -325,7 +325,7 @@ struct call_serialize_variadic<index_tuple<prp...>>
 	{
 		const bool result = has_pack_gen<typename T::value_type>::value == false && is_vector<T>::value == true;
-		unpack_selector_with_prp<result, T, S,layout_base>::template call_unpack<op,prp...>(recv, recv_buf, sz, sz_byte, op_param,opt);
+		unpack_selector_with_prp<result, T, S,layout_base,Memory>::template call_unpack<op,prp...>(recv, recv_buf, sz, sz_byte, op_param,opt);
 	}
 };
@@ -505,8 +505,9 @@ struct pack_unpack_cond_with_prp
 		}
 	}
+	template<typename Memory>
 	static void unpacking(S & recv,
-	                      openfpm::vector<BMemory<HeapMemory>> & recv_buf,
+	                      openfpm::vector<BMemory<Memory>> & recv_buf,
 	                      openfpm::vector<size_t> * sz,
 	                      openfpm::vector<size_t> * sz_byte,
 	                      op & op_param,
@@ -673,7 +674,7 @@ struct op_ssend_recv_merge
 	         typename S,
 	         template <typename> class layout_base,
 	         int ... prp>
-	void execute(D & recv,S & v2,size_t i)
+	void execute(D & recv,S & v2,size_t i,size_t opt)
 	{
 		op_ssend_recv_merge_impl<sr,op>::template execute<T,D,S,layout_base,prp...>(recv,v2,i,opart);
 	}
@@ -739,7 +740,7 @@ struct op_ssend_gg_recv_merge
 	{}
 	//! execute the merge
-	template<bool sr, typename T, typename D, typename S, template<typename> class layout_base, int ... prp> void execute(D & recv,S & v2,size_t i)
+	template<bool sr, typename T, typename D, typename S, template<typename> class layout_base, int ... prp> void execute(D & recv,S & v2,size_t i,size_t opt)
 	{
 		op_ssend_gg_recv_merge_impl<sr>::template execute<T,D,S,layout_base,prp...>(recv,v2,i,start);
 	}
...
@@ -1633,12 +1633,6 @@ BOOST_AUTO_TEST_CASE (Vcluster_semantic_sendrecv_6)
 }
-BOOST_AUTO_TEST_CASE( Vcluster_semantic_ssend_recv_layout_switch )
-{
-	test_ssend_recv_layout_switch(0);
-}
 BOOST_AUTO_TEST_SUITE_END()
 #endif /* OPENFPM_VCLUSTER_SRC_VCLUSTER_SEMANTIC_UNIT_TESTS_HPP_ */
@@ -4,125 +4,18 @@
 #include "VCluster/VCluster.hpp"
 #include "VCluster/cuda/VCluster_semantic_unit_tests_funcs.hpp"
-void test_ssend_recv_layout_switch(size_t opt)
-{
-	auto & v_cl = create_vcluster();
-	if (v_cl.size() > 10) {return;}
-	openfpm::vector<openfpm::vector_gpu_single<aggregate<float,float[3]>>> vd;
-	openfpm::vector_gpu<aggregate<float,float[3]>> collect;
-	openfpm::vector_gpu<aggregate<float,float[3]>> collect2;
-	openfpm::vector<size_t> prc_send;
-	openfpm::vector<size_t> prc_recv;
-	openfpm::vector<size_t> sz_recv;
-	vd.resize(v_cl.size());
-	for (size_t i = 0 ; i < vd.size() ; i++)
-	{
-		vd.get(i).resize(100);
-		for (size_t j = 0 ; j < vd.get(i).size() ; j++)
-		{
-			vd.get(i).template get<0>(j) = 10000*i + v_cl.rank()*100 + j;
-			vd.get(i).template get<1>(j)[0] = 400000 + 10000*i + v_cl.rank()*100 + j;
-			vd.get(i).template get<1>(j)[1] = 400000 + 10000*i + v_cl.rank()*100 + j;
-			vd.get(i).template get<1>(j)[2] = 400000 + 10000*i + v_cl.rank()*100 + j;
-		}
-		prc_send.add(i);
-		if (opt & MPI_GPU_DIRECT)
-		{
-			vd.get(i).template hostToDevice<0,1>();
-			// Reset host
-			for (size_t j = 0 ; j < vd.get(i).size() ; j++)
-			{
-				vd.get(i).template get<0>(j) = 0.0;
-				vd.get(i).template get<1>(j)[0] = 0.0;
-				vd.get(i).template get<1>(j)[1] = 0.0;
-				vd.get(i).template get<1>(j)[2] = 0.0;
-			}
-		}
-	}
-	v_cl.SSendRecv<openfpm::vector_gpu_single<aggregate<float,float[3]>>,decltype(collect),memory_traits_inte>
-	              (vd,collect,prc_send, prc_recv,sz_recv,opt);
-	v_cl.SSendRecvP<openfpm::vector_gpu_single<aggregate<float,float[3]>>,decltype(collect),memory_traits_inte,0,1>
-	              (vd,collect2,prc_send, prc_recv,sz_recv,opt);
-	// collect must have 100 * v_cl.size()
-	BOOST_REQUIRE_EQUAL(collect.size(),100*v_cl.size());
-	BOOST_REQUIRE_EQUAL(collect2.size(),100*v_cl.size());
-	// we reset the host collected data if data must be on device
-	if (opt & MPI_GPU_DIRECT)
-	{
-		for (size_t j = 0 ; j < collect.size() ; j++)
-		{
-			collect.template get<0>(j) = 0.0;
-			collect.template get<1>(j)[0] = 0.0;
-			collect.template get<1>(j)[1] = 0.0;
-			collect.template get<1>(j)[2] = 0.0;
-			collect2.template get<0>(j) = 0.0;
-			collect2.template get<1>(j)[0] = 0.0;
-			collect2.template get<1>(j)[1] = 0.0;
-			collect2.template get<1>(j)[2] = 0.0;
-		}
-	}
-	// from device to host
-	if (opt & MPI_GPU_DIRECT)
-	{
-		collect.template deviceToHost<0,1>();
-		collect2.template deviceToHost<0,1>();
-	}
-	// now we check what we received
-	// check what we received
-	bool match = true;
-	for (size_t i = 0 ; i < v_cl.size() ; i++)
-	{
-		for (size_t j = 0 ; j < 100 ; j++)
-		{
-			match &= collect.template get<0>(i*100 +j) == v_cl.rank()*10000 + i*100 + j;
-			match &= collect.template get<1>(i*100 +j)[0] == 400000 + v_cl.rank()*10000 + i*100 + j;
-			match &= collect.template get<1>(i*100 +j)[1] == 400000 + v_cl.rank()*10000 + i*100 + j;
-			match &= collect.template get<1>(i*100 +j)[2] == 400000 + v_cl.rank()*10000 + i*100 + j;
-			match &= collect2.template get<0>(i*100 +j) == v_cl.rank()*10000 + i*100 + j;
-			match &= collect2.template get<1>(i*100 +j)[0] == 400000 + v_cl.rank()*10000 + i*100 + j;
-			match &= collect2.template get<1>(i*100 +j)[1] == 400000 + v_cl.rank()*10000 + i*100 + j;
-			match &= collect2.template get<1>(i*100 +j)[2] == 400000 + v_cl.rank()*10000 + i*100 + j;
-		}
-		if (match == false){break;}
-	}
-	BOOST_REQUIRE_EQUAL(match,true);
-}
 BOOST_AUTO_TEST_SUITE( VCluster_cuda_tests )
+BOOST_AUTO_TEST_CASE( Vcluster_semantic_ssend_recv_layout_switch )
+{
+	test_ssend_recv_layout_switch<HeapMemory>(0);
+}
 BOOST_AUTO_TEST_CASE( Vcluster_semantic_gpu_direct )
 {
-	test_ssend_recv_layout_switch(MPI_GPU_DIRECT);
+	test_ssend_recv_layout_switch<CudaMemory>(MPI_GPU_DIRECT);
 }
 BOOST_AUTO_TEST_SUITE_END()
@@ -8,7 +8,121 @@
 #ifndef VCLUSTER_SEMANTIC_UNIT_TESTS_FUNCS_HPP_
 #define VCLUSTER_SEMANTIC_UNIT_TESTS_FUNCS_HPP_
-void test_ssend_recv_layout_switch(size_t opt);
+template<typename Memory>
+void test_ssend_recv_layout_switch(size_t opt)