openfpm / openfpm_vcluster

Commit 11a0f824, authored Aug 24, 2018 by incardon

Changes to make it work with GPU Direct

Parent: ec95a342

Showing 7 changed files with 203 additions and 165 deletions (+203 / -165)
Changed files (7):
  src/VCluster/VCluster.cpp                                   +2    -1
  src/VCluster/VCluster.hpp                                  +52   -23
  src/VCluster/VCluster_base.hpp                              +6    -0
  src/VCluster/VCluster_meta_function.hpp                    +23   -22
  src/VCluster/VCluster_semantic_unit_tests.hpp               +0    -6
  src/VCluster/cuda/VCluster_semantic_unit_cuda_tests.cu      +5  -112
  src/VCluster/cuda/VCluster_semantic_unit_tests_funcs.hpp  +115    -1
src/VCluster/VCluster.cpp

@@ -8,7 +8,8 @@
 #include "util/print_stack.hpp"
 #include "util/math_util_complex.hpp"
 
-Vcluster<> * global_v_cluster_private = NULL;
+Vcluster<> * global_v_cluster_private_heap = NULL;
+Vcluster<CudaMemory> * global_v_cluster_private_cuda = NULL;
 
 // std::vector<int> sieve_spf;
src/VCluster/VCluster.hpp

@@ -55,6 +55,14 @@ class Vcluster: public Vcluster_base<InternalMemory>
 	inline static void process_recv(Vcluster & vcl, S & recv, openfpm::vector<size_t> * sz_recv,
 	                                openfpm::vector<size_t> * sz_recv_byte, op & op_param, size_t opt)
 	{
+		if (opt == MPI_GPU_DIRECT && !std::is_same<InternalMemory,CudaMemory>::value)
+		{
+			// In order to have this option activated InternalMemory must be CudaMemory
+			std::cout << __FILE__ << ":" << __LINE__ << " error: in order to have MPI_GPU_DIRECT VCluster must use CudaMemory internally, the most probable"
+			          << " cause of this problem is that you are using MPI_GPU_DIRECT option with a non-GPU data-structure" << std::endl;
+		}
+
 		vcl.process_receive_buffer_with_prp<op,T,S,layout_base,prp...>(recv,sz_recv,sz_recv_byte,op_param,opt);
 	}
 };
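The new guard only reports misuse; it does not change the transfer path. As a hedged illustration (not part of this commit) of the call the check protects, MPI_GPU_DIRECT is meant to be used with a CudaMemory-backed cluster, mirroring the unit test added further down; create_vcluster<CudaMemory>() is the templated factory introduced later in this file:

	// Sketch only: assumes the templated create_vcluster<Memory>() added later in this
	// file and the SSendRecv call used by the CUDA unit test below.
	#include "VCluster/VCluster.hpp"

	void gpu_direct_send_sketch()
	{
		// MPI_GPU_DIRECT is only meaningful when the cluster is backed by CudaMemory;
		// with the default HeapMemory-backed VCluster the check above prints the error.
		auto & v_cl = create_vcluster<CudaMemory>();

		openfpm::vector<openfpm::vector_gpu_single<aggregate<float,float[3]>>> vd;
		openfpm::vector_gpu<aggregate<float,float[3]>> collect;
		openfpm::vector<size_t> prc_send, prc_recv, sz_recv;

		// ... fill vd and prc_send, then move the data to the device with hostToDevice<0,1>() ...

		v_cl.template SSendRecv<openfpm::vector_gpu_single<aggregate<float,float[3]>>,
		                        decltype(collect),memory_traits_inte>
		                        (vd,collect,prc_send,prc_recv,sz_recv,MPI_GPU_DIRECT);
	}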
@@ -131,7 +139,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
 		self_base::tags.clear();
 
 		// receive information
-		base_info bi(&this->recv_buf,prc_recv,sz_recv_byte,this->tags,opt);
+		base_info<InternalMemory> bi(&this->recv_buf,prc_recv,sz_recv_byte,this->tags,opt);
 
 		// Send and recv multiple messages
 		if (opt & RECEIVE_KNOWN)

@@ -191,10 +199,11 @@ class Vcluster: public Vcluster_base<InternalMemory>
 	 * \param size of the received data
 	 *
 	 */
+	template<typename Memory>
 	struct base_info
 	{
 		//! Receive buffer
-		openfpm::vector<BMemory<HeapMemory>> * recv_buf;
+		openfpm::vector<BMemory<Memory>> * recv_buf;
 
 		//! receiving processor list
 		openfpm::vector<size_t> & prc;
 
 		//! size of each message

@@ -206,7 +215,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
 		size_t opt;
 
 		//! constructor
-		base_info(openfpm::vector<BMemory<HeapMemory>> * recv_buf, openfpm::vector<size_t> & prc, openfpm::vector<size_t> & sz, openfpm::vector<size_t> & tags, size_t opt)
+		base_info(openfpm::vector<BMemory<Memory>> * recv_buf, openfpm::vector<size_t> & prc, openfpm::vector<size_t> & sz, openfpm::vector<size_t> & tags, size_t opt)
 		:recv_buf(recv_buf),prc(prc),sz(sz),tags(tags),opt(opt)
 		{}
 	};

@@ -226,7 +235,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
 	 */
 	static void * msg_alloc(size_t msg_i, size_t total_msg, size_t total_p, size_t i, size_t ri, size_t tag, void * ptr)
 	{
-		base_info & rinfo = *(base_info *)ptr;
+		base_info<InternalMemory> & rinfo = *(base_info<InternalMemory> *)ptr;
 
 		if (rinfo.recv_buf == NULL)
 		{

@@ -249,7 +258,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
 		if (rinfo.opt & MPI_GPU_DIRECT)
 		{
 #if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
-			return rinfo.recv_buf->last().getDevicePointer();
+			return rinfo.recv_buf->last().getDevicePointerNoCopy();
 #else
 			return rinfo.recv_buf->last().getPointer();
 #endif

@@ -274,7 +283,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
 	 */
 	static void * msg_alloc_known(size_t msg_i, size_t total_msg, size_t total_p, size_t i, size_t ri, size_t tag, void * ptr)
 	{
-		base_info & rinfo = *(base_info *)ptr;
+		base_info<InternalMemory> & rinfo = *(base_info<InternalMemory> *)ptr;
 
 		if (rinfo.recv_buf == NULL)
 		{

@@ -425,7 +434,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
 		self_base::tags.clear();
 
 		// receive information
-		base_info bi(&this->recv_buf,prc,sz,this->tags,0);
+		base_info<InternalMemory> bi(&this->recv_buf,prc,sz,this->tags,0);
 
 		// Send and recv multiple messages
 		self_base::sendrecvMultipleMessagesNBX(send_req.size(),NULL,NULL,NULL,msg_alloc,&bi);

@@ -479,7 +488,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
 		self_base::tags.clear();
 
 		// receive information
-		base_info bi(NULL,prc,sz,self_base::tags,0);
+		base_info<InternalMemory> bi(NULL,prc,sz,self_base::tags,0);
 
 		// Send and recv multiple messages
 		self_base::sendrecvMultipleMessagesNBX(send_prc_.size(),(size_t *)sz.getPointer(),(size_t *)send_prc_.getPointer(),(void **)send_buf.getPointer(),msg_alloc,(void *)&bi,NONE);

@@ -544,7 +553,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
 		self_base::tags.clear();
 
 		// receive information
-		base_info bi(&this->recv_buf,prc,sz,this->tags,0);
+		base_info<InternalMemory> bi(&this->recv_buf,prc,sz,this->tags,0);
 
 		// Send and recv multiple messages
 		self_base::sendrecvMultipleMessagesNBX(prc.size(),(size_t *)sz_byte.getPointer(),(size_t *)prc.getPointer(),(void **)send_buf.getPointer(),msg_alloc,(void *)&bi);

@@ -565,7 +574,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
 		self_base::tags.clear();
 
 		// receive information
-		base_info bi(&this->recv_buf,prc,sz,this->tags,0);
+		base_info<InternalMemory> bi(&this->recv_buf,prc,sz,this->tags,0);
 
 		// Send and recv multiple messages
 		self_base::sendrecvMultipleMessagesNBX(send_req.size(),NULL,NULL,NULL,msg_alloc,&bi);

@@ -630,7 +639,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
 		// we sort based on processor
 		rcv.sort();
 
-		openfpm::vector<BMemory<HeapMemory>> recv_ord;
+		openfpm::vector<BMemory<InternalMemory>> recv_ord;
 		recv_ord.resize(rcv.size());
 
 		openfpm::vector<size_t> prc_ord;

@@ -747,7 +756,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
 		op_ssend_recv_add<void> opa;
 
 		// process the received information
-		process_receive_buffer_with_prp<op_ssend_recv_add<void>,T,S,layout_base,prp...>(recv,&sz_recv,&sz_recv_byte,opa);
+		process_receive_buffer_with_prp<op_ssend_recv_add<void>,T,S,layout_base,prp...>(recv,&sz_recv,&sz_recv_byte,opa,opt);
 
 		return true;
 	}

@@ -851,7 +860,7 @@ class Vcluster: public Vcluster_base<InternalMemory>
 		prepare_send_buffer<op,T,S,layout_base>(send,recv,prc_send,prc_recv,recv_sz,opt);
 
 		// process the received information
-		process_receive_buffer_with_prp<op,T,S,layout_base,prp...>(recv,NULL,NULL,op_param);
+		process_receive_buffer_with_prp<op,T,S,layout_base,prp...>(recv,NULL,NULL,op_param,opt);
 
 		return true;
 	}

@@ -862,7 +871,8 @@ class Vcluster: public Vcluster_base<InternalMemory>
 // Function to initialize the global VCluster //
 
-extern Vcluster<> * global_v_cluster_private;
+extern Vcluster<> * global_v_cluster_private_heap;
+extern Vcluster<CudaMemory> * global_v_cluster_private_cuda;
 
 /*! \brief Initialize a global instance of Runtime Virtual Cluster Machine
  *

@@ -872,25 +882,44 @@ extern Vcluster<> * global_v_cluster_private;
 static inline void init_global_v_cluster_private(int *argc, char ***argv)
 {
-	if (global_v_cluster_private == NULL)
-	{global_v_cluster_private = new Vcluster<>(argc,argv);}
+	if (global_v_cluster_private_heap == NULL)
+	{global_v_cluster_private_heap = new Vcluster<>(argc,argv);}
+
+	if (global_v_cluster_private_cuda == NULL)
+	{global_v_cluster_private_cuda = new Vcluster<CudaMemory>(argc,argv);}
 }
 
 static inline void delete_global_v_cluster_private()
 {
-	delete global_v_cluster_private;
+	delete global_v_cluster_private_heap;
+	delete global_v_cluster_private_cuda;
 }
 
-static inline Vcluster<> & create_vcluster()
+template<typename Memory>
+struct get_vcl
 {
-#ifdef SE_CLASS1
-	if (global_v_cluster_private == NULL)
-		std::cerr << __FILE__ << ":" << __LINE__ << " Error you must call openfpm_init before using any distributed data structures";
-#endif
+	static Vcluster<Memory> & get()
+	{
+		return *global_v_cluster_private_heap;
+	}
+};
+
+template<>
+struct get_vcl<CudaMemory>
+{
+	static Vcluster<CudaMemory> & get()
+	{
+		return *global_v_cluster_private_cuda;
+	}
+};
 
-	return *global_v_cluster_private;
+template<typename Memory = HeapMemory>
+static inline Vcluster<Memory> & create_vcluster()
+{
+#ifdef SE_CLASS1
+	if (global_v_cluster_private_heap == NULL)
+	{std::cerr << __FILE__ << ":" << __LINE__ << " Error you must call openfpm_init before using any distributed data structures";}
+#endif
+
+	return get_vcl<Memory>::get();
 }
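With the changes above, create_vcluster() becomes a template with HeapMemory as default, dispatching through get_vcl to one of the two global instances. A minimal usage sketch (not from the commit; it assumes openfpm_init/openfpm_finalize wrap init_global_v_cluster_private() and delete_global_v_cluster_private(), as the error message above suggests):

	#include <iostream>
	#include "VCluster/VCluster.hpp"

	int main(int argc, char* argv[])
	{
		openfpm_init(&argc,&argv);                 // assumed to call init_global_v_cluster_private()

		// Existing code keeps working: the default template argument selects the heap-backed instance.
		Vcluster<> & v_cl = create_vcluster();

		// GPU-aware code asks explicitly for the CudaMemory-backed instance (used with MPI_GPU_DIRECT).
		Vcluster<CudaMemory> & v_cl_gpu = create_vcluster<CudaMemory>();

		std::cout << "rank " << v_cl.rank() << " of " << v_cl.size() << std::endl;

		openfpm_finalize();                        // assumed to call delete_global_v_cluster_private()
		return 0;
	}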
src/VCluster/VCluster_base.hpp

@@ -353,6 +353,12 @@ public:
 	 */
 	mgpu::standard_context_t & getmgpuContext()
 	{
+		if (context == NULL)
+		{
+			std::cout << __FILE__ << ":" << __LINE__ << " error: it seem that modern gpu context is not initialized."
+			             "Either a compatible working cuda device has not been found, either openfpm_init has been called in a file that not compiled with NVCC" << std::endl;
+		}
+
 		return *context;
 	}
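A short sketch (not from the commit) of the call the new check protects; the accessor lives in Vcluster_base, so it is reachable from the CudaMemory-backed cluster returned by create_vcluster<CudaMemory>():

	// Sketch only: the diagnostic above fires when 'context' is still NULL, e.g. when no
	// working CUDA device was found or openfpm_init ran in a translation unit not compiled
	// with NVCC; otherwise the moderngpu context is returned by reference.
	auto & v_cl = create_vcluster<CudaMemory>();
	mgpu::standard_context_t & ctx = v_cl.getmgpuContext();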
src/VCluster/VCluster_meta_function.hpp

@@ -11,13 +11,13 @@
 #include "memory/BHeapMemory.hpp"
 #include "Packer_Unpacker/has_max_prop.hpp"
 
-template<bool result, typename T, typename S, template<typename> class layout_base>
+template<bool result, typename T, typename S, template<typename> class layout_base, typename Memory>
 struct unpack_selector_with_prp
 {
 	template<typename op, int ... prp>
-	static void call_unpack(S & recv, openfpm::vector<BMemory<HeapMemory>> & recv_buf, openfpm::vector<size_t> * sz, openfpm::vector<size_t> * sz_byte, op & op_param,
+	static void call_unpack(S & recv, openfpm::vector<BMemory<Memory>> & recv_buf, openfpm::vector<size_t> * sz, openfpm::vector<size_t> * sz_byte, op & op_param,

@@ -134,14 +134,14 @@ struct unpack_each_prop_buffer
 *
 */
-template<typename sT, template<typename> class layout_base>
+template<typename sT, template<typename> class layout_base, typename Memory>
 struct process_receive_mem_traits_inte
 {
 	//! set of pointers
 	size_t i;
 
 	//! Receive buffer
-	openfpm::vector<BMemory<HeapMemory>> & recv_buf;
+	openfpm::vector<BMemory<Memory>> & recv_buf;
 
 	//! Fake vector that map over received memory
 	openfpm::vector<typename sT::value_type,PtrMemory,typename layout_base<typename sT::value_type>::type,layout_base,openfpm::grow_policy_identity> & v2;

@@ -157,7 +157,7 @@ struct process_receive_mem_traits_inte
 	 *
 	 */
 	inline process_receive_mem_traits_inte(openfpm::vector<typename sT::value_type,PtrMemory,typename layout_base<typename sT::value_type>::type,layout_base,openfpm::grow_policy_identity> & v2,
-	                                       openfpm::vector<BMemory<HeapMemory>> & recv_buf,
+	                                       openfpm::vector<BMemory<Memory>> & recv_buf,
 	                                       size_t i,
 	                                       size_t opt)
 	:i(i),recv_buf(recv_buf),v2(v2),opt(opt)

@@ -178,7 +178,7 @@ struct process_receive_mem_traits_inte
 		{
 #if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
 			// add the received particles to the vector
-			ptr1 = new PtrMemory(recv_buf.get(i).getDevicePointer(),recv_buf.get(i).size());
+			ptr1 = new PtrMemory(recv_buf.get(i).getDevicePointerNoCopy(),recv_buf.get(i).size());
 #else
 			// add the received particles to the vector
 			ptr1 = new PtrMemory(recv_buf.get(i).getPointer(),recv_buf.get(i).size());

@@ -196,11 +196,11 @@ struct process_receive_mem_traits_inte
 	}
 };
 
-template<bool inte_or_lin,typename T, typename S, template<typename> class layout_base>
+template<bool inte_or_lin,typename T, typename S, template<typename> class layout_base, typename Memory>
 struct unpack_selector_with_prp_lin
 {
 	template<typename op, unsigned int ... prp>
-	static int call_unpack_impl(S & recv, openfpm::vector<BMemory<HeapMemory>> & recv_buf, openfpm::vector<size_t> * sz, openfpm::vector<size_t> * sz_byte, op & op_param,
+	static int call_unpack_impl(S & recv, openfpm::vector<BMemory<Memory>> & recv_buf, openfpm::vector<size_t> * sz, openfpm::vector<size_t> * sz_byte, op & op_param,

@@ -210,7 +210,7 @@ struct unpack_selector_with_prp_lin
 		// create vector representation to a piece of memory already allocated
 		openfpm::vector<typename T::value_type,PtrMemory,typename layout_base<typename T::value_type>::type,layout_base,openfpm::grow_policy_identity> v2;
 
-		process_receive_mem_traits_inte<T,layout_base> prmti(v2,recv_buf,i,opt);
+		process_receive_mem_traits_inte<T,layout_base,Memory> prmti(v2,recv_buf,i,opt);
 
 		boost::mpl::for_each_ref<boost::mpl::range_c<int,0,T::value_type::max_prop>>(prmti);

@@ -233,11 +233,11 @@ struct unpack_selector_with_prp_lin
 	}
 };
 
-template<typename T, typename S, template<typename> class layout_base>
-struct unpack_selector_with_prp_lin<true,T,S,layout_base>
+template<typename T, typename S, template<typename> class layout_base, typename Memory>
+struct unpack_selector_with_prp_lin<true,T,S,layout_base,Memory>
 {
 	template<typename op, unsigned int ... prp>
-	static int call_unpack_impl(S & recv, openfpm::vector<BMemory<HeapMemory>> & recv_buf, openfpm::vector<size_t> * sz, openfpm::vector<size_t> * sz_byte, op & op_param,
+	static int call_unpack_impl(S & recv, openfpm::vector<BMemory<Memory>> & recv_buf, openfpm::vector<size_t> * sz, openfpm::vector<size_t> * sz_byte, op & op_param,

@@ -278,11 +278,11 @@ struct unpack_selector_with_prp_lin<true,T,S,layout_base>
 typedef aggregate<int,int> dummy_type;
 
 //
-template<typename T, typename S, template<typename> class layout_base>
-struct unpack_selector_with_prp<true,T,S,layout_base>
+template<typename T, typename S, template<typename> class layout_base, typename Memory>
+struct unpack_selector_with_prp<true,T,S,layout_base,Memory>
 {
 	template<typename op, unsigned int ... prp>
-	static void call_unpack(S & recv, openfpm::vector<BMemory<HeapMemory>> & recv_buf, openfpm::vector<size_t> * sz, openfpm::vector<size_t> * sz_byte, op & op_param,
+	static void call_unpack(S & recv, openfpm::vector<BMemory<Memory>> & recv_buf, openfpm::vector<size_t> * sz, openfpm::vector<size_t> * sz_byte, op & op_param,

@@ -293,7 +293,7 @@ struct unpack_selector_with_prp<true,T,S,layout_base>
 		for (size_t i = 0 ; i < recv_buf.size() ; )
 		{
-			i += unpack_selector_with_prp_lin<is_layout_mlin<layout_base<dummy_type>>::value,T,S,layout_base>::template call_unpack_impl<op,prp...>(recv,recv_buf,sz,sz_byte,op_param,i,opt);
+			i += unpack_selector_with_prp_lin<is_layout_mlin<layout_base<dummy_type>>::value,T,S,layout_base,Memory>::template call_unpack_impl<op,prp...>(recv,recv_buf,sz,sz_byte,op_param,i,opt);
 		}
 	}
 };

@@ -315,9 +315,9 @@ struct call_serialize_variadic<index_tuple<prp...>>
 		Packer<T,HeapMemory>::template pack<prp...>(mem,send,sts);
 	}
 
-	template<typename op, typename T, typename S, template<typename> class layout_base>
-	inline static void call_unpack(S & recv, openfpm::vector<BMemory<HeapMemory>> & recv_buf, openfpm::vector<size_t> * sz, openfpm::vector<size_t> * sz_byte, op & op_param,
+	template<typename op, typename T, typename S, template<typename> class layout_base, typename Memory>
+	inline static void call_unpack(S & recv, openfpm::vector<BMemory<Memory>> & recv_buf, openfpm::vector<size_t> * sz, openfpm::vector<size_t> * sz_byte, op & op_param,

@@ -325,7 +325,7 @@ struct call_serialize_variadic<index_tuple<prp...>>
 	{
 		const bool result = has_pack_gen<typename T::value_type>::value == false && is_vector<T>::value == true;
 
-		unpack_selector_with_prp<result,T,S,layout_base>::template call_unpack<op,prp...>(recv,recv_buf,sz,sz_byte,op_param,opt);
+		unpack_selector_with_prp<result,T,S,layout_base,Memory>::template call_unpack<op,prp...>(recv,recv_buf,sz,sz_byte,op_param,opt);
 	}
 };

@@ -505,8 +505,9 @@ struct pack_unpack_cond_with_prp
 		}
 	}
 
-	static void unpacking(S & recv, openfpm::vector<BMemory<HeapMemory>> & recv_buf, openfpm::vector<size_t> * sz, openfpm::vector<size_t> * sz_byte, op & op_param,
+	template<typename Memory>
+	static void unpacking(S & recv, openfpm::vector<BMemory<Memory>> & recv_buf, openfpm::vector<size_t> * sz, openfpm::vector<size_t> * sz_byte, op & op_param,

@@ -673,7 +674,7 @@ struct op_ssend_recv_merge
 	         typename S,
 	         template <typename> class layout_base,
 	         int ... prp>
-	void execute(D & recv,S & v2,size_t i)
+	void execute(D & recv,S & v2,size_t i,size_t opt)
 	{
 		op_ssend_recv_merge_impl<sr,op>::template execute<T,D,S,layout_base,prp...>(recv,v2,i,opart);
 	}

@@ -739,7 +740,7 @@ struct op_ssend_gg_recv_merge
 	{}
 
 	//! execute the merge
-	template<bool sr, typename T, typename D, typename S, template<typename> class layout_base, int ... prp>
-	void execute(D & recv,S & v2,size_t i)
+	template<bool sr, typename T, typename D, typename S, template<typename> class layout_base, int ... prp>
+	void execute(D & recv,S & v2,size_t i,size_t opt)
 	{
 		op_ssend_gg_recv_merge_impl<sr>::template execute<T,D,S,layout_base,prp...>(recv,v2,i,start);
 	}
src/VCluster/VCluster_semantic_unit_tests.hpp

@@ -1633,12 +1633,6 @@ BOOST_AUTO_TEST_CASE (Vcluster_semantic_sendrecv_6)
 }
 
-BOOST_AUTO_TEST_CASE (Vcluster_semantic_ssend_recv_layout_switch)
-{
-	test_ssend_recv_layout_switch(0);
-}
-
 BOOST_AUTO_TEST_SUITE_END()
 
 #endif /* OPENFPM_VCLUSTER_SRC_VCLUSTER_SEMANTIC_UNIT_TESTS_HPP_ */
src/VCluster/cuda/VCluster_semantic_unit_cuda_tests.cu

@@ -4,125 +4,18 @@
 #include "VCluster/VCluster.hpp"
 #include "VCluster/cuda/VCluster_semantic_unit_tests_funcs.hpp"
 
-void test_ssend_recv_layout_switch(size_t opt)
-{
-	auto & v_cl = create_vcluster();
-
-	if (v_cl.size() > 10) {return;}
-
-	openfpm::vector<openfpm::vector_gpu_single<aggregate<float,float[3]>>> vd;
-	openfpm::vector_gpu<aggregate<float,float[3]>> collect;
-	openfpm::vector_gpu<aggregate<float,float[3]>> collect2;
-	openfpm::vector<size_t> prc_send;
-	openfpm::vector<size_t> prc_recv;
-	openfpm::vector<size_t> sz_recv;
-
-	vd.resize(v_cl.size());
-
-	for (size_t i = 0 ; i < vd.size() ; i++)
-	{
-		vd.get(i).resize(100);
-
-		for (size_t j = 0 ; j < vd.get(i).size() ; j++)
-		{
-			vd.get(i).template get<0>(j) = 10000*i + v_cl.rank()*100 + j;
-			vd.get(i).template get<1>(j)[0] = 400000 + 10000*i + v_cl.rank()*100 + j;
-			vd.get(i).template get<1>(j)[1] = 400000 + 10000*i + v_cl.rank()*100 + j;
-			vd.get(i).template get<1>(j)[2] = 400000 + 10000*i + v_cl.rank()*100 + j;
-		}
-
-		prc_send.add(i);
-
-		if (opt & MPI_GPU_DIRECT)
-		{
-			vd.get(i).template hostToDevice<0,1>();
-
-			// Reset host
-			for (size_t j = 0 ; j < vd.get(i).size() ; j++)
-			{
-				vd.get(i).template get<0>(j) = 0.0;
-				vd.get(i).template get<1>(j)[0] = 0.0;
-				vd.get(i).template get<1>(j)[1] = 0.0;
-				vd.get(i).template get<1>(j)[2] = 0.0;
-			}
-		}
-	}
-
-	v_cl.SSendRecv<openfpm::vector_gpu_single<aggregate<float,float[3]>>,decltype(collect),memory_traits_inte>(vd,collect,prc_send,prc_recv,sz_recv,opt);
-	v_cl.SSendRecvP<openfpm::vector_gpu_single<aggregate<float,float[3]>>,decltype(collect),memory_traits_inte,0,1>(vd,collect2,prc_send,prc_recv,sz_recv,opt);
-
-	// collect must have 100 * v_cl.size()
-	BOOST_REQUIRE_EQUAL(collect.size(),100*v_cl.size());
-	BOOST_REQUIRE_EQUAL(collect2.size(),100*v_cl.size());
-
-	// we reset the host collected data if data must be on device
-	if (opt & MPI_GPU_DIRECT)
-	{
-		for (size_t j = 0 ; j < collect.size() ; j++)
-		{
-			collect.template get<0>(j) = 0.0;
-			collect.template get<1>(j)[0] = 0.0;
-			collect.template get<1>(j)[1] = 0.0;
-			collect.template get<1>(j)[2] = 0.0;
-
-			collect2.template get<0>(j) = 0.0;
-			collect2.template get<1>(j)[0] = 0.0;
-			collect2.template get<1>(j)[1] = 0.0;
-			collect2.template get<1>(j)[2] = 0.0;
-		}
-	}
-
-	// from device to host
-	if (opt & MPI_GPU_DIRECT)
-	{
-		collect.template deviceToHost<0,1>();
-		collect2.template deviceToHost<0,1>();
-	}
-
-	// now we check what we received
-	bool match = true;
-	for (size_t i = 0 ; i < v_cl.size() ; i++)
-	{
-		for (size_t j = 0 ; j < 100 ; j++)
-		{
-			match &= collect.template get<0>(i*100 + j) == v_cl.rank()*10000 + i*100 + j;
-			match &= collect.template get<1>(i*100 + j)[0] == 400000 + v_cl.rank()*10000 + i*100 + j;
-			match &= collect.template get<1>(i*100 + j)[1] == 400000 + v_cl.rank()*10000 + i*100 + j;
-			match &= collect.template get<1>(i*100 + j)[2] == 400000 + v_cl.rank()*10000 + i*100 + j;
-
-			match &= collect2.template get<0>(i*100 + j) == v_cl.rank()*10000 + i*100 + j;
-			match &= collect2.template get<1>(i*100 + j)[0] == 400000 + v_cl.rank()*10000 + i*100 + j;
-			match &= collect2.template get<1>(i*100 + j)[1] == 400000 + v_cl.rank()*10000 + i*100 + j;
-			match &= collect2.template get<1>(i*100 + j)[2] == 400000 + v_cl.rank()*10000 + i*100 + j;
-		}
-
-		if (match == false) {break;}
-	}
-
-	BOOST_REQUIRE_EQUAL(match,true);
-}
-
 BOOST_AUTO_TEST_SUITE( VCluster_cuda_tests )
 
+BOOST_AUTO_TEST_CASE( Vcluster_semantic_ssend_recv_layout_switch )
+{
+	test_ssend_recv_layout_switch<HeapMemory>(0);
+}
+
 BOOST_AUTO_TEST_CASE( Vcluster_semantic_gpu_direct )
 {
-	test_ssend_recv_layout_switch(MPI_GPU_DIRECT);
+	test_ssend_recv_layout_switch<CudaMemory>(MPI_GPU_DIRECT);
 }
 
 BOOST_AUTO_TEST_SUITE_END()
src/VCluster/cuda/VCluster_semantic_unit_tests_funcs.hpp

@@ -8,7 +8,121 @@
 #ifndef VCLUSTER_SEMANTIC_UNIT_TESTS_FUNCS_HPP_
 #define VCLUSTER_SEMANTIC_UNIT_TESTS_FUNCS_HPP_
 
-void test_ssend_recv_layout_switch(size_t opt);
+template<typename Memory>
+void test_ssend_recv_layout_switch(size_t opt)
+{
+	auto & v_cl = create_vcluster<Memory>();
+
+	if (v_cl.size() > 10) {return;}
+
+	openfpm::vector<openfpm::vector_gpu_single<aggregate<float,float[3]>>> vd;
+	openfpm::vector_gpu<aggregate<float,float[3]>> collect;
+	openfpm::vector_gpu<aggregate<float,float[3]>> collect2;
+	openfpm::vector<size_t> prc_send;
+	openfpm::vector<size_t> prc_recv;
+	openfpm::vector<size_t> sz_recv;
+
+	vd.resize(v_cl.size());
+
+	for (size_t i = 0 ; i < vd.size() ; i++)
+	{
+		vd.get(i).resize(100);
+
+		for (size_t j = 0 ; j < vd.get(i).size() ; j++)
+		{
+			vd.get(i).template get<0>(j) = 10000*i + v_cl.rank()*100 + j;
+			vd.get(i).template get<1>(j)[0] = 400000 + 10000*i + v_cl.rank()*100 + j;
+			vd.get(i).template get<1>(j)[1] = 400000 + 10000*i + v_cl.rank()*100 + j;
+			vd.get(i).template get<1>(j)[2] = 400000 + 10000*i + v_cl.rank()*100 + j;
+		}
+
+		prc_send.add(i);
+
+		if (opt & MPI_GPU_DIRECT)
+		{
+			vd.get(i).template hostToDevice<0,1>();
+
+			// Reset host
+			for (size_t j = 0 ; j < vd.get(i).size() ; j++)
+			{
+				vd.get(i).template get<0>(j) = 0.0;
+				vd.get(i).template get<1>(j)[0] = 0.0;
+				vd.get(i).template get<1>(j)[1] = 0.0;
+				vd.get(i).template get<1>(j)[2] = 0.0;
+			}
+		}
+	}
+
+	v_cl.template SSendRecv<openfpm::vector_gpu_single<aggregate<float,float[3]>>,decltype(collect),memory_traits_inte>(vd,collect,prc_send,prc_recv,sz_recv,opt);
+	v_cl.template SSendRecvP<openfpm::vector_gpu_single<aggregate<float,float[3]>>,decltype(collect),memory_traits_inte,0,1>(vd,collect2,prc_send,prc_recv,sz_recv,opt);
+
+	// collect must have 100 * v_cl.size()
+	BOOST_REQUIRE_EQUAL(collect.size(),100*v_cl.size());
+	BOOST_REQUIRE_EQUAL(collect2.size(),100*v_cl.size());
+
+	// we reset the host collected data if data must be on device
+	if (opt & MPI_GPU_DIRECT)
+	{
+		for (size_t j = 0 ; j < collect.size() ; j++)
+		{
+			collect.template get<0>(j) = 0.0;
+			collect.template get<1>(j)[0] = 0.0;
+			collect.template get<1>(j)[1] = 0.0;
+			collect.template get<1>(j)[2] = 0.0;
+
+			collect2.template get<0>(j) = 0.0;
+			collect2.template get<1>(j)[0] = 0.0;
+			collect2.template get<1>(j)[1] = 0.0;
+			collect2.template get<1>(j)[2] = 0.0;
+		}
+	}
+
+	// from device to host
+	if (opt & MPI_GPU_DIRECT)
+	{