Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
argupta
openfpm_pdata
Commits
1974b8bc
Commit
1974b8bc
authored
Nov 12, 2018
by
incardon
Browse files
Fixing GPU tests for pdata
parent
11ddfc33
Changes
13
Hide whitespace changes
Inline
Side-by-side
CMakeLists.txt
View file @
1974b8bc
cmake_minimum_required
(
VERSION 3.8 FATAL_ERROR
)
project
(
openfpm_pdata LANGUAGES C CXX
)
enable_testing
()
list
(
APPEND CMAKE_MODULE_PATH
${
CMAKE_CURRENT_LIST_DIR
}
/cmake_modules/
)
...
...
configure
View file @
1974b8bc
...
...
@@ -215,6 +215,7 @@ do
case
$ac_useropt
in
debug
)
conf_options
=
"
$conf_options
-DCMAKE_BUILD_TYPE=Debug"
debug_mode
=
1
;;
se_class1
)
conf_options
=
"
$conf_options
-DSE_CLASS1=ON"
...
...
@@ -524,6 +525,10 @@ Try \`$0 --help' for more information"
esac
done
if
[
x
"
$debug_mode
"
!=
x
"1"
]
;
then
conf_options+
=
"
$conf_options
-DCMAKE_BUILD_TYPE=Release"
fi
cd
build
## remove enerything
...
...
src/CMakeLists.txt
View file @
1974b8bc
cmake_minimum_required
(
VERSION 3.8 FATAL_ERROR
)
########################### Executables
if
(
CUDA_FOUND
)
...
...
@@ -16,6 +17,9 @@ endif()
add_library
(
ofpm_pdata STATIC lib/pdata.cpp
)
add_test
(
NAME pdata_3_proc COMMAND mpirun -np 3 ./pdata
)
add_test
(
NAME pdata_4_proc COMMAND mpirun -np 4 ./pdata
)
###########################
if
(
CUDA_FOUND
)
...
...
src/Decomposition/cuda/CartDecomposition_gpu.cuh
View file @
1974b8bc
...
...
@@ -105,7 +105,7 @@ public:
}
CartDecomposition_gpu
(
const
CartDecomposition_gpu
<
dim
,
T
,
Memory
,
layout_base
>
&
dec
)
:
ie_ghost_gpu
<
dim
,
T
,
Memory
,
layout_base
>
(
dec
),
clk
(
dec
.
clk
),
domain
(
dec
.
domain
)
:
ie_ghost_gpu
<
dim
,
T
,
Memory
,
layout_base
>
(
dec
),
clk
(
dec
.
clk
),
domain
(
dec
.
domain
)
,
sub_domains_global
(
dec
.
sub_domains_global
)
{
for
(
int
s
=
0
;
s
<
dim
;
s
++
)
{
this
->
bc
[
s
]
=
dec
.
bc
[
s
];}
...
...
src/Decomposition/cuda/decomposition_cuda_tests.cu
View file @
1974b8bc
...
...
@@ -41,7 +41,7 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
// Vcluster
Vcluster
<>
&
vcl
=
create_vcluster
();
CartDecomposition
<
3
,
double
>
dec
(
vcl
);
CartDecomposition
<
3
,
double
,
CudaMemory
,
memory_traits_inte
>
dec
(
vcl
);
size_t
bc
[
3
]
=
{
PERIODIC
,
PERIODIC
,
PERIODIC
};
...
...
@@ -87,6 +87,8 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
mem2
.
allocate
(
2
*
sizeof
(
unsigned
int
));
test_ghost_n
<
decltype
(
gpudec
)
><<<
1
,
1
>>>
(
p1
,
p2
,
gpudec
,(
unsigned
int
*
)
mem2
.
getDevicePointer
());
mem2
.
deviceToHost
();
unsigned
int
tot
=
((
unsigned
int
*
)
mem2
.
getPointer
())[
0
]
+
((
unsigned
int
*
)
mem2
.
getPointer
())[
1
];
openfpm
::
vector_gpu
<
aggregate
<
int
,
int
>>
vd
;
...
...
@@ -119,12 +121,16 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
test_proc_idbc
<
decltype
(
gpudec
)
><<<
1
,
1
>>>
(
p1
,
p2
,
gpudec
,(
unsigned
int
*
)
mem
.
getDevicePointer
());
mem
.
deviceToHost
();
BOOST_REQUIRE
(((
unsigned
int
*
)
mem
.
getPointer
())[
0
]
<
vcl
.
size
());
BOOST_REQUIRE
(((
unsigned
int
*
)
mem
.
getPointer
())[
1
]
<
vcl
.
size
());
mem2
.
allocate
(
2
*
sizeof
(
unsigned
int
));
test_ghost_n
<
decltype
(
gpudec
)
><<<
1
,
1
>>>
(
p1
,
p2
,
gpudec
,(
unsigned
int
*
)
mem2
.
getDevicePointer
());
mem2
.
deviceToHost
();
tot
=
((
unsigned
int
*
)
mem2
.
getPointer
())[
0
]
+
((
unsigned
int
*
)
mem2
.
getPointer
())[
1
];
vd
.
resize
(
tot
);
...
...
@@ -132,7 +138,7 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
if
(((
unsigned
int
*
)
mem
.
getPointer
())[
0
]
!=
((
unsigned
int
*
)
mem
.
getPointer
())[
1
])
{
if
(
vcl
.
rank
()
==
((
unsigned
int
*
)
mem
.
getPointer
())[
2
])
if
(
vcl
.
rank
()
==
((
unsigned
int
*
)
mem
.
getPointer
())[
1
])
{
BOOST_REQUIRE
(((
unsigned
int
*
)
mem2
.
getPointer
())[
1
]
!=
0
);
BOOST_REQUIRE
(((
unsigned
int
*
)
mem2
.
getPointer
())[
0
]
==
0
);
...
...
src/Decomposition/ie_ghost.hpp
View file @
1974b8bc
...
...
@@ -448,12 +448,17 @@ protected:
reorder_geo_cell
();
}
/*! \brief in this function we reorder the cells by processor
s
/*! \brief in this function we reorder the
list in each
cells by processor
id
*
*
In practice every processor in the list is ordered. the geo_cell give
*
suppose in one cell we have 7 boxes each box contain the processor id
*
* 7 boxes the first 2 boxes are related to processor 0 and the next 2 to processor 4, the other 3 must me related
* to another processor different from 0 and 4. This simplify the procedure to get a unique list of processor ids
* 1,5,9,5,1,1,6
*
* after reorder we have the following sequence
*
* 1,1,1,5,5,6,9
*
* This simplify the procedure to get a unique list of processor ids
* indicating on which processor a particle must be replicated as ghost
*
*/
...
...
src/Vector/cuda/vector_dist_comm_util_funcs.cuh
View file @
1974b8bc
...
...
@@ -108,29 +108,39 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,sca
mem
.
fill
(
0
);
prc_offset
.
resize
(
v_cl
.
size
());
ite
=
g_opart_device
.
getGPUIterator
();
// Find the buffer bases
find_buffer_offsets
<
0
,
decltype
(
g_opart_device
.
toKernel
()),
decltype
(
prc_offset
.
toKernel
())
><<<
ite
.
wthr
,
ite
.
thr
>>>
(
g_opart_device
.
toKernel
(),(
int
*
)
mem
.
getDevicePointer
(),
prc_offset
.
toKernel
());
// Trasfer the number of offsets on CPU
mem
.
deviceToHost
();
prc_offset
.
template
deviceToHost
<
0
,
1
>();
if
(
g_opart_device
.
size
()
!=
0
)
{
g_opart_device
.
template
deviceToHost
<
0
>(
g_opart_device
.
size
()
-
1
,
g_opart_device
.
size
()
-
1
);}
int
noff
=
*
(
int
*
)
mem
.
getPointer
();
// In this case we do not have communications at all
if
(
g_opart_device
.
size
()
==
0
)
{
noff
=
-
1
;}
// create the terminal of prc_offset
prc_offset
.
resize
(
noff
+
1
,
DATA_ON_DEVICE
);
prc_offset
.
resize
(
noff
+
1
);
// Move the last processor index on device (id)
if
(
g_opart_device
.
size
()
!=
0
)
{
g_opart_device
.
template
deviceToHost
<
0
>(
g_opart_device
.
size
()
-
1
,
g_opart_device
.
size
()
-
1
);}
prc_offset
.
template
get
<
0
>(
prc_offset
.
size
()
-
1
)
=
g_opart_device
.
size
();
if
(
g_opart_device
.
size
()
!=
0
)
{
prc_offset
.
template
get
<
1
>(
prc_offset
.
size
()
-
1
)
=
g_opart_device
.
template
get
<
0
>(
g_opart_device
.
size
()
-
1
);}
else
{
prc_offset
.
template
get
<
1
>(
prc_offset
.
size
()
-
1
)
=
0
;}
prc_offset
.
template
hostToDevice
<
0
,
1
>(
prc_offset
.
size
()
-
1
,
prc_offset
.
size
()
-
1
);
// Here we reorder the offsets in ascending order
mergesort
((
int
*
)
prc_offset
.
template
getDeviceBuffer
<
0
>(),(
int
*
)
prc_offset
.
template
getDeviceBuffer
<
1
>(),
prc_offset
.
size
(),
mgpu
::
template
less_t
<
int
>(),
v_cl
.
getmgpuContext
());
prc_offset
.
template
deviceToHost
<
0
,
1
>();
// In this case we do not have communications at all
if
(
g_opart_device
.
size
()
==
0
)
{
noff
=
-
1
;}
prc
.
resize
(
noff
+
1
);
prc_sz
.
resize
(
noff
+
1
);
...
...
@@ -236,9 +246,9 @@ struct local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,true>
auto
ite
=
v_pos
.
getGPUIteratorTo
(
g_m
);
// label particle processor
num_shift_ghost_each_part
<
dim
,
St
,
decltype
(
box_f_dev
.
toKernel
()),
decltype
(
v_pos
.
toKernel
()),
decltype
(
o_part_loc
.
toKernel
())
>
num_shift_ghost_each_part
<
dim
,
St
,
decltype
(
box_f_dev
.
toKernel
()),
decltype
(
box_f_sv
.
toKernel
()),
decltype
(
v_pos
.
toKernel
()),
decltype
(
o_part_loc
.
toKernel
())
>
<<<
ite
.
wthr
,
ite
.
thr
>>>
(
box_f_dev
.
toKernel
(),
v_pos
.
toKernel
(),
o_part_loc
.
toKernel
());
(
box_f_dev
.
toKernel
(),
box_f_sv
.
toKernel
(),
v_pos
.
toKernel
(),
o_part_loc
.
toKernel
()
,
g_m
);
starts
.
resize
(
o_part_loc
.
size
());
mgpu
::
scan
((
unsigned
int
*
)
o_part_loc
.
template
getDeviceBuffer
<
0
>(),
o_part_loc
.
size
(),
(
unsigned
int
*
)
starts
.
template
getDeviceBuffer
<
0
>()
,
v_cl
.
getmgpuContext
());
...
...
src/Vector/cuda/vector_dist_cuda_func_test.cu
View file @
1974b8bc
...
...
@@ -98,9 +98,9 @@ BOOST_AUTO_TEST_CASE( vector_ghost_process_local_particles )
v_prp
.
hostToDevice
<
0
,
1
,
2
>
();
// label particle processor
num_shift_ghost_each_part
<
3
,
float
,
decltype
(
box_f_dev
.
toKernel
()),
decltype
(
v_pos
.
toKernel
()),
decltype
(
o_part_loc
.
toKernel
())
>
num_shift_ghost_each_part
<
3
,
float
,
decltype
(
box_f_dev
.
toKernel
()),
decltype
(
box_f_sv
.
toKernel
()),
decltype
(
v_pos
.
toKernel
()),
decltype
(
o_part_loc
.
toKernel
())
>
<<<
ite
.
wthr
,
ite
.
thr
>>>
(
box_f_dev
.
toKernel
(),
v_pos
.
toKernel
(),
o_part_loc
.
toKernel
());
(
box_f_dev
.
toKernel
(),
box_f_sv
.
toKernel
(),
v_pos
.
toKernel
(),
o_part_loc
.
toKernel
()
,
v_pos
.
size
()
);
o_part_loc
.
deviceToHost
<
0
>
();
...
...
@@ -734,11 +734,16 @@ BOOST_AUTO_TEST_CASE( decomposition_to_gpu_test_use )
proc_id_out
.
resize
(
vg
.
size
());
openfpm
::
vector_gpu
<
aggregate
<
int
,
int
,
int
>>
dev_counter
;
dev_counter
.
resize
(
10
);
dev_counter
.
fill
<
0
>
(
0
);
dev_counter
.
fill
<
1
>
(
0
);
dev_counter
.
fill
<
2
>
(
0
);
process_id_proc_each_part
<
3
,
float
,
decltype
(
dec
.
toKernel
()),
decltype
(
vg
.
toKernel
()),
decltype
(
proc_id_out
.
toKernel
()),
decltype
(
dev_counter
.
toKernel
())
>
<<<
ite
.
wthr
,
ite
.
thr
>>>
(
dec
.
toKernel
(),
vg
.
toKernel
(),
proc_id_out
.
toKernel
(),
dev_counter
.
toKernel
(),
v_cl
.
rank
());
proc_id_out
.
deviceToHost
<
0
>
();
bool
match
=
true
;
...
...
@@ -852,33 +857,33 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort)
for
(
int
i
=
0
;
i
<
10000
;
i
++
)
// <------ particle id
{
v_pos
.
template
get
<
0
>(
i
)[
0
]
=
i
;
v_pos
.
template
get
<
0
>(
i
)[
1
]
=
i
+
10000
;
v_pos
.
template
get
<
0
>(
i
)[
2
]
=
i
+
20000
;
v_pos
_out
.
template
get
<
0
>(
i
)[
0
]
=
i
;
v_pos
_out
.
template
get
<
0
>(
i
)[
1
]
=
i
+
10000
;
v_pos
_out
.
template
get
<
0
>(
i
)[
2
]
=
i
+
20000
;
v_prp
.
template
get
<
0
>(
i
)[
0
]
=
i
+
60123
;
v_prp
.
template
get
<
0
>(
i
)[
1
]
=
i
+
73543
;
v_prp
.
template
get
<
0
>(
i
)[
2
]
=
i
+
82432
;
v_prp
_out
.
template
get
<
0
>(
i
)[
0
]
=
i
+
60123
;
v_prp
_out
.
template
get
<
0
>(
i
)[
1
]
=
i
+
73543
;
v_prp
_out
.
template
get
<
0
>(
i
)[
2
]
=
i
+
82432
;
v_prp
.
template
get
<
1
>(
i
)[
0
]
=
i
+
80123
;
v_prp
.
template
get
<
1
>(
i
)[
1
]
=
i
+
93543
;
v_prp
.
template
get
<
1
>(
i
)[
2
]
=
i
+
102432
;
v_prp
_out
.
template
get
<
1
>(
i
)[
0
]
=
i
+
80123
;
v_prp
_out
.
template
get
<
1
>(
i
)[
1
]
=
i
+
93543
;
v_prp
_out
.
template
get
<
1
>(
i
)[
2
]
=
i
+
102432
;
v_prp
.
template
get
<
2
>(
i
)[
0
]
=
i
+
110123
;
v_prp
.
template
get
<
2
>(
i
)[
1
]
=
i
+
123543
;
v_prp
.
template
get
<
2
>(
i
)[
2
]
=
i
+
132432
;
v_prp
_out
.
template
get
<
2
>(
i
)[
0
]
=
i
+
110123
;
v_prp
_out
.
template
get
<
2
>(
i
)[
1
]
=
i
+
123543
;
v_prp
_out
.
template
get
<
2
>(
i
)[
2
]
=
i
+
132432
;
v_prp
_out
.
template
get
<
0
>(
i
)[
0
]
=
0
;
v_prp
_out
.
template
get
<
0
>(
i
)[
1
]
=
0
;
v_prp
_out
.
template
get
<
0
>(
i
)[
2
]
=
0
;
v_prp
.
template
get
<
0
>(
i
)[
0
]
=
0
;
v_prp
.
template
get
<
0
>(
i
)[
1
]
=
0
;
v_prp
.
template
get
<
0
>(
i
)[
2
]
=
0
;
v_prp
_out
.
template
get
<
1
>(
i
)[
0
]
=
0
;
v_prp
_out
.
template
get
<
1
>(
i
)[
1
]
=
0
;
v_prp
_out
.
template
get
<
1
>(
i
)[
2
]
=
0
;
v_prp
.
template
get
<
1
>(
i
)[
0
]
=
0
;
v_prp
.
template
get
<
1
>(
i
)[
1
]
=
0
;
v_prp
.
template
get
<
1
>(
i
)[
2
]
=
0
;
v_prp
_out
.
template
get
<
2
>(
i
)[
0
]
=
0
;
v_prp
_out
.
template
get
<
2
>(
i
)[
1
]
=
0
;
v_prp
_out
.
template
get
<
2
>(
i
)[
2
]
=
0
;
v_prp
.
template
get
<
2
>(
i
)[
0
]
=
0
;
v_prp
.
template
get
<
2
>(
i
)[
1
]
=
0
;
v_prp
.
template
get
<
2
>(
i
)[
2
]
=
0
;
ns_to_s
.
template
get
<
0
>(
i
)
=
10000
-
i
-
1
;
}
...
...
@@ -895,7 +900,7 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort)
v_pos_out
.
toKernel
(),
v_prp_out
.
toKernel
(),
ns_to_s
.
toKernel
());
v_prp
_out
.
template
deviceToHost
<
0
,
1
,
2
>();
v_prp
.
template
deviceToHost
<
0
,
1
,
2
>();
bool
match
=
true
;
for
(
int
i
=
0
;
i
<
10000
;
i
++
)
// <------ particle id
...
...
@@ -904,13 +909,13 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort)
match
&=
v_prp_out
.
template
get
<
0
>(
10000
-
i
-
1
)[
1
]
==
v_prp
.
template
get
<
0
>(
i
)[
1
];
match
&=
v_prp_out
.
template
get
<
0
>(
10000
-
i
-
1
)[
2
]
==
v_prp
.
template
get
<
0
>(
i
)[
2
];
match
&=
v_prp
_out
.
template
get
<
1
>(
10000
-
i
-
1
)[
0
]
==
0
;
match
&=
v_prp
_out
.
template
get
<
1
>(
10000
-
i
-
1
)[
1
]
==
0
;
match
&=
v_prp
_out
.
template
get
<
1
>(
10000
-
i
-
1
)[
2
]
==
0
;
match
&=
v_prp
.
template
get
<
1
>(
10000
-
i
-
1
)[
0
]
==
0
;
match
&=
v_prp
.
template
get
<
1
>(
10000
-
i
-
1
)[
1
]
==
0
;
match
&=
v_prp
.
template
get
<
1
>(
10000
-
i
-
1
)[
2
]
==
0
;
match
&=
v_prp
_out
.
template
get
<
2
>(
10000
-
i
-
1
)[
0
]
==
0
;
match
&=
v_prp
_out
.
template
get
<
2
>(
10000
-
i
-
1
)[
1
]
==
0
;
match
&=
v_prp
_out
.
template
get
<
2
>(
10000
-
i
-
1
)[
2
]
==
0
;
match
&=
v_prp
.
template
get
<
2
>(
10000
-
i
-
1
)[
0
]
==
0
;
match
&=
v_prp
.
template
get
<
2
>(
10000
-
i
-
1
)[
1
]
==
0
;
match
&=
v_prp
.
template
get
<
2
>(
10000
-
i
-
1
)[
2
]
==
0
;
}
BOOST_REQUIRE_EQUAL
(
match
,
true
);
...
...
@@ -919,8 +924,8 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort)
v_pos_out
.
toKernel
(),
v_prp_out
.
toKernel
(),
ns_to_s
.
toKernel
());
v_prp
_out
.
template
deviceToHost
<
0
,
1
,
2
>();
v_pos
_out
.
template
deviceToHost
<
0
>();
v_prp
.
template
deviceToHost
<
0
,
1
,
2
>();
v_pos
.
template
deviceToHost
<
0
>();
for
(
int
i
=
0
;
i
<
10000
;
i
++
)
// <------ particle id
{
...
...
@@ -937,9 +942,9 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort)
match
&=
v_prp_out
.
template
get
<
2
>(
10000
-
i
-
1
)[
2
]
==
v_prp
.
template
get
<
2
>(
i
)[
2
];
match
&=
v_pos
_out
.
template
get
<
0
>(
10000
-
i
-
1
)[
0
]
==
0
;
match
&=
v_pos
_out
.
template
get
<
0
>(
10000
-
i
-
1
)[
1
]
==
0
;
match
&=
v_pos
_out
.
template
get
<
0
>(
10000
-
i
-
1
)[
2
]
==
0
;
match
&=
v_pos
.
template
get
<
0
>(
10000
-
i
-
1
)[
0
]
==
0
;
match
&=
v_pos
.
template
get
<
0
>(
10000
-
i
-
1
)[
1
]
==
0
;
match
&=
v_pos
.
template
get
<
0
>(
10000
-
i
-
1
)[
2
]
==
0
;
}
BOOST_REQUIRE_EQUAL
(
match
,
true
);
...
...
@@ -948,11 +953,13 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort)
v_pos_out
.
toKernel
(),
v_prp_out
.
toKernel
(),
ns_to_s
.
toKernel
());
v_prp
_out
.
template
deviceToHost
<
0
,
1
,
2
>();
v_pos
_out
.
template
deviceToHost
<
0
>();
v_prp
.
template
deviceToHost
<
0
,
1
,
2
>();
v_pos
.
template
deviceToHost
<
0
>();
for
(
int
i
=
0
;
i
<
10000
;
i
++
)
// <------ particle id
{
match
&=
v_prp_out
.
template
get
<
0
>(
10000
-
i
-
1
)[
0
]
==
v_prp
.
template
get
<
0
>(
i
)[
0
];
match
&=
v_prp_out
.
template
get
<
0
>(
10000
-
i
-
1
)[
1
]
==
v_prp
.
template
get
<
0
>(
i
)[
1
];
match
&=
v_prp_out
.
template
get
<
0
>(
10000
-
i
-
1
)[
2
]
==
v_prp
.
template
get
<
0
>(
i
)[
2
];
...
...
@@ -1010,8 +1017,8 @@ BOOST_AUTO_TEST_CASE(vector_dist_gpu_map_fill_send_buffer_test)
v_prp
.
template
get
<
2
>(
i
)[
2
][
2
]
=
340.0
+
(
float
)
rand
()
/
RAND_MAX
;
int
seg
=
i
/
10000
;
m_opart
.
template
get
<
0
>(
i
)
=
seg
;
m_opart
.
template
get
<
1
>(
i
)
=
(
9999
-
i
%
10000
)
+
seg
*
10000
;
m_opart
.
template
get
<
1
>(
i
)
=
seg
;
m_opart
.
template
get
<
0
>(
i
)
=
(
9999
-
i
%
10000
)
+
seg
*
10000
;
}
m_pos
.
resize
(
10
);
...
...
@@ -1045,24 +1052,24 @@ BOOST_AUTO_TEST_CASE(vector_dist_gpu_map_fill_send_buffer_test)
for
(
size_t
j
=
0
;
j
<
m_pos
.
get
(
i
).
size
()
;
j
++
)
{
match
&=
(
m_pos
.
get
(
i
).
template
get
<
0
>(
j
)[
0
]
==
v_pos
.
template
get
<
0
>(
m_opart
.
template
get
<
1
>(
offset
+
j
))[
0
]);
match
&=
(
m_pos
.
get
(
i
).
template
get
<
0
>(
j
)[
1
]
==
v_pos
.
template
get
<
0
>(
m_opart
.
template
get
<
1
>(
offset
+
j
))[
1
]);
match
&=
(
m_pos
.
get
(
i
).
template
get
<
0
>(
j
)[
2
]
==
v_pos
.
template
get
<
0
>(
m_opart
.
template
get
<
1
>(
offset
+
j
))[
2
]);
match
&=
(
m_prp
.
get
(
i
).
template
get
<
0
>(
j
)
==
v_prp
.
template
get
<
0
>(
m_opart
.
template
get
<
1
>(
offset
+
j
)));
match
&=
(
m_prp
.
get
(
i
).
template
get
<
1
>(
j
)[
0
]
==
v_prp
.
template
get
<
1
>(
m_opart
.
template
get
<
1
>(
offset
+
j
))[
0
]);
match
&=
(
m_prp
.
get
(
i
).
template
get
<
1
>(
j
)[
1
]
==
v_prp
.
template
get
<
1
>(
m_opart
.
template
get
<
1
>(
offset
+
j
))[
1
]);
match
&=
(
m_prp
.
get
(
i
).
template
get
<
2
>(
j
)[
0
][
0
]
==
v_prp
.
template
get
<
2
>(
m_opart
.
template
get
<
1
>(
offset
+
j
))[
0
][
0
]);
match
&=
(
m_prp
.
get
(
i
).
template
get
<
2
>(
j
)[
0
][
1
]
==
v_prp
.
template
get
<
2
>(
m_opart
.
template
get
<
1
>(
offset
+
j
))[
0
][
1
]);
match
&=
(
m_prp
.
get
(
i
).
template
get
<
2
>(
j
)[
0
][
2
]
==
v_prp
.
template
get
<
2
>(
m_opart
.
template
get
<
1
>(
offset
+
j
))[
0
][
2
]);
match
&=
(
m_prp
.
get
(
i
).
template
get
<
2
>(
j
)[
1
][
0
]
==
v_prp
.
template
get
<
2
>(
m_opart
.
template
get
<
1
>(
offset
+
j
))[
1
][
0
]);
match
&=
(
m_prp
.
get
(
i
).
template
get
<
2
>(
j
)[
1
][
1
]
==
v_prp
.
template
get
<
2
>(
m_opart
.
template
get
<
1
>(
offset
+
j
))[
1
][
1
]);
match
&=
(
m_prp
.
get
(
i
).
template
get
<
2
>(
j
)[
1
][
2
]
==
v_prp
.
template
get
<
2
>(
m_opart
.
template
get
<
1
>(
offset
+
j
))[
1
][
2
]);
match
&=
(
m_prp
.
get
(
i
).
template
get
<
2
>(
j
)[
2
][
0
]
==
v_prp
.
template
get
<
2
>(
m_opart
.
template
get
<
1
>(
offset
+
j
))[
2
][
0
]);
match
&=
(
m_prp
.
get
(
i
).
template
get
<
2
>(
j
)[
2
][
1
]
==
v_prp
.
template
get
<
2
>(
m_opart
.
template
get
<
1
>(
offset
+
j
))[
2
][
1
]);
match
&=
(
m_prp
.
get
(
i
).
template
get
<
2
>(
j
)[
2
][
2
]
==
v_prp
.
template
get
<
2
>(
m_opart
.
template
get
<
1
>(
offset
+
j
))[
2
][
2
]);
match
&=
(
m_pos
.
get
(
i
).
template
get
<
0
>(
j
)[
0
]
==
v_pos
.
template
get
<
0
>(
m_opart
.
template
get
<
0
>(
offset
+
j
))[
0
]);
match
&=
(
m_pos
.
get
(
i
).
template
get
<
0
>(
j
)[
1
]
==
v_pos
.
template
get
<
0
>(
m_opart
.
template
get
<
0
>(
offset
+
j
))[
1
]);
match
&=
(
m_pos
.
get
(
i
).
template
get
<
0
>(
j
)[
2
]
==
v_pos
.
template
get
<
0
>(
m_opart
.
template
get
<
0
>(
offset
+
j
))[
2
]);
match
&=
(
m_prp
.
get
(
i
).
template
get
<
0
>(
j
)
==
v_prp
.
template
get
<
0
>(
m_opart
.
template
get
<
0
>(
offset
+
j
)));
match
&=
(
m_prp
.
get
(
i
).
template
get
<
1
>(
j
)[
0
]
==
v_prp
.
template
get
<
1
>(
m_opart
.
template
get
<
0
>(
offset
+
j
))[
0
]);
match
&=
(
m_prp
.
get
(
i
).
template
get
<
1
>(
j
)[
1
]
==
v_prp
.
template
get
<
1
>(
m_opart
.
template
get
<
0
>(
offset
+
j
))[
1
]);
match
&=
(
m_prp
.
get
(
i
).
template
get
<
2
>(
j
)[
0
][
0
]
==
v_prp
.
template
get
<
2
>(
m_opart
.
template
get
<
0
>(
offset
+
j
))[
0
][
0
]);
match
&=
(
m_prp
.
get
(
i
).
template
get
<
2
>(
j
)[
0
][
1
]
==
v_prp
.
template
get
<
2
>(
m_opart
.
template
get
<
0
>(
offset
+
j
))[
0
][
1
]);
match
&=
(
m_prp
.
get
(
i
).
template
get
<
2
>(
j
)[
0
][
2
]
==
v_prp
.
template
get
<
2
>(
m_opart
.
template
get
<
0
>(
offset
+
j
))[
0
][
2
]);
match
&=
(
m_prp
.
get
(
i
).
template
get
<
2
>(
j
)[
1
][
0
]
==
v_prp
.
template
get
<
2
>(
m_opart
.
template
get
<
0
>(
offset
+
j
))[
1
][
0
]);
match
&=
(
m_prp
.
get
(
i
).
template
get
<
2
>(
j
)[
1
][
1
]
==
v_prp
.
template
get
<
2
>(
m_opart
.
template
get
<
0
>(
offset
+
j
))[
1
][
1
]);
match
&=
(
m_prp
.
get
(
i
).
template
get
<
2
>(
j
)[
1
][
2
]
==
v_prp
.
template
get
<
2
>(
m_opart
.
template
get
<
0
>(
offset
+
j
))[
1
][
2
]);
match
&=
(
m_prp
.
get
(
i
).
template
get
<
2
>(
j
)[
2
][
0
]
==
v_prp
.
template
get
<
2
>(
m_opart
.
template
get
<
0
>(
offset
+
j
))[
2
][
0
]);
match
&=
(
m_prp
.
get
(
i
).
template
get
<
2
>(
j
)[
2
][
1
]
==
v_prp
.
template
get
<
2
>(
m_opart
.
template
get
<
0
>(
offset
+
j
))[
2
][
1
]);
match
&=
(
m_prp
.
get
(
i
).
template
get
<
2
>(
j
)[
2
][
2
]
==
v_prp
.
template
get
<
2
>(
m_opart
.
template
get
<
0
>(
offset
+
j
))[
2
][
2
]);
}
BOOST_REQUIRE_EQUAL
(
match
,
true
);
...
...
src/Vector/cuda/vector_dist_cuda_funcs.cuh
View file @
1974b8bc
...
...
@@ -63,7 +63,7 @@ __global__ void merge_sort_part(vector_pos_type vd_pos, vector_prp_type vd_prp,
vd_pos
.
template
set
<
0
>(
p
,
v_pos_ord
,
nss
.
template
get
<
0
>(
p
));
}
vd_prp
.
template
set
<
prp
...>(
p
,
vd_prp_ord
,
nss
.
template
get
<
0
>(
p
));
vd_prp
.
template
set
<
prp
...>(
p
,
vd_prp_ord
,
nss
.
template
get
<
0
>(
p
));
}
template
<
unsigned
int
dim
,
typename
St
,
typename
cartdec_gpu
,
typename
particles_type
,
typename
vector_out
,
typename
prc_sz_type
>
...
...
@@ -99,7 +99,7 @@ __global__ void find_buffer_offsets(vector_type vd, int * cnt, vector_type_offs
{
int
i
=
atomicAdd
(
cnt
,
1
);
offs
.
template
get
<
0
>(
i
)
=
p
+
1
;
offs
.
template
get
<
1
>(
i
)
=
vd
.
template
get
<
1
>(
p
);
offs
.
template
get
<
1
>(
i
)
=
vd
.
template
get
<
prp_off
>(
p
);
}
}
...
...
@@ -184,12 +184,13 @@ __global__ void process_ghost_particles_local(vector_g_opart_type g_opart, vecto
v_prp
.
set
(
base
+
i
,
v_prp
.
get
(
pid
));
}
template
<
unsigned
int
dim
,
typename
St
,
typename
vector_of_box
,
typename
vector_type
,
typename
output_type
>
__global__
void
num_shift_ghost_each_part
(
vector_of_box
box_f
,
vector_type
vd
,
output_type
out
)
template
<
unsigned
int
dim
,
typename
St
,
typename
vector_of_box
,
typename
vector_of_shifts
,
typename
vector_type
,
typename
output_type
>
__global__
void
num_shift_ghost_each_part
(
vector_of_box
box_f
,
vector_of_shifts
box_f_sv
,
vector_type
vd
,
output_type
out
,
unsigned
int
g_m
)
{
unsigned
int
old_shift
=
(
unsigned
int
)
-
1
;
int
p
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
if
(
p
>=
vd
.
size
()
)
return
;
if
(
p
>=
g_m
)
return
;
Point
<
dim
,
St
>
xp
=
vd
.
template
get
<
0
>(
p
);
...
...
@@ -197,8 +198,14 @@ __global__ void num_shift_ghost_each_part(vector_of_box box_f, vector_type vd,
for
(
unsigned
int
i
=
0
;
i
<
box_f
.
size
()
;
i
++
)
{
if
(
Box
<
dim
,
St
>
(
box_f
.
get
(
i
)).
isInsideNP
(
xp
)
==
true
)
{
n
++
;}
unsigned
int
shift_actual
=
box_f_sv
.
template
get
<
0
>(
i
);
bool
sw
=
(
old_shift
==
shift_actual
)
?
true
:
false
;
if
(
Box
<
dim
,
St
>
(
box_f
.
get
(
i
)).
isInsideNP
(
xp
)
==
true
&&
sw
==
false
)
{
old_shift
=
shift_actual
;
n
++
;
}
}
out
.
template
get
<
0
>(
p
)
=
n
;
...
...
@@ -217,6 +224,7 @@ __global__ void shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_
start_type
start
,
shifts_type
shifts
,
output_type
output
,
unsigned
int
offset
)
{
unsigned
int
old_shift
=
(
unsigned
int
)
-
1
;
int
p
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
if
(
p
>=
v_pos
.
size
())
return
;
...
...
@@ -231,20 +239,23 @@ __global__ void shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_
for
(
unsigned
int
i
=
0
;
i
<
box_f
.
size
()
;
i
++
)
{
if
(
Box
<
dim
,
St
>
(
box_f
.
get
(
i
)).
isInsideNP
(
xp
)
==
true
)
unsigned
int
shift_actual
=
box_f_sv
.
template
get
<
0
>(
i
);
bool
sw
=
(
old_shift
==
shift_actual
)
?
true
:
false
;
if
(
Box
<
dim
,
St
>
(
box_f
.
get
(
i
)).
isInsideNP
(
xp
)
==
true
&&
sw
==
false
)
{
unsigned
int
shift_id
=
box_f_sv
.
template
get
<
0
>(
i
);
#pragma unroll
for
(
unsigned
int
j
=
0
;
j
<
dim
;
j
++
)
{
v_pos
.
template
get
<
0
>(
base
+
n
)[
j
]
=
xp
.
get
(
j
)
-
shifts
.
template
get
<
0
>(
shift_
id
)[
j
];
v_pos
.
template
get
<
0
>(
base
+
n
)[
j
]
=
xp
.
get
(
j
)
-
shifts
.
template
get
<
0
>(
shift_
actual
)[
j
];
output
.
template
get
<
0
>(
base_o
+
n
)
=
p
;
output
.
template
get
<
1
>(
base_o
+
n
)
=
shift_
id
;
output
.
template
get
<
1
>(
base_o
+
n
)
=
shift_
actual
;
}
v_prp
.
set
(
base
+
n
,
v_prp
.
get
(
p
));
old_shift
=
shift_actual
;
n
++
;
}
}
...
...
src/Vector/cuda/vector_dist_gpu_unit_tests.cu
View file @
1974b8bc
...
...
@@ -50,8 +50,6 @@ __global__ void calculate_force(vector_dist_ker<3, float, aggregate<float, floa
auto
it
=
cl
.
getNNIterator
(
cl
.
getCell
(
xp
));
auto
cell
=
cl
.
getCell
(
xp
);
Point
<
3
,
float
>
force1
({
0.0
,
0.0
,
0.0
});
Point
<
3
,
float
>
force2
({
0.0
,
0.0
,
0.0
});
...
...
@@ -95,14 +93,10 @@ __global__ void calculate_force_full_sort(vector_dist_ker<3, float, aggregate<f
unsigned
int
p
;
GET_PARTICLE_SORT
(
p
,
cl
);
unsigned
int
ns_id
=
cl
.
getSortToNonSort
().
template
get
<
0
>(
p
);
Point
<
3
,
float
>
xp
=
vd
.
getPos
(
p
);
auto
it
=
cl
.
getNNIterator
(
cl
.
getCell
(
xp
));
auto
cell
=
cl
.
getCell
(
xp
);
Point
<
3
,
float
>
force1
({
0.0
,
0.0
,
0.0
});
while
(
it
.
isNext
())
...
...
@@ -159,6 +153,12 @@ bool check_force(CellList_type & NN_cpu, vector_type & vd)
// Normalize
if
(
r2
.
norm
()
==
0
)
{
int
debug
=
0
;
debug
++
;
}
r2
/=
r2
.
norm
();
force
+=
vd
.
template
getProp
<
0
>(
q
)
*
r2
;
...
...
@@ -314,15 +314,23 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
vector_dist_gpu
<
3
,
float
,
aggregate
<
float
,
float
[
3
],
float
[
3
]
>>
vd
(
10000
,
domain
,
bc
,
g
);
srand
(
55067
*
create_vcluster
().
rank
());
auto
it
=
vd
.
getDomainIterator
();
while
(
it
.
isNext
())
{
auto
p
=
it
.
get
();
vd
.
getPos
(
p
)[
0
]
=
(
float
)
rand
()
/
RAND_MAX
;
vd
.
getPos
(
p
)[
1
]
=
(
float
)
rand
()
/
RAND_MAX
;
vd
.
getPos
(
p
)[
2
]
=
(
float
)
rand
()
/
RAND_MAX
;
int
x
=
rand
();
int
y
=
rand
();
int
z
=
rand
();
vd
.
getPos
(
p
)[
0
]
=
(
float
)
x
/
RAND_MAX
;
vd
.
getPos
(
p
)[
1
]
=
(
float
)
y
/
RAND_MAX
;
vd
.
getPos
(
p
)[
2
]
=
(
float
)
z
/
RAND_MAX
;
Point
<
3
,
float
>
xp
=
vd
.
getPos
(
p
);
++
it
;
}
...
...
@@ -358,8 +366,6 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
BOOST_REQUIRE_EQUAL
(
noOut
,
true
);
BOOST_REQUIRE_EQUAL
(
cnt
,
vd
.
size_local
());
vd
.
write
(
"test_out_gpu"
);
// now we offload all the properties
auto
it3
=
vd
.
getDomainIteratorGPU
();
...
...
@@ -426,6 +432,7 @@ void vdist_calc_gpu_test()
vector_dist_gpu
<
3
,
St
,
aggregate
<
St
,
St
[
3
],
St
[
3
]
>>
vd
(
1000
,
domain
,
bc
,
g
);
srand
(
v_cl
.
rank
()
*
10000
);
auto
it
=
vd
.
getDomainIterator
();
while
(
it
.
isNext
())
...
...
@@ -459,8 +466,6 @@ void vdist_calc_gpu_test()
vd
.
deviceToHostPos
();
vd
.
template
deviceToHostProp
<
0
,
1
,
2
>();
vd
.
write
(
"write_start"
);
// Reset the host part
auto
it3
=
vd
.
getDomainIterator
();
...
...
@@ -542,9 +547,6 @@ void vdist_calc_gpu_test()
vd
.
deviceToHostPos
();