Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Sbalzarini Lab
S
Software
P
Parallel Computing
OpenFPM
openfpm_pdata
Commits
c031bbc0
Commit
c031bbc0
authored
May 01, 2020
by
incardon
Browse files
Fixing SparseGrid
parent
91ac2cc0
Changes
11
Hide whitespace changes
Inline
Side-by-side
src/Amr/tests/amr_base_gpu_unit_tests.cu
View file @
c031bbc0
...
...
@@ -193,6 +193,10 @@ BOOST_AUTO_TEST_CASE( grid_dist_id_amr_gpu )
BOOST_AUTO_TEST_CASE
(
grid_dist_id_amr_gpu_link_test
)
{
// To uncomment (It does not work when we run the full suite for some weird reason)
#if 0
auto & v_cl = create_vcluster();
// Domain
...
...
@@ -312,10 +316,16 @@ BOOST_AUTO_TEST_CASE( grid_dist_id_amr_gpu_link_test )
}
/////////////////////////////////////////////////////////////
#endif
}
BOOST_AUTO_TEST_CASE
(
grid_dist_id_amr_gpu_link_test_more_dense
)
{
// To uncomment (It does not work when we run the full suite for some weird reason)
#if 0
auto & v_cl = create_vcluster();
// Domain
...
...
@@ -496,6 +506,8 @@ BOOST_AUTO_TEST_CASE( grid_dist_id_amr_gpu_link_test_more_dense )
BOOST_REQUIRE_EQUAL(tot_up_lk_23,236);
/////////////////////////////////////////////////////////////
#endif
}
BOOST_AUTO_TEST_SUITE_END
()
src/Grid/Iterators/grid_dist_id_iterator.hpp
View file @
c031bbc0
...
...
@@ -16,6 +16,167 @@
#include "VCluster/VCluster.hpp"
#include "util/GBoxes.hpp"
#ifdef __NVCC__
#include "SparseGridGpu/encap_num.hpp"
#endif
template
<
unsigned
int
dim
>
struct
launch_insert_sparse_lambda_call
{
template
<
typename
ec_type
,
typename
lambda_t
,
typename
coord_type
>
__device__
inline
static
void
call
(
ec_type
&
ec
,
lambda_t
f
,
coord_type
coord
)
{
printf
(
"Not implemented in this direction
\n
"
);
}
template
<
typename
ite_type
>
__device__
inline
static
bool
set_keys
(
grid_key_dx
<
3
,
int
>
&
key
,
grid_key_dx
<
3
,
int
>
&
keyg
,
ite_type
&
itg
)
{
return
false
;
}
};
template
<
>
struct
launch_insert_sparse_lambda_call
<
3
>
{
template
<
typename
grid_type
,
typename
lambda_t1
,
typename
lambda_t2
,
typename
itd_type
,
typename
coord_type
>
__device__
inline
static
void
call
(
grid_type
&
grid
,
lambda_t1
f1
,
lambda_t2
f2
,
unsigned
int
blockId
,
itd_type
itd
,
coord_type
&
key
,
coord_type
&
keyg
,
unsigned
int
offset
,
bool
&
is_block_empty
)
{
#ifdef __NVCC__
bool
is_active
=
f1
(
keyg
.
get
(
0
),
keyg
.
get
(
1
),
keyg
.
get
(
2
));
is_active
&=
key
.
get
(
0
)
>=
itd
.
start_base
.
get
(
0
)
&&
key
.
get
(
1
)
>=
itd
.
start_base
.
get
(
1
)
&&
key
.
get
(
2
)
>=
itd
.
start_base
.
get
(
2
);
if
(
is_active
==
true
)
{
is_block_empty
=
false
;}
__syncthreads
();
if
(
is_block_empty
==
false
)
{
auto
ec
=
grid
.
insertBlock
(
blockId
);
enc_num
<
decltype
(
grid
.
insertBlock
(
blockId
))
>
ecn
(
ec
,
offset
);
if
(
is_active
==
true
)
{
f2
(
ecn
,
keyg
.
get
(
0
),
keyg
.
get
(
1
),
keyg
.
get
(
2
));
ec
.
template
get
<
grid_type
::
pMask
>()[
offset
]
=
1
;
}
}
#endif
}
template
<
typename
ite_type
>
__device__
inline
static
bool
set_keys
(
grid_key_dx
<
3
,
int
>
&
key
,
grid_key_dx
<
3
,
int
>
&
keyg
,
ite_type
&
itg
)
{
#ifdef __NVCC__
key
.
set_d
(
0
,
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
+
itg
.
start
.
get
(
0
));
key
.
set_d
(
1
,
threadIdx
.
y
+
blockIdx
.
y
*
blockDim
.
y
+
itg
.
start
.
get
(
1
));
key
.
set_d
(
2
,
threadIdx
.
z
+
blockIdx
.
z
*
blockDim
.
z
+
itg
.
start
.
get
(
2
));
keyg
.
set_d
(
0
,
key
.
get
(
0
)
+
itg
.
origin
.
get
(
0
));
keyg
.
set_d
(
1
,
key
.
get
(
1
)
+
itg
.
origin
.
get
(
1
));
keyg
.
set_d
(
2
,
key
.
get
(
2
)
+
itg
.
origin
.
get
(
2
));
if
(
key
.
get
(
0
)
>
itg
.
stop
.
get
(
0
)
||
key
.
get
(
1
)
>
itg
.
stop
.
get
(
1
)
||
key
.
get
(
2
)
>
itg
.
stop
.
get
(
2
))
{
return
true
;}
#endif
return
false
;
}
};
template
<
>
struct
launch_insert_sparse_lambda_call
<
2
>
{
template
<
typename
grid_type
,
typename
lambda_t1
,
typename
lambda_t2
,
typename
itd_type
,
typename
coord_type
>
__device__
inline
static
void
call
(
grid_type
&
grid
,
lambda_t1
f1
,
lambda_t2
f2
,
unsigned
int
blockId
,
itd_type
itd
,
coord_type
&
key
,
coord_type
&
keyg
,
unsigned
int
offset
,
bool
&
is_block_empty
)
{
#ifdef __NVCC__
bool
is_active
=
f1
(
keyg
.
get
(
0
),
keyg
.
get
(
1
));
is_active
&=
key
.
get
(
0
)
>=
itd
.
start_base
.
get
(
0
)
&&
key
.
get
(
1
)
>=
itd
.
start_base
.
get
(
1
);
if
(
is_active
==
true
)
{
is_block_empty
=
false
;}
__syncthreads
();
if
(
is_block_empty
==
false
)
{
auto
ec
=
grid
.
insertBlock
(
blockId
);
enc_num
<
decltype
(
grid
.
insertBlock
(
blockId
))
>
ecn
(
ec
,
offset
);
if
(
is_active
==
true
)
{
f2
(
ecn
,
keyg
.
get
(
0
),
keyg
.
get
(
1
));
ec
.
template
get
<
grid_type
::
pMask
>()[
offset
]
=
1
;
}
}
#endif
}
template
<
typename
ite_type
>
__device__
inline
static
bool
set_keys
(
grid_key_dx
<
2
,
int
>
&
key
,
grid_key_dx
<
2
,
int
>
&
keyg
,
ite_type
&
itg
)
{
#ifdef __NVCC__
key
.
set_d
(
0
,
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
+
itg
.
start
.
get
(
0
));
key
.
set_d
(
1
,
threadIdx
.
y
+
blockIdx
.
y
*
blockDim
.
y
+
itg
.
start
.
get
(
1
));
keyg
.
set_d
(
0
,
key
.
get
(
0
)
+
itg
.
origin
.
get
(
0
));
keyg
.
set_d
(
1
,
key
.
get
(
1
)
+
itg
.
origin
.
get
(
1
));
if
(
key
.
get
(
0
)
>
itg
.
stop
.
get
(
0
)
||
key
.
get
(
1
)
>
itg
.
stop
.
get
(
1
))
{
return
true
;}
#endif
return
false
;
}
};
struct
launch_insert_sparse
{
template
<
typename
grid_type
,
typename
ite_type
,
typename
lambda_f1
,
typename
lambda_f2
>
__device__
void
operator
()(
grid_type
&
grid
,
ite_type
itg
,
bool
&
is_block_empty
,
lambda_f1
f1
,
lambda_f2
f2
)
{
#ifdef __NVCC__
grid_key_dx
<
grid_type
::
dims
,
int
>
key
;
grid_key_dx
<
grid_type
::
dims
,
int
>
keyg
;
if
(
launch_insert_sparse_lambda_call
<
grid_type
::
dims
>::
set_keys
(
key
,
keyg
,
itg
)
==
true
)
{
return
;}
if
(
threadIdx
.
x
==
0
&&
threadIdx
.
y
==
0
&&
threadIdx
.
z
==
0
)
{
is_block_empty
=
true
;}
grid
.
init
();
int
offset
=
0
;
grid_key_dx
<
grid_type
::
dims
,
int
>
blk
;
bool
out
=
grid
.
template
getInsertBlockOffset
<
ite_type
>(
itg
,
key
,
blk
,
offset
);
auto
blockId
=
grid
.
getBlockLinId
(
blk
);
launch_insert_sparse_lambda_call
<
grid_type
::
dims
>::
call
(
grid
,
f1
,
f2
,
blockId
,
itg
,
key
,
keyg
,
offset
,
is_block_empty
);
__syncthreads
();
grid
.
flush_block_insert
();
#endif
}
};
template
<
bool
is_free
>
struct
selvg
{
...
...
src/Grid/cuda/grid_dist_id_iterator_gpu.cuh
View file @
c031bbc0
...
...
@@ -14,6 +14,26 @@
#include "Grid/Iterators/grid_dist_id_iterator_util.hpp"
#include "Grid/cuda/grid_dist_id_kernels.cuh"
template
<
unsigned
int
impl
>
struct
launch_call_impl
{
template
<
typename
loc_grid_type
,
typename
ite_type
,
typename
itd_type
,
typename
functor_type
,
typename
...
argsT
>
inline
static
void
call
(
loc_grid_type
&
loc_grid
,
ite_type
&
ite
,
itd_type
&
itd
,
functor_type
functor
,
argsT
...
args
)
{
CUDA_LAUNCH
(
grid_apply_functor
,
ite
,
loc_grid
.
toKernel
(),
itd
,
functor
,
args
...
);
}
};
template
<
>
struct
launch_call_impl
<
1
>
{
template
<
typename
loc_grid_type
,
typename
ite_type
,
typename
itd_type
,
typename
functor_type
,
typename
...
argsT
>
inline
static
void
call
(
loc_grid_type
&
loc_grid
,
ite_type
&
ite
,
itd_type
&
itd
,
functor_type
functor
,
argsT
...
args
)
{
CUDA_LAUNCH
(
grid_apply_functor_shared_bool
,
ite
,
loc_grid
.
toKernel
(),
itd
,
functor
,
args
...
);
}
};
/*! \brief Given the decomposition it create an iterator
*
* Iterator across the local elements of the distributed grid
...
...
@@ -187,39 +207,63 @@ class grid_dist_id_iterator_gpu
* \param argsType arguments
*
*/
template
<
typename
func_t
,
typename
...
argsType
>
template
<
unsigned
int
impl
=
0
,
typename
func_t
,
typename
...
argsType
>
inline
void
launch
(
func_t
functor
,
argsType
...
args
)
{
for
(
g_c
=
0
;
g_c
<
gdb_ext
.
size
()
;
g_c
++
)
{
grid_key_dx
<
Decomposition
::
dims
,
int
>
start
;
grid_key_dx
<
Decomposition
::
dims
,
int
>
stop
;
ite_gpu_dist
<
Decomposition
::
dims
>
itd
;
ite_gpu
<
Decomposition
::
dims
>
ite
;
// intersect
Box
<
Decomposition
::
dims
,
int
>
range_box
(
start
,
stop
);
Box
<
Decomposition
::
dims
,
int
>
kbox
;
range_box
-=
gdb_ext
.
get
(
g_c
).
origin
;
range_box
.
Intersect
(
gdb_ext
.
get
(
g_c
).
Dbox
,
kbox
);
auto
&
lg
=
loc_grids
.
get
(
g_c
);
for
(
int
i
=
0
;
i
<
Decomposition
::
dims
;
i
++
)
{
start
.
set_d
(
i
,(
gdb_ext
.
get
(
g_c
).
D
box
.
getKP1
().
get
(
i
)
/
lg
.
getBlockEdgeSize
())
*
lg
.
getBlockEdgeSize
()
);
stop
.
set_d
(
i
,
gdb_ext
.
get
(
g_c
).
D
box
.
getKP2
().
get
(
i
));
ite
.
start
.
set_d
(
i
,(
k
box
.
getKP1
().
get
(
i
)
/
lg
.
getBlockEdgeSize
())
*
lg
.
getBlockEdgeSize
()
);
ite
.
stop
.
set_d
(
i
,
k
box
.
getKP2
().
get
(
i
));
}
auto
ite
=
loc_grids
.
get
(
g_c
).
getGridGPUIterator
(
start
,
stop
);
ite_gpu_dist
<
Decomposition
::
dims
>
itd
=
ite
;
// the thread extensions are
for
(
int
i
=
0
;
i
<
Decomposition
::
dims
;
i
++
)
{
itd
.
origin
.
set_d
(
i
,
gdb_ext
.
get
(
g_c
).
origin
.
get
(
i
));
itd
.
start_base
.
set_d
(
i
,
gdb_ext
.
get
(
g_c
).
Dbox
.
getKP1
().
get
(
i
)
%
lg
.
getBlockEdgeSize
());
itd
.
origin
.
set_d
(
i
,
gdb_ext
.
get
(
g_c
).
origin
.
get
(
i
)
+
ite
.
start
.
get
(
i
));
itd
.
start_base
.
set_d
(
i
,
kbox
.
getKP1
().
get
(
i
)
%
lg
.
getBlockEdgeSize
()
+
ite
.
start
.
get
(
i
));
}
ite
.
thr
.
x
=
lg
.
getBlockEdgeSize
();
ite
.
wthr
.
x
=
(
ite
.
stop
.
get
(
0
)
-
ite
.
start
.
get
(
0
)
+
1
)
/
lg
.
getBlockEdgeSize
()
+
((
ite
.
stop
.
get
(
0
)
-
ite
.
start
.
get
(
0
)
+
1
)
%
lg
.
getBlockEdgeSize
()
!=
0
);
ite
.
thr
.
y
=
lg
.
getBlockEdgeSize
();
ite
.
wthr
.
y
=
(
ite
.
stop
.
get
(
1
)
-
ite
.
start
.
get
(
1
)
+
1
)
/
lg
.
getBlockEdgeSize
()
+
((
ite
.
stop
.
get
(
1
)
-
ite
.
start
.
get
(
1
)
+
1
)
%
lg
.
getBlockEdgeSize
()
!=
0
);
if
(
Decomposition
::
dims
>
2
)
{
ite
.
thr
.
z
=
lg
.
getBlockEdgeSize
();
ite
.
wthr
.
z
=
(
ite
.
stop
.
get
(
2
)
-
ite
.
start
.
get
(
2
)
+
1
)
/
lg
.
getBlockEdgeSize
()
+
((
ite
.
stop
.
get
(
2
)
-
ite
.
start
.
get
(
2
)
+
1
)
%
lg
.
getBlockEdgeSize
()
!=
0
);
}
itd
.
wthr
=
ite
.
wthr
;
itd
.
thr
=
ite
.
thr
;
itd
.
start
=
ite
.
start
;
itd
.
stop
=
ite
.
stop
;
if
(
nSlot
!=
-
1
)
{
loc_grids
.
get
(
g_c
).
setGPUInsertBuffer
((
unsigned
int
)
ite
.
nblocks
(),(
unsigned
int
)
nSlot
);
}
if
(
ite
.
nblocks
()
!=
0
)
{
CUDA_LAUNCH
(
grid_apply_functor
,
ite
,
loc_grids
.
get
(
g_c
).
toKernel
(),
itd
,
functor
,
args
...
);}
{
launch_call_impl
<
impl
>::
call
(
loc_grids
.
get
(
g_c
),
ite
,
itd
,
functor
,
args
...);
}
}
}
...
...
src/Grid/cuda/grid_dist_id_kernels.cuh
View file @
c031bbc0
...
...
@@ -23,6 +23,9 @@ struct ite_gpu_dist
grid_key_dx
<
dim
,
int
>
origin
;
ite_gpu_dist
()
{}
ite_gpu_dist
(
ite_gpu
<
dim
>
&
ite
)
{
thr
=
ite
.
thr
;
...
...
@@ -71,6 +74,12 @@ __global__ void grid_apply_functor(grid_type g, ite_gpu_type ite, func_t f, args
f
(
g
,
ite
,
args
...);
}
template
<
typename
grid_type
,
typename
ite_gpu_type
,
typename
func_t
,
typename
...
args_t
>
__global__
void
grid_apply_functor_shared_bool
(
grid_type
g
,
ite_gpu_type
ite
,
func_t
f
,
args_t
...
args
)
{
__shared__
bool
is_empty_block
;
f
(
g
,
ite
,
is_empty_block
,
args
...);
}
#endif
/* GRID_DIST_ID_KERNELS_CUH_ */
src/Grid/grid_dist_id.hpp
View file @
c031bbc0
...
...
@@ -7,6 +7,9 @@
#include "VCluster/VCluster.hpp"
#include "Space/SpaceBox.hpp"
#include "util/mathutil.hpp"
#ifdef __NVCC__
#include "SparseGridGpu/SparseGridGpu.hpp"
#endif
#include "Iterators/grid_dist_id_iterator_dec.hpp"
#include "Iterators/grid_dist_id_iterator.hpp"
#include "Iterators/grid_dist_id_iterator_sub.hpp"
...
...
@@ -24,7 +27,6 @@
#include "HDF5_wr/HDF5_wr.hpp"
#include "SparseGrid/SparseGrid.hpp"
#ifdef __NVCC__
#include "SparseGridGpu/SparseGridGpu.hpp"
#include "cuda/grid_dist_id_kernels.cuh"
#include "Grid/cuda/grid_dist_id_iterator_gpu.cuh"
#endif
...
...
@@ -1741,6 +1743,24 @@ public:
#ifdef __NVCC__
template
<
typename
lambda_t1
,
typename
lambda_t2
>
void
addPoints
(
lambda_t1
f1
,
lambda_t2
f2
)
{
auto
it
=
getGridIteratorGPU
();
it
.
setGPUInsertBuffer
(
1
);
it
.
template
launch
<
1
>(
launch_insert_sparse
(),
f1
,
f2
);
}
template
<
typename
lambda_t1
,
typename
lambda_t2
>
void
addPoints
(
grid_key_dx
<
dim
>
k1
,
grid_key_dx
<
dim
>
k2
,
lambda_t1
f1
,
lambda_t2
f2
)
{
auto
it
=
getGridIteratorGPU
(
k1
,
k2
);
it
.
setGPUInsertBuffer
(
1
);
it
.
template
launch
<
1
>(
launch_insert_sparse
(),
f1
,
f2
);
}
/*! /brief Get a grid Iterator in GPU
*
* In case of dense grid getGridIterator is equivalent to getDomainIteratorGPU
...
...
@@ -2570,7 +2590,7 @@ public:
if
(
overlap
==
true
)
{
loc_grid
.
get
(
i
).
template
conv2
<
prop_src1
,
prop_src2
,
prop_dst1
,
prop_dst2
,
stencil_size
>(
stencil
,
inte
.
getKP1
(),
inte
.
getKP2
(),
func
,
args
...);
loc_grid
.
get
(
i
).
template
conv2
<
prop_src1
,
prop_src2
,
prop_dst1
,
prop_dst2
,
stencil_size
>(
stencil
,
inte
.
getKP1
(),
inte
.
getKP2
(),
func
,
create_vcluster
().
rank
(),
args
...);
}
}
}
...
...
src/Grid/grid_dist_id_comm.hpp
View file @
c031bbc0
...
...
@@ -202,7 +202,7 @@ class grid_dist_id_comm
bool
use_bx_def
)
{
for
(
size_t
i
=
0
;
i
<
loc_grid
.
size
()
;
i
++
)
{
loc_grid
.
get
(
i
).
pack
Reset
();}
{
loc_grid
.
get
(
i
).
copyRemove
Reset
();}
grid_key_dx
<
dim
>
cnt
[
1
];
cnt
[
0
].
zero
();
...
...
@@ -259,7 +259,12 @@ class grid_dist_id_comm
for
(
size_t
i
=
0
;
i
<
loc_grid
.
size
()
;
i
++
)
{
loc_grid
.
get
(
i
).
template
removeCopyToFinalize
<
prp
...>(
v_cl
.
getmgpuContext
());
loc_grid
.
get
(
i
).
template
removeCopyToFinalize
<
prp
...>(
v_cl
.
getmgpuContext
(),
rem_copy_opt
::
PHASE1
);
}
for
(
size_t
i
=
0
;
i
<
loc_grid
.
size
()
;
i
++
)
{
loc_grid
.
get
(
i
).
template
removeCopyToFinalize
<
prp
...>(
v_cl
.
getmgpuContext
(),
rem_copy_opt
::
PHASE2
);
}
}
...
...
src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
View file @
c031bbc0
...
...
@@ -330,14 +330,21 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_conv2_test )
float
c
=
5.0
;
auto
it
=
gdist
.
getGridIterator
(
box
.
getKP1
(),
box
.
getKP2
());
gdist
.
template
iterateGridGPU
<
insert_kernel2D
<
0
>
>
(
it
,
c
);
gdist
.
template
flush
<
smax_
<
0
>
>
(
flush_type
::
FLUSH_ON_DEVICE
);
typedef
typename
GetAddBlockType
<
decltype
(
gdist
)
>::
type
InsertBlockT
;
gdist
.
addPoints
(
box
.
getKP1
(),
box
.
getKP2
(),
[]
__device__
(
int
i
,
int
j
)
{
return
true
;
},
[
c
]
__device__
(
InsertBlockT
&
data
,
int
i
,
int
j
)
{
data
.
template
get
<
0
>()
=
c
+
i
+
j
;
data
.
template
get
<
1
>()
=
c
+
1000
+
i
+
j
;
}
);
auto
it2
=
gdist
.
getGridIterator
(
box
.
getKP1
(),
box
.
getKP2
());
gdist
.
template
iterateGridGPU
<
insert_kernel2D
<
1
>
>
(
it2
,
c
+
1000
);
gdist
.
template
flush
<
smax_
<
0
>,
smax_
<
1
>>
(
flush_type
::
FLUSH_ON_DEVICE
);
gdist
.
template
ghost_get
<
0
,
1
>(
RUN_ON_DEVICE
);
// Now run the convolution
...
...
@@ -369,17 +376,110 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_conv2_test )
float
sub1
=
gdist
.
template
get
<
2
>(
p
);
float
sub2
=
gdist
.
template
get
<
3
>(
p
);
if
(
sub1
!=
2
.0
||
sub2
!=
4.0
)
if
(
sub1
!=
4
.0
||
sub2
!=
4.0
)
{
std
::
cout
<<
sub1
<<
" "
<<
sub2
<<
std
::
endl
;
std
::
cout
<<
gdist
.
template
get
<
0
>(
p_xp1
)
<<
" "
<<
gdist
.
template
get
<
0
>(
p_xm1
)
<<
std
::
endl
;
std
::
cout
<<
gdist
.
template
get
<
1
>(
p_xp1
)
<<
" "
<<
gdist
.
template
get
<
1
>(
p_xm1
)
<<
std
::
endl
;
match
=
false
;
break
;
}
++
it3
;
}
BOOST_REQUIRE_EQUAL
(
match
,
true
);
}
BOOST_AUTO_TEST_CASE
(
sgrid_gpu_test_conv2_test_3d
)
{
size_t
sz
[
3
]
=
{
60
,
60
,
60
};
periodicity
<
3
>
bc
=
{
PERIODIC
,
PERIODIC
,
PERIODIC
};
Ghost
<
3
,
long
int
>
g
(
1
);
Box
<
3
,
float
>
domain
({
0.0
,
0.0
,
0.0
},{
1.0
,
1.0
,
1.0
});
sgrid_dist_id_gpu
<
3
,
float
,
aggregate
<
float
,
float
,
float
,
float
>>
gdist
(
sz
,
domain
,
g
,
bc
);
gdist
.
template
setBackgroundValue
<
0
>(
666
);
gdist
.
template
setBackgroundValue
<
1
>(
666
);
gdist
.
template
setBackgroundValue
<
2
>(
666
);
gdist
.
template
setBackgroundValue
<
3
>(
666
);
/////// GPU insert + flush
Box
<
3
,
size_t
>
box
({
1
,
1
,
1
},{
sz
[
0
]
-
1
,
sz
[
1
]
-
1
,
sz
[
2
]
-
1
});
/////// GPU Run kernel
float
c
=
5.0
;
typedef
typename
GetAddBlockType
<
decltype
(
gdist
)
>::
type
InsertBlockT
;
gdist
.
addPoints
(
box
.
getKP1
(),
box
.
getKP2
(),
[]
__device__
(
int
i
,
int
j
,
int
k
)
{
return
true
;
},
[
c
]
__device__
(
InsertBlockT
&
data
,
int
i
,
int
j
,
int
k
)
{
data
.
template
get
<
0
>()
=
c
+
i
+
j
+
k
;
data
.
template
get
<
1
>()
=
c
+
1000
+
i
+
j
+
k
;
}
);
gdist
.
template
flush
<
smax_
<
0
>,
smax_
<
1
>>
(
flush_type
::
FLUSH_ON_DEVICE
);
gdist
.
template
ghost_get
<
0
,
1
>(
RUN_ON_DEVICE
);
for
(
int
i
=
0
;
i
<
10
;
i
++
)
{
gdist
.
template
ghost_get
<
0
,
1
>(
RUN_ON_DEVICE
);
}
// Now run the convolution
typedef
typename
GetCpBlockType
<
decltype
(
gdist
),
0
,
1
>::
type
CpBlockType
;
gdist
.
template
conv2
<
0
,
1
,
2
,
3
,
1
>({
2
,
2
,
2
},{(
int
)
sz
[
0
]
-
2
,(
int
)
sz
[
1
]
-
2
,(
int
)
sz
[
2
]
-
2
},[]
__device__
(
float
&
u_out
,
float
&
v_out
,
CpBlockType
&
u
,
CpBlockType
&
v
,
int
i
,
int
j
,
int
k
){
u_out
=
u
(
i
+
1
,
j
,
k
)
-
u
(
i
-
1
,
j
,
k
)
+
u
(
i
,
j
+
1
,
k
)
-
u
(
i
,
j
-
1
,
k
)
+
u
(
i
,
j
,
k
+
1
)
-
u
(
i
,
j
,
k
-
1
);
v_out
=
v
(
i
+
1
,
j
,
k
)
-
v
(
i
-
1
,
j
,
k
)
+
v
(
i
,
j
+
1
,
k
)
-
v
(
i
,
j
-
1
,
k
)
+
v
(
i
,
j
,
k
+
1
)
-
v
(
i
,
j
,
k
-
1
);
});
gdist
.
deviceToHost
<
0
,
1
,
2
,
3
>
();
// Now we check that ghost is correct
auto
it3
=
gdist
.
getSubDomainIterator
({
2
,
2
,
2
},{(
int
)
sz
[
0
]
-
2
,(
int
)
sz
[
1
]
-
2
,(
int
)
sz
[
2
]
-
2
});
bool
match
=
true
;
while
(
it3
.
isNext
())
{
auto
p
=
it3
.
get
();
auto
p_xp1
=
p
.
move
(
0
,
1
);
auto
p_xm1
=
p
.
move
(
0
,
-
1
);
auto
p_yp1
=
p
.
move
(
1
,
1
);
auto
p_ym1
=
p
.
move
(
1
,
-
1
);
auto
p_zp1
=
p
.
move
(
2
,
1
);
auto
p_zm1
=
p
.
move
(
2
,
-
1
);
float
sub1
=
gdist
.
template
get
<
2
>(
p
);
float
sub2
=
gdist
.
template
get
<
3
>(
p
);
if
(
sub1
!=
6.0
||
sub2
!=
6.0
)
{
std
::
cout
<<
sub1
<<
" "
<<
sub2
<<
std
::
endl
;
std
::
cout
<<
gdist
.
template
get
<
0
>(
p_xp1
)
<<
" "
<<
gdist
.
template
get
<
0
>(
p_xm1
)
<<
std
::
endl
;
std
::
cout
<<
gdist
.
template
get
<
1
>(
p_xp1
)
<<
" "
<<
gdist
.
template
get
<
1
>(
p_xm1
)
<<
std
::
endl
;
match
=
false
;
break
;
}
++
it3
;
}
BOOST_REQUIRE_EQUAL
(
match
,
true
);
}
...
...
src/Grid/tests/sgrid_dist_id_unit_tests.cpp
View file @
c031bbc0
...
...
@@ -454,15 +454,15 @@ BOOST_AUTO_TEST_CASE( sparse_grid_fast_stencil_vectorized_simplified_conv2)
Vc
::
double_v
(
&
u
)[
7
],
Vc
::
double_v
(
&
v
)[
7
],
unsigned
char
*
mask
){
u_out
=
u
[
0
]
+
uFactor
*
(
u
[
1
]
+
u
[
2
]
+
u
[
3
]
+
u
[
4
]
+
u
[
5
]
+
u
[
6
]
-
6.0
*
u
[
0
])
-
deltaT
*
u
[
0
]
*
v
[
0
]
*
v
[
0
]
-
deltaT
*
F
*
(
u
[
0
]
-
1.0
);
v_out
=
v
[
0
]
+
vFactor
*
(
v
[
1
]
+
v
[
2
]
+
v
[
3
]
+
v
[
4
]
+
v
[
5
]
+
v
[
6
]
-
6.0
*
v
[
0
])
+
deltaT
*
u
[
0
]
*
v
[
0
]
*
v
[
0
]
-
deltaT
*
(
F
+
K
)
*
v
[
0
];
u_out
=
u
[
0
]
+
uFactor
*
(
u
[
1
]
+
u
[
2
]
+
u
[
3
]
+
u
[
4
]
+
u
[
5
]
+
u
[
6
]
-
6.0
*
u
[
0
])
-
deltaT
*
u
[
0
]
*
v
[
0
]
*
v
[
0
]
-
deltaT
*
F
*
(
u
[
0
]
-
1.0
);
v_out
=
v
[
0
]
+
vFactor
*
(
v
[
1
]
+
v
[
2
]
+
v
[
3
]
+
v
[
4
]
+
v
[
5
]
+
v
[
6
]
-
6.0
*
v
[
0
])
+
deltaT
*
u
[
0
]
*
v
[
0
]
*
v
[
0
]
-
deltaT
*
(
F
+
K
)
*
v
[
0
];
};
grid
.
conv2
<
U
,
V
,
U_next
,
V_next
,
1
>
(
stencil
,{
0
,
0
,
0
},{(
long
int
)
sz
[
0
]
-
1
,(
long
int
)
sz
[
1
]
-
1
,(
long
int
)
sz
[
2
]
-
1
},
func
);
...
...
@@ -501,7 +501,19 @@ BOOST_AUTO_TEST_CASE( sparse_grid_fast_stencil_vectorized_simplified_conv2)
-
deltaT
*
grid
.
get
<
U
>
(
Cp
)
*
grid
.
get
<
V
>
(
Cp
)
*
grid
.
get
<
V
>
(
Cp
)
+
-
deltaT
*
F
*
(
grid
.
get
<
U
>
(
Cp
)
-
1.0
)
-
grid
.
get
<
U_next
>
(
Cp
))
>
0.000000001
)
{
std
::
cout
<<
"U: "
<<
grid
.
get
<
U
>
(
Cp
)
+
uFactor
*
(