Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Sbalzarini Lab
S
Software
P
Parallel Computing
OpenFPM
openfpm_devices
Commits
34d81e99
Commit
34d81e99
authored
Dec 31, 2020
by
incardon
Browse files
Adding CUDA_ON_CPU
parent
9439b214
Pipeline
#2513
failed with stages
in 6 seconds
Changes
17
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
CMakeLists.txt
View file @
34d81e99
...
...
@@ -6,7 +6,7 @@ set(BOOST_INCLUDE ${Boost_INCLUDE_DIR} CACHE PATH "Include directory for BOOST")
set
(
CMAKE_CXX_STANDARD 14
)
set
(
CMAKE_CUDA_STANDARD 14
)
find_package
(
Boost 1.68.0 REQUIRED COMPONENTS unit_test_framework iostreams program_options
fiber
)
find_package
(
Boost 1.68.0 REQUIRED COMPONENTS unit_test_framework iostreams program_options
OPTIONAL_COMPONENTS fiber context
)
set
(
CUDA_ON_CPU CACHE BOOL
"Make Cuda work on heap"
)
...
...
@@ -34,6 +34,10 @@ if (BOOST_FOUND)
set
(
DEFINE_HAVE_BOOST_UNIT_TEST_FRAMEWORK
"#define HAVE_BOOST_UNIT_TEST_FRAMEWORK"
)
endif
()
if
(
ALPAKA_ROOT
)
set
(
DEFINE_HAVE_ALPAKA
"#define HAVE_ALPAKA"
)
endif
()
configure_file
(
${
CMAKE_CURRENT_SOURCE_DIR
}
/src/config/config_cmake.h.in
${
CMAKE_CURRENT_SOURCE_DIR
}
/src/config/config.h
)
add_subdirectory
(
src
)
...
...
configure
View file @
34d81e99
#!/bin/sh
#!/bin/
ba
sh
# configure script
#
#
# Because we moved to cmake this script emulate the configure script from autotools
conf_options
=
ld_lib_pathopt
=
# Avoid depending upon Character Ranges.
as_cr_letters
=
'abcdefghijklmnopqrstuvwxyz'
...
...
@@ -99,12 +100,15 @@ enable_debug
with_metis
with_hdf5
with_libhilbert
enable_cuda_on_cpu
enable_scan_coverty
enable_test_performance
enable_test_coverage
with_parmetis
enable_se_class1
enable_se_class2
enable_se_class3
with_alpaka
with_action_on_error
with_boost
with_boost_libdir
...
...
@@ -116,7 +120,9 @@ with_lapack
with_suitesparse
with_petsc
with_eigen
with_vcdevel
enable_gpu
enable_asan
'
rm
-rf
build
...
...
@@ -124,6 +130,8 @@ if [ ! -d "build" ]; then
mkdir
build
fi
echo
"/usr/local"
>
install_dir
##### Go over all options
for
ac_option
do
...
...
@@ -219,12 +227,24 @@ do
se_class1
)
conf_options
=
"
$conf_options
-DSE_CLASS1=ON"
;;
se_class2
)
conf_options
=
"
$conf_options
-DSE_CLASS2=ON"
;;
se_class3
)
conf_options
=
"
$conf_options
-DSE_CLASS3=ON"
;;
test_coverage
)
conf_options
=
"
$conf_options
-DTEST_COVERAGE=ON"
;;
scan_coverty
)
conf_options
=
"
$conf_options
-DSCAN_COVERTY=ON"
;;
cuda_on_cpu
)
conf_options
=
"
$conf_options
-DCUDA_ON_CPU=ON"
;;
test_performance
)
conf_options
=
"
$conf_options
-DTEST_PERFORMANCE=ON"
;;
gpu
)
if
[
x
"
$CXX
"
==
x
""
]
;
then
conf_options
=
"
$conf_options
"
...
...
@@ -233,6 +253,9 @@ do
fi
conf_options
=
"
$conf_options
-DENABLE_GPU=ON"
;;
asan
)
conf_options
=
"
$conf_options
-DENABLE_ASAN=ON"
;;
*
)
ac_unrecognized_opts
=
"
$ac_unrecognized_opts$ac_unrecognized_sep
--enable-
$ac_useropt_orig
"
ac_unrecognized_sep
=
', '
;;
...
...
@@ -458,16 +481,26 @@ do
;;
suitesparse
)
conf_options
=
"
$conf_options
-DSUITESPARSE_ROOT=
$ac_optarg
"
ld_lib_pathopt
=
$ac_optarg
/lib
;;
eigen
)
conf_options
=
"
$conf_options
-DEIGEN3_ROOT=
$ac_optarg
"
;;
boost
)
conf_options
=
"
$conf_options
-DBOOST_ROOT=
$ac_optarg
"
conf_options
=
"
$conf_options
-DBOOST_ROOT=
$ac_optarg
-DBoost_NO_BOOST_CMAKE=ON"
;;
action_on_error
)
conf_options
=
"
$conf_options
-DACTION_ON_ERROR=
$ac_optarg
"
;;
mpivendor
)
conf_options
=
"
$conf_options
-DMPI_VENDOR=
$ac_optarg
"
;;
vcdevel
)
conf_options
=
"
$conf_options
-DVc_ROOT=
$ac_optarg
"
;;
alpaka
)
conf_options
=
"
$conf_options
-DALPAKA_ROOT=
$ac_optarg
"
;;
*
)
ac_unrecognized_opts
=
"
$ac_unrecognized_opts$ac_unrecognized_sep
--with-
$ac_useropt_orig
"
ac_unrecognized_sep
=
', '
;;
esac
...
...
@@ -541,7 +574,7 @@ cd build
## remove enerything
echo
"Calling cmake ../.
$conf_options
"
rm
../error_code
cmake ../.
$conf_options
DYLD_LIBRARY_PATH
=
$ld_lib_pathopt
cmake ../.
$conf_options
if
[
$?
!=
0
]
;
then
#ok something went wrong the install script analyze the return code to potentially fix the problem automatically
# Read the error code and exit with that
...
...
src/CMakeLists.txt
View file @
34d81e99
...
...
@@ -12,10 +12,10 @@ if (CUDA_ON_CPU)
endif
()
endif
()
add_executable
(
mem main.cpp memory/HeapMemory.cpp util/cudify_vars.cpp util/cudify_unit_test.cpp
${
CUDA_SOURCES
}
)
add_executable
(
mem main.cpp memory/HeapMemory.cpp util/cudify
/cudify
_vars.cpp util/cudify
/cudify
_unit_test.cpp
${
CUDA_SOURCES
}
)
add_library
(
ofpmmemory STATIC memory/HeapMemory.cpp util/cudify_vars.cpp memory/PtrMemory.cpp
${
CUDA_SOURCES
}
)
add_library
(
ofpmmemory_dl SHARED memory/HeapMemory.cpp util/cudify_vars.cpp memory/PtrMemory.cpp
${
CUDA_SOURCES
}
)
add_library
(
ofpmmemory STATIC memory/HeapMemory.cpp util/cudify
/cudify
_vars.cpp memory/PtrMemory.cpp
${
CUDA_SOURCES
}
)
add_library
(
ofpmmemory_dl SHARED memory/HeapMemory.cpp util/cudify
/cudify
_vars.cpp memory/PtrMemory.cpp
${
CUDA_SOURCES
}
)
if
(
CMAKE_COMPILER_IS_GNUCC
)
target_compile_options
(
mem PRIVATE
"-Wno-deprecated-declarations"
)
...
...
@@ -37,7 +37,9 @@ target_include_directories (mem PUBLIC ${CUDA_INCLUDE_DIRS})
target_include_directories
(
mem PUBLIC
${
CMAKE_CURRENT_SOURCE_DIR
}
)
target_include_directories
(
mem PUBLIC
${
CMAKE_CURRENT_SOURCE_DIR
}
/config
)
target_include_directories
(
mem PUBLIC
${
Boost_INCLUDE_DIRS
}
)
target_include_directories
(
mem PUBLIC
${
ALPAKA_ROOT
}
/include
)
if
(
ALPAKA_ROOT
)
target_include_directories
(
mem PUBLIC
${
ALPAKA_ROOT
}
/include
)
endif
()
if
(
CUDA_FOUND
)
target_include_directories
(
mem PUBLIC
${
CUDA_INCLUDE_DIRS
}
)
endif
()
...
...
@@ -79,26 +81,3 @@ install(FILES ptr_info.hpp
DESTINATION openfpm_devices/include
)
# Request that particles be built with -std=c++11
# As this is a public compile feature anything that links to particles
# will also build with -std=c++11
target_compile_features
(
mem PUBLIC cxx_std_11
)
#if(BUILD_TESTING)
# add_executable(particle_test test.cu)
# set_target_properties(particle_test PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
# target_link_libraries(particle_test PRIVATE particles)
# add_test(NAME particles_10k COMMAND particle_test 10000 )
# add_test(NAME particles_256k COMMAND particle_test 256000 )
# if(APPLE)
# We need to add the default path to the driver (libcuda.dylib) as an rpath,
# so that the static cuda runtime can find it at runtime.
# set_property(TARGET particle_test PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
# endif()
#endif()
src/config/config_cmake.h.in
View file @
34d81e99
...
...
@@ -107,12 +107,28 @@ ${DEFINE_HAVE_UNISTD_H}
/* Test TinyObjLoader */
${DEFINE_HAVE_TINYOBJLOADER}
/* ACTION to take in case of error */
${DEFINE_ACTION_ON_ERROR}
/* Define to the sub-directory where libtool stores uninstalled libraries. */
#define LT_OBJDIR ".libs/"
/* NVCC compiling */
${DEFINE_NVCC} /**/
/* Define if we have Alpaka */
${DEFINE_HAVE_ALPAKA}
/* Additional alpaka definitions */
${ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE_DEF}
${ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE_DEF}
${ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE_DEF}
${ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE_DEF}
${ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE_DEF}
${ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE_DEF}
${ALPAKA_ACC_CPU_BT_OMP4_ENABLE_DEF}
/* Name of package */
#define PACKAGE "openfpm_pdata"
...
...
@@ -140,6 +156,9 @@ ${DEFINE_PERFORMANCE_TEST}
/* Security enhancement class 1 */
${DEFINE_SE_CLASS1}
/* Security enhancement class 2 */
${DEFINE_SE_CLASS2}
/* Security enhancement class 3 */
${DEFINE_SE_CLASS3}
...
...
@@ -157,3 +176,5 @@ ${DEFINE_TEST_COVERAGE_MODE}
/* Version number of package */
#define VERSION "1.0.0"
#define OPENFPM_PDATA
src/memory/CudaMemory.cu
View file @
34d81e99
...
...
@@ -42,7 +42,8 @@ bool CudaMemory::allocate(size_t sz)
#if defined(CUDA_GPU) && !defined(CUDA_ON_CPU)
CUDA_SAFE_CALL
(
cudaMalloc
(
&
dm
,
sz
));
#else
dm
=
new
unsigned
char
[
sz
];
if
(
sz
!=
0
)
{
dm
=
new
unsigned
char
[
sz
];}
#endif
}
else
...
...
src/memory/CudaMemory.cuh
View file @
34d81e99
...
...
@@ -27,11 +27,7 @@
#ifndef CUDA_MEMORY_CUH_
#define CUDA_MEMORY_CUH_
#if __CUDACC_VER_MAJOR__ < 9
#define EXCEPT_MC
#else
#define EXCEPT_MC noexcept
#endif
#include "config.h"
#include "memory.hpp"
...
...
src/util/cuda_kernel_error_checker.hpp
View file @
34d81e99
...
...
@@ -8,6 +8,7 @@
#ifndef SE_CLASS1_CUDA_HPP_
#define SE_CLASS1_CUDA_HPP_
#include "util/se_util.hpp"
#include <type_traits>
#include <string>
...
...
src/util/cuda_launch.hpp
View file @
34d81e99
...
...
@@ -10,10 +10,6 @@
#include "cuda_kernel_error_checker.hpp"
#ifdef CUDA_ON_CPU
#include "cudify.hpp"
#endif
#if defined(CUDA_GPU) && !defined(CUDA_ON_CPU)
#if defined(SE_CLASS1) || defined(CUDA_CHECK_LAUNCH)
...
...
@@ -114,7 +110,7 @@
#else
#include "util/cudify.hpp"
#include "util/cudify
/cudify
.hpp"
#endif
...
...
src/util/cuda_util.hpp
View file @
34d81e99
...
...
@@ -29,27 +29,29 @@
#ifndef __host__
#define __host__
#define __device__
#define __shared__
#define __global__
#endif
#ifdef CUDA_ON_CPU
#define CUDA_SAFE(cuda_call) \
cuda_call;
#define CUDA_SAFE(cuda_call) \
cuda_call;
#define __shared__ static
#else
#define CUDA_SAFE(cuda_call) \
cuda_call; \
{\
cudaError_t e = cudaPeekAtLastError();\
if (e != cudaSuccess)\
#define CUDA_SAFE(cuda_call) \
cuda_call; \
{\
std::string error = cudaGetErrorString(e);\
std::cout << "Cuda Error in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
}\
}
cudaError_t e = cudaPeekAtLastError();\
if (e != cudaSuccess)\
{\
std::string error = cudaGetErrorString(e);\
std::cout << "Cuda Error in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
}\
}
#define __shared__
#endif
...
...
src/util/cudify/cudify.hpp
0 → 100644
View file @
34d81e99
#ifndef CUDIFY_HPP_
#define CUDIFY_HPP_
#ifdef CUDIFY_USE_ALPAKA
#else
#include "cudify_sequencial.hpp"
#endif
#endif
\ No newline at end of file
src/util/cudify.hpp
→
src/util/cudify
/cudify_alpaka
.hpp
View file @
34d81e99
#ifndef CUDIFY_HPP_
#define CUDIFY_HPP_
#ifndef CUDIFY_ALPAKA_HPP_
#define CUDIFY_ALPAKA_HPP_
/*! \brief This file wrap CUDA functions and some CUB and MGPU function into CPU
*
* This file use ALPAKA as underline accelerator implementation.
*
* At the moment performances make it useless with mostly any available accelerator.
*
*/
#include "cudify_hardware.hpp"
#include "cuda_util.hpp"
#include "boost/bind.hpp"
#include <type_traits>
extern
alpa_base_structs
__alpa_base__
;
...
...
@@ -35,6 +44,31 @@ static void cudaDeviceSynchronize()
alpaka
::
wait
(
*
__alpa_base__
.
queue
);
}
static
void
cudaMemcpyFromSymbol
(
void
*
dev_mem
,
const
unsigned
char
*
global_cuda_error_array
,
size_t
sz
)
{
memcpy
(
dev_mem
,
global_cuda_error_array
,
sz
);
}
/**
* CUDA memory copy types
*/
enum
cudaMemcpyKind
{
cudaMemcpyHostToHost
=
0
,
/**< Host -> Host */
cudaMemcpyHostToDevice
=
1
,
/**< Host -> Device */
cudaMemcpyDeviceToHost
=
2
,
/**< Device -> Host */
cudaMemcpyDeviceToDevice
=
3
,
/**< Device -> Device */
cudaMemcpyDefault
=
4
/**< Direction of the transfer is inferred from the pointer values. Requires unified virtual addressing */
};
extern
int
vct_atomic_add
;
extern
int
vct_atomic_rem
;
static
void
cudaMemcpyToSymbol
(
unsigned
char
*
global_cuda_error_array
,
const
void
*
mem
,
size_t
sz
,
int
offset
,
int
unused
)
{
memcpy
(
global_cuda_error_array
+
offset
,
mem
,
sz
);
}
namespace
cub
{
template
<
typename
T
,
unsigned
int
dim
>
...
...
@@ -60,67 +94,27 @@ namespace cub
__syncthreads
();
T
cur
=
tmp
[
0
];
tmp
[
0
]
=
0
;
for
(
int
i
=
1
;
i
<
dim
;
i
++
)
if
(
threadIdx
.
x
==
0
)
{
tmp
[
i
]
=
tmp
[
i
-
1
]
+
cur
;
T
prec
=
tmp
[
0
];
tmp
[
0
]
=
0
;
for
(
int
i
=
1
;
i
<
dim
;
i
++
)
{
auto
next
=
tmp
[
i
-
1
]
+
prec
;
prec
=
tmp
[
i
];
tmp
[
i
]
=
next
;
}
}
}
};
}
namespace
mgpu
{
template
<
typename
input_it
,
typename
segments_it
,
typename
output_it
,
typename
op_t
,
typename
type_t
,
typename
context_t
>
void
segreduce
(
input_it
input
,
int
count
,
segments_it
segments
,
int
num_segments
,
output_it
output
,
op_t
op
,
type_t
init
,
context_t
&
context
)
{
for
(
int
i
=
0
;
i
<
num_segments
-
1
;
i
++
)
{
output
[
i
]
=
0
;
for
(
int
j
=
segments
[
i
]
;
j
<
segments
[
i
+
1
]
;
j
++
)
{
op
(
output
[
i
],
input
[
j
]);
}
}
}
// Key-value merge.
template
<
typename
a_keys_it
,
typename
a_vals_it
,
typename
b_keys_it
,
typename
b_vals_it
,
typename
c_keys_it
,
typename
c_vals_it
,
typename
comp_t
,
typename
context_t
>
void
merge
(
a_keys_it
a_keys
,
a_vals_it
a_vals
,
int
a_count
,
b_keys_it
b_keys
,
b_vals_it
b_vals
,
int
b_count
,
c_keys_it
c_keys
,
c_vals_it
c_vals
,
comp_t
comp
,
context_t
&
context
)
{
int
a_it
=
0
;
int
b_it
=
0
;
int
c_it
=
0
;
__syncthreads
();
while
(
a_it
<
a_count
&&
b_it
<
b_count
)
{
if
(
comp
(
a_keys
[
a_it
],
b_keys
[
b_it
]))
{
c_keys
[
c_it
]
=
a_keys
[
a_it
];
c_vals
[
c_it
]
=
a_vals
[
a_it
];
c_it
++
;
a_it
++
;
}
else
{
c_keys
[
c_it
]
=
b_keys
[
a_it
]
+
a_count
;
c_vals
[
c_it
]
=
b_vals
[
a_it
];
c_it
++
;
b_it
++
;
}
out
=
tmp
[
threadIdx
.
x
];
return
;
}
}
}
;
}
template
<
typename
T
,
typename
T2
>
static
T
atomicAdd
(
T
*
address
,
T2
val
)
{
...
...
@@ -138,6 +132,10 @@ namespace mgpu
bool
operator
()(
type_t
a
,
type_t
b
)
const
{
return
a
<
b
;
}
template
<
typename
type2_t
,
typename
type3_t
>
bool
operator
()(
type2_t
a
,
type3_t
b
)
const
{
return
a
<
b
;
}
};
/* template<typename type_t>
struct less_equal_t : public std::binary_function<type_t, type_t, bool> {
...
...
@@ -150,6 +148,10 @@ namespace mgpu
MGPU_HOST_DEVICE
bool
operator
()(
type_t
a
,
type_t
b
)
const
{
return
a
>
b
;
}
template
<
typename
type2_t
,
typename
type3_t
>
MGPU_HOST_DEVICE
bool
operator
()(
type2_t
a
,
type3_t
b
)
const
{
return
a
>
b
;
}
};
/* template<typename type_t>
struct greater_equal_t : public std::binary_function<type_t, type_t, bool> {
...
...
@@ -209,6 +211,90 @@ namespace mgpu
};
}
namespace
mgpu
{
template
<
typename
input_it
,
typename
segments_it
,
typename
output_it
,
typename
op_t
,
typename
type_t
,
typename
context_t
>
void
segreduce
(
input_it
input
,
int
count
,
segments_it
segments
,
int
num_segments
,
output_it
output
,
op_t
op
,
type_t
init
,
context_t
&
context
)
{
int
i
=
0
;
for
(
;
i
<
num_segments
-
1
;
i
++
)
{
int
j
=
segments
[
i
];
output
[
i
]
=
input
[
j
];
++
j
;
for
(
;
j
<
segments
[
i
+
1
]
;
j
++
)
{
output
[
i
]
=
op
(
output
[
i
],
input
[
j
]);
}
}
// Last segment
int
j
=
segments
[
i
];
output
[
i
]
=
input
[
j
];
++
j
;
for
(
;
j
<
count
;
j
++
)
{
output
[
i
]
=
op
(
output
[
i
],
input
[
j
]);
}
}
// Key-value merge.
template
<
typename
a_keys_it
,
typename
a_vals_it
,
typename
b_keys_it
,
typename
b_vals_it
,
typename
c_keys_it
,
typename
c_vals_it
,
typename
comp_t
,
typename
context_t
>
void
merge
(
a_keys_it
a_keys
,
a_vals_it
a_vals
,
int
a_count
,
b_keys_it
b_keys
,
b_vals_it
b_vals
,
int
b_count
,
c_keys_it
c_keys
,
c_vals_it
c_vals
,
comp_t
comp
,
context_t
&
context
)
{
int
a_it
=
0
;
int
b_it
=
0
;
int
c_it
=
0
;
while
(
a_it
<
a_count
||
b_it
<
b_count
)
{
if
(
a_it
<
a_count
)
{
if
(
b_it
<
b_count
)
{
if
(
comp
(
b_keys
[
b_it
],
a_keys
[
a_it
]))
{
c_keys
[
c_it
]
=
b_keys
[
b_it
];
c_vals
[
c_it
]
=
b_vals
[
b_it
];
c_it
++
;
b_it
++
;
}
else
{
c_keys
[
c_it
]
=
a_keys
[
a_it
];
c_vals
[
c_it
]
=
a_vals
[
a_it
];
c_it
++
;
a_it
++
;
}
}
else
{
c_keys
[
c_it
]
=
a_keys
[
a_it
];
c_vals
[
c_it
]
=
a_vals
[
a_it
];
c_it
++
;
a_it
++
;
}
}
else
{
c_keys
[
c_it
]
=
b_keys
[
b_it
];
c_vals
[
c_it
]
=
b_vals
[
b_it
];
c_it
++
;
b_it
++
;
}
}
}
}
static
void
init_alpaka
()
{
if
(
__alpa_base__
.
initialized
==
true
)
{
return
;}
...
...
@@ -221,6 +307,100 @@ static void init_alpaka()
__alpa_base__
.
initialized
=
true
;
}
#ifdef PRINT_CUDA_LAUNCHES
#define CUDA_LAUNCH(cuda_call,ite, ...)\
{\
Vec_alpa const elementsPerThread(Vec_alpa::all(static_cast<Idx_alpa>(1)));\
Vec_alpa const grid_d((Idx_alpa)ite.wthr.x,(Idx_alpa)ite.wthr.y,(Idx_alpa)ite.wthr.z);\
Vec_alpa const thread_d((Idx_alpa)ite.thr.x,(Idx_alpa)ite.thr.y,(Idx_alpa)ite.thr.z);\
WorkDiv_alpa const workDiv = WorkDiv_alpa(grid_d,thread_d,elementsPerThread);\
\
gridDim.x = ite.wthr.x;\
gridDim.y = ite.wthr.y;\
gridDim.z = ite.wthr.z;\
\
blockDim.x = ite.thr.x;\
blockDim.y = ite.thr.y;\
blockDim.z = ite.thr.z;\
\
CHECK_SE_CLASS1_PRE\
\
std::cout << "Launching: " << #cuda_call << std::endl;\
\
alpaka::exec<Acc_alpa>(\
*__alpa_base__.queue,\
workDiv,\
[&] ALPAKA_FN_ACC(Acc_alpa const& acc) -> void {\
\
auto globalThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);\
auto globalBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);\
auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);\
\
blockIdx.x = globalBlockIdx[0];\
blockIdx.y = globalBlockIdx[1];\
blockIdx.z = globalBlockIdx[2];\
\
threadIdx.x = globalThreadIdx[0];\
threadIdx.y = globalThreadIdx[1];\
threadIdx.z = globalThreadIdx[2];\
\
__alpa_base__.accKer = &acc;\
\
cuda_call(__VA_ARGS__);\
});\
CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
}