
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Projects compared:
  • mosaic/software/parallel-computing/openfpm/openfpm_devices
  • argupta/openfpm_devices
Commits on Source (2)
@@ -61,6 +61,8 @@
#define CUDA_BACKEND_OPENMP 4
#define CUDA_BACKEND_HIP 5
#if defined(CUDIFY_USE_CUDA)
#include "cudify/cuda/cudify_cuda.hpp"
#elif defined(CUDIFY_USE_ALPAKA)
@@ -78,6 +80,7 @@ constexpr int default_kernel_wg_threads_ = 1024;
static void init_wrappers() {}
#endif
#endif /* OPENFPM_DATA_SRC_UTIL_CUDA_UTIL_HPP_ */
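The CUDA_BACKEND_* constants above are meant to be compared against CUDA_ON_BACKEND, which the selected cudify header defines (the next hunk shows cudify_cuda.hpp setting it to CUDA_BACKEND_CUDA). A minimal, self-contained sketch of that compile-time dispatch, using only the two constant values visible in this hunk and a hypothetical main for illustration:

#include <cstdio>

#define CUDA_BACKEND_OPENMP 4   // values taken from the hunk above
#define CUDA_BACKEND_HIP    5

// Normally set by the selected cudify header; hard-wired here only for the sketch
#define CUDA_ON_BACKEND CUDA_BACKEND_OPENMP

int main()
{
#if CUDA_ON_BACKEND == CUDA_BACKEND_HIP
    std::printf("HIP backend selected\n");
#elif CUDA_ON_BACKEND == CUDA_BACKEND_OPENMP
    std::printf("OpenMP (CPU emulation) backend selected\n");
#else
    std::printf("some other backend selected\n");
#endif
    return 0;
}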
@@ -3,8 +3,13 @@
#define CUDA_ON_BACKEND CUDA_BACKEND_CUDA
#include <cuda_runtime.h>
#include <boost/preprocessor.hpp>
-constexpr int default_kernel_wg_threads_ = 1024;
+#ifdef DEFAULT_CUDA_THREADS
+constexpr size_t default_kernel_wg_threads_ = static_cast<size_t>(DEFAULT_CUDA_THREADS);
+#else
+constexpr size_t default_kernel_wg_threads_ = static_cast<size_t>(1024);
+#endif
#if CUDART_VERSION >= 11000 && defined(__NVCC__)
#include "cub/util_type.cuh"
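The #ifdef block above makes the default work-group size configurable at build time instead of hard-coding 1024. A small hedged sketch of how that plays out (the value 256 and the main function are illustrative only; only the macro name DEFAULT_CUDA_THREADS comes from the diff): building with -DDEFAULT_CUDA_THREADS=256 makes default_kernel_wg_threads_ evaluate to 256, while a build without the flag falls back to 1024.

#include <cstddef>
#include <cstdio>

// Same selection logic as the hunk above, reproduced here so the sketch is self-contained
#ifdef DEFAULT_CUDA_THREADS
constexpr size_t default_kernel_wg_threads_ = static_cast<size_t>(DEFAULT_CUDA_THREADS);
#else
constexpr size_t default_kernel_wg_threads_ = static_cast<size_t>(1024);
#endif

int main()
{
    // Prints 256 when built with -DDEFAULT_CUDA_THREADS=256, 1024 otherwise
    std::printf("default work-group size: %zu\n", default_kernel_wg_threads_);
    return 0;
}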
@@ -29,6 +34,118 @@ __global__ void kernel_launch_lambda_tls(lambda_f f)
}
/**
* @brief Find an appropriate grid and block size based on register-usage statistics collected during compilation
* @note
* - According to https://developer.nvidia.com/blog/cuda-pro-tip-occupancy-api-simplifies-launch-configuration/:
* This can greatly simplify the task of frameworks (such as Thrust) that must launch user-defined kernels. This is
* also handy for kernels that are not primary performance bottlenecks, where the programmer just wants a simple way
* to run the kernel with correct results, rather than hand-tuning the execution configuration.
*
* - For advanced kernel hand-tuning depending on compute capability, the launchbox feature of moderngpu
* (https://moderngpu.github.io/performance.html) should be considered.
*/
template<typename dim3Type, typename... Args>
void FixConfigLaunch(void (* _kernel)(Args...), dim3Type & wthr, dim3Type & thr) {
// thr.x == 0xFFFFFFFF is the sentinel meaning "auto-tune"; any other value means the
// caller already chose an explicit block size, so leave the configuration untouched
if (thr.x != 0xFFFFFFFF) {
return;
}
int blockSize = 0; // The launch configurator returned block size
int minGridSize; // The minimum grid size needed to achieve the
// maximum occupancy for a full device launch
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, *_kernel, 0, 0);
int dim = (wthr.x != 0) + (wthr.y != 0) + (wthr.z != 0);
if (dim == 0) {
return;
}
size_t tot_work;
unsigned int wthr_x = wthr.x;
unsigned int wthr_y = wthr.y;
unsigned int wthr_z = wthr.z;
if (dim == 1)
tot_work = wthr.x;
else if (dim == 2)
tot_work = wthr.x * wthr.y;
else if (dim == 3)
tot_work = wthr.x * wthr.y * wthr.z;
// round tot_work up to the nearest bigger power of 2 (e.g. 1000 -> 1024)
size_t tot_work_2 = tot_work;
tot_work_2--;
tot_work_2 |= tot_work_2 >> 1;
tot_work_2 |= tot_work_2 >> 2;
tot_work_2 |= tot_work_2 >> 4;
tot_work_2 |= tot_work_2 >> 8;
tot_work_2 |= tot_work_2 >> 16;
tot_work_2++;
size_t n = (tot_work <= (size_t)blockSize)?tot_work_2:(size_t)blockSize;
if (tot_work == 0)
{
thr.x = 0;
thr.y = 0;
thr.z = 0;
wthr.x = 0;
wthr.y = 0;
wthr.z = 0;
// nothing to launch: return here, otherwise the loop below would never terminate (n == 0)
return;
}
thr.x = 1;
thr.y = 1;
thr.z = 1;
int dir = 0;
while (n != 1)
{
if (dir % 3 == 0)
{thr.x = thr.x << 1;}
else if (dir % 3 == 1)
{thr.y = thr.y << 1;}
else if (dir % 3 == 2)
{thr.z = thr.z << 1;}
n = n >> 1;
dir++;
dir %= dim;
}
if (dim >= 1)
{wthr.x = (wthr.x) / thr.x + (((wthr_x)%thr.x != 0)?1:0);}
if (dim >= 2)
{wthr.y = (wthr.y) / thr.y + (((wthr_y)%thr.y != 0)?1:0);}
else
{wthr.y = 1;}
if (dim >= 3)
{wthr.z = (wthr.z) / thr.z + (((wthr_z)%thr.z != 0)?1:0);}
else
{wthr.z = 1;}
// crop if wthr == 1
if (dim >= 1 && wthr.x == 1)
{thr.x = wthr_x;}
if (dim >= 2 && wthr.y == 1)
{thr.y = wthr_y;}
if (dim == 3 && wthr.z == 1)
{thr.z = wthr_z;}
}
#endif
static void init_wrappers()
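To illustrate how the auto-tuning helper above is meant to be driven, here is a hedged usage sketch (dummy_kernel, launch_with_auto_config and their parameters are hypothetical stand-ins, not code from the repository): the caller puts the total amount of work into wthr, sets thr.x to the 0xFFFFFFFF sentinel, and FixConfigLaunch then splits the work into a grid/block pair sized from cudaOccupancyMaxPotentialBlockSize.

// Hypothetical caller of FixConfigLaunch (sketch only; assumes the function above is in scope
// and the file is compiled with nvcc)
__global__ void dummy_kernel(float * data, unsigned int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {data[i] = 0.0f;}
}

void launch_with_auto_config(float * data, unsigned int n)
{
    dim3 wthr(n, 0, 0);          // total work in x; dim is deduced from the non-zero components
    dim3 thr(0xFFFFFFFF, 0, 0);  // sentinel: let FixConfigLaunch pick the block size

    FixConfigLaunch(dummy_kernel, wthr, thr);

    // After the call wthr holds the grid size and thr the block size
    if (wthr.x != 0)
    {dummy_kernel<<<wthr, thr>>>(data, n);}
}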
@@ -215,39 +332,94 @@ static void init_wrappers()
#else
-#define CUDA_LAUNCH(cuda_call,ite, ...) \
-if (ite.wthr.x != 0)\
-{cuda_call<<<ite.wthr,ite.thr>>>(__VA_ARGS__);}
-
-#define CUDA_LAUNCH_DIM3(cuda_call,wthr,thr, ...) \
-cuda_call<<<wthr,thr>>>(__VA_ARGS__);
-
-#define CUDA_LAUNCH_LAMBDA(ite,lambda_f, ...) \
-kernel_launch_lambda<<<ite.wthr,ite.thr>>>(lambda_f);
-
-#define CUDA_CHECK()
-
-#define CUDA_LAUNCH_LAMBDA_TLS(ite, lambda_f, ...) \
-{\
-if (ite.wthr.x != 0)\
-{kernel_launch_lambda<<<ite.wthr,ite.thr>>>(lambda_f);}\
-}
-
-#define CUDA_LAUNCH_LAMBDA_DIM3(wthr_,thr_, lambda_f, ...) \
-{\
-dim3 wthr__(wthr_);\
-dim3 thr__(thr_);\
-if (ite.wthr.x != 0)\
-{kernel_launch_lambda<<<wthr__,thr__>>>(lambda_f);}\
-}
-
-#define CUDA_LAUNCH_LAMBDA_DIM3_TLS(wthr_,thr_, lambda_f, ...) \
-{\
-dim3 wthr__(wthr_);\
-dim3 thr__(thr_);\
-if (ite.wthr.x != 0)\
-{kernel_launch_lambda_tls<<<wthr__,thr__>>>(lambda_f);}\
-}
+template<typename... Args, typename ite_type>
+void CUDA_LAUNCH(void (* _kernel)(Args...),ite_type ite,Args... args)
+{
+    FixConfigLaunch(_kernel,ite.wthr,ite.thr);
+
+    // std::cout << "DEMANGLE " << typeid(decltype(_kernel)).name() << " " << ite.wthr.x << " " << ite.wthr.y << " " << ite.wthr.z << "/" << ite.thr.x << " " << ite.thr.y << " " << ite.thr.z << std::endl;
+
+#ifdef __NVCC__
+    _kernel<<<ite.wthr,ite.thr>>>(args...);
+#else
+    std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH not implemented for this compiler" << std::endl;
+#endif
+}
+
+template<typename... Args>
+void CUDA_LAUNCH_DIM3(void (* _kernel)(Args...),dim3 wthr, dim3 thr,Args... args)
+{
+    FixConfigLaunch(_kernel,wthr,thr);
+
+    // std::cout << "DEMANGLE " << typeid(decltype(_kernel)).name() << " " << wthr.x << " " << wthr.y << " " << wthr.z << "/" << thr.x << " " << thr.y << " " << thr.z << std::endl;
+
+#ifdef __NVCC__
+    _kernel<<<wthr,thr>>>(args...);
+#else
+    std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH_DIM3 not implemented for this compiler" << std::endl;
+#endif
+}
+
+template<typename lambda_type, typename ite_type, typename... Args>
+void CUDA_LAUNCH_LAMBDA(ite_type ite, lambda_type lambda_f, Args... args)
+{
+#ifdef __NVCC__
+    void (* _ker)(lambda_type) = kernel_launch_lambda;
+    FixConfigLaunch(_ker,ite.wthr,ite.thr);
+    kernel_launch_lambda<<<ite.wthr,ite.thr>>>(lambda_f);
+#else
+    std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH_LAMBDA not implemented for this compiler" << std::endl;
+#endif
+}
+
+static void CUDA_CHECK() {}
+
+template<typename lambda_type, typename ite_type, typename... Args>
+void CUDA_LAUNCH_LAMBDA_TLS(ite_type ite, lambda_type lambda_f, Args... args)
+{
+#ifdef __NVCC__
+    void (* _ker)(lambda_type) = kernel_launch_lambda;
+    FixConfigLaunch(_ker,ite.wthr,ite.thr);
+    if (ite.wthr.x != 0)
+    {kernel_launch_lambda<<<ite.wthr,ite.thr>>>(lambda_f);}
+#else
+    std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH_LAMBDA_TLS not implemented for this compiler" << std::endl;
+#endif
+}
+
+template<typename lambda_type, typename... Args>
+void CUDA_LAUNCH_LAMBDA_DIM3(dim3 wthr_, dim3 thr_, lambda_type lambda_f, Args... args)
+{
+#ifdef __NVCC__
+    void (* _ker)(lambda_type) = kernel_launch_lambda;
+    FixConfigLaunch(_ker,wthr_,thr_);
+    dim3 wthr__(wthr_);
+    dim3 thr__(thr_);
+    if (wthr__.x != 0)
+    {kernel_launch_lambda<<<wthr__,thr__>>>(lambda_f);}
+#else
+    std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH_LAMBDA_DIM3 not implemented for this compiler" << std::endl;
+#endif
+}
+
+template<typename lambda_type, typename... Args>
+void CUDA_LAUNCH_LAMBDA_DIM3_TLS(dim3 wthr_, dim3 thr_, lambda_type lambda_f, Args... args)
+{
+#ifdef __NVCC__
+    void (* _ker)(lambda_type) = kernel_launch_lambda;
+    FixConfigLaunch(_ker,wthr_,thr_);
+    dim3 wthr__(wthr_);
+    dim3 thr__(thr_);
+    if (wthr__.x != 0)
+    {kernel_launch_lambda<<<wthr__,thr__>>>(lambda_f);}
+#else
+    std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH_LAMBDA_DIM3_TLS not implemented for this compiler" << std::endl;
+#endif
+}
#endif
......
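To make the macro-to-template switch in the previous hunk concrete, here is a hedged sketch of a call site (ite_example, fill_kernel and fill are illustrative stand-ins, not types from the repository). The spelling at the call site stays the same, but CUDA_LAUNCH now resolves to the function template, so the kernel pointer and its arguments are type-checked and the launch configuration is auto-tuned through FixConfigLaunch.

// Hypothetical call site (sketch only; assumes the header from the previous hunk is included
// and the file is compiled with nvcc)
struct ite_example
{
    dim3 wthr;
    dim3 thr;
};

__global__ void fill_kernel(float * v, float value, unsigned int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {v[i] = value;}
}

void fill(float * v, float value, unsigned int n)
{
    ite_example ite;
    ite.wthr = dim3(n, 0, 0);
    ite.thr  = dim3(0xFFFFFFFF, 0, 0);   // sentinel: let FixConfigLaunch choose the block size

    // Previously this line expanded the CUDA_LAUNCH macro; now it calls the template overload
    CUDA_LAUNCH(fill_kernel, ite, v, value, n);
}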
@@ -818,6 +818,7 @@ static void exe_kernel_no_sync(lambda_f f, ite_type & ite)
#else
#define CUDA_LAUNCH(cuda_call,ite, ...) \
{\
gridDim.x = ite.wthr.x;\
......