Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Projects:
  • mosaic/software/parallel-computing/openfpm/openfpm_devices
  • argupta/openfpm_devices
Commits on Source (9)
......@@ -27,7 +27,7 @@ if ( CUDA_ON_BACKEND STREQUAL "HIP" AND HIP_FOUND )
hip_add_executable(mem main.cpp memory/HeapMemory.cpp util/cudify/cudify_vars.cpp memory/mem_conf.cpp ${CUDA_SOURCES} ${CUDA_SOURCES_TEST})
list(APPEND HIP_HIPCC_FLAGS -D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0 --std=c++14)
list(APPEND HIP_HIPCC_FLAGS -D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0 --std=c++17)
set(CMAKE_SHARED_LIBRARY_CXX_FLAGS "-fPIC")
set_property(TARGET ofpmmemory PROPERTY NO_SONAME ON)
......@@ -172,7 +172,3 @@ install(FILES util/cudify/cuda/cudify_cuda.hpp
install(FILES util/cudify/cudify_hardware_cpu.hpp
DESTINATION openfpm_devices/include/util/cudify
COMPONENT OpenFPM)
install(FILES ptr_info.hpp
DESTINATION openfpm_devices/include
COMPONENT OpenFPM)
/*
* ptr_info.hpp
*
* Created on: Nov 19, 2015
* Author: i-bird
*/
#ifndef OPENFPM_DEVICES_SRC_PTR_INFO_HPP_
#define OPENFPM_DEVICES_SRC_PTR_INFO_HPP_
struct ptr_info
{
size_t id = 0;
size_t struct_id = 0;
size_t project_id = 0;
size_t size = 0;
size_t ref_id = 0;
};
#endif /* OPENFPM_DEVICES_SRC_PTR_INFO_HPP_ */
......@@ -61,6 +61,8 @@
#define CUDA_BACKEND_OPENMP 4
#define CUDA_BACKEND_HIP 5
#if defined(CUDIFY_USE_CUDA)
#include "cudify/cuda/cudify_cuda.hpp"
#elif defined(CUDIFY_USE_ALPAKA)
......@@ -78,6 +80,7 @@ constexpr int default_kernel_wg_threads_ = 1024;
static void init_wrappers() {}
#endif
#endif /* OPENFPM_DATA_SRC_UTIL_CUDA_UTIL_HPP_ */
......@@ -3,8 +3,13 @@
#define CUDA_ON_BACKEND CUDA_BACKEND_CUDA
#include <cuda_runtime.h>
#include <boost/preprocessor.hpp>
constexpr int default_kernel_wg_threads_ = 1024;
#ifdef DEFAULT_CUDA_THREADS
constexpr size_t default_kernel_wg_threads_ = static_cast<size_t>(DEFAULT_CUDA_THREADS);
#else
constexpr size_t default_kernel_wg_threads_ = static_cast<size_t>(1024);
#endif
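// Illustrative note (the build invocation below is an assumption, not part of this change):
// the default block size can be overridden at compile time, e.g.
//   nvcc -DDEFAULT_CUDA_THREADS=256 ...
// in which case default_kernel_wg_threads_ becomes 256 instead of the 1024 fallback.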
#if CUDART_VERSION >= 11000 && defined(__NVCC__)
#include "cub/util_type.cuh"
......@@ -29,6 +34,118 @@ __global__ void kernel_launch_lambda_tls(lambda_f f)
}
/**
* @brief Find appropriate grid and block size based on statistics of register usage during compilation
*
* @note
* - According to https://developer.nvidia.com/blog/cuda-pro-tip-occupancy-api-simplifies-launch-configuration/:
*   This can greatly simplify the task of frameworks (such as Thrust) that must launch user-defined kernels. This is
*   also handy for kernels that are not primary performance bottlenecks, where the programmer just wants a simple way
*   to run the kernel with correct results, rather than hand-tuning the execution configuration.
* - For advanced kernel hand-tuning depending on compute capability, the launchbox feature of moderngpu
*   (https://moderngpu.github.io/performance.html) should be considered.
*/
template<typename dim3Type, typename... Args>
void FixConfigLaunch(void (* _kernel)(Args...), dim3Type & wthr, dim3Type & thr) {
if (thr.x != 0xFFFFFFFF) {
return;
}
int blockSize = 0; // The launch configurator returned block size
int minGridSize; // The minimum grid size needed to achieve the
// maximum occupancy for a full device launch
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, *_kernel, 0, 0);
int dim = (wthr.x != 0) + (wthr.y != 0) + (wthr.z != 0);
if (dim == 0) {
return;
}
size_t tot_work = 0;
unsigned int wthr_x = wthr.x;
unsigned int wthr_y = wthr.y;
unsigned int wthr_z = wthr.z;
if (dim == 1)
tot_work = wthr.x;
else if (dim == 2)
tot_work = wthr.x * wthr.y;
else if (dim == 3)
tot_work = wthr.x * wthr.y * wthr.z;
// round to the nearest bigger power of 2
size_t tot_work_2 = tot_work;
tot_work_2--;
tot_work_2 |= tot_work_2 >> 1;
tot_work_2 |= tot_work_2 >> 2;
tot_work_2 |= tot_work_2 >> 4;
tot_work_2 |= tot_work_2 >> 8;
tot_work_2 |= tot_work_2 >> 16;
tot_work_2++;
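// e.g. tot_work = 600: 600-1 = 599, the OR-shifts set every bit below the top bit giving 1023,
// and the final increment yields 1024, the next power of two >= tot_work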
size_t n = (tot_work <= blockSize)?tot_work_2:blockSize;
if (tot_work == 0)
{
// nothing to launch: zero the configuration and stop, instead of falling through
thr.x = 0;
thr.y = 0;
thr.z = 0;
wthr.x = 0;
wthr.y = 0;
wthr.z = 0;
return;
}
thr.x = 1;
thr.y = 1;
thr.z = 1;
int dir = 0;
while (n != 1)
{
if (dir % 3 == 0)
{thr.x = thr.x << 1;}
else if (dir % 3 == 1)
{thr.y = thr.y << 1;}
else if (dir % 3 == 2)
{thr.z = thr.z << 1;}
n = n >> 1;
dir++;
dir %= dim;
}
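// e.g. n = 1024 with dim == 2: the ten doublings alternate between x and y, giving thr = (32, 32, 1)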
if (dim >= 1)
{wthr.x = (wthr.x) / thr.x + (((wthr_x)%thr.x != 0)?1:0);}
if (dim >= 2)
{wthr.y = (wthr.y) / thr.y + (((wthr_y)%thr.y != 0)?1:0);}
else
{wthr.y = 1;}
if (dim >= 3)
{wthr.z = (wthr.z) / thr.z + (((wthr_z)%thr.z != 0)?1:0);}
else
{wthr.z = 1;}
// crop if wthr == 1
if (dim >= 1 && wthr.x == 1)
{thr.x = wthr_x;}
if (dim >= 2 && wthr.y == 1)
{thr.y = wthr_y;}
if (dim == 3 && wthr.z == 1)
{thr.z = wthr_z;}
}
#endif
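/*
 * Usage sketch for FixConfigLaunch (illustrative only; the kernel below is hypothetical and
 * assumes compilation with __NVCC__). Setting thr.x to 0xFFFFFFFF requests an automatic
 * configuration from the occupancy API; wthr initially holds the total work per dimension.
 *
 *   __global__ void my_kernel(float * data);   // hypothetical kernel
 *
 *   dim3 wthr(100000, 0, 0);                   // 1D problem with 100000 work items
 *   dim3 thr(0xFFFFFFFF, 0, 0);                // ask FixConfigLaunch to choose the block size
 *   FixConfigLaunch(my_kernel, wthr, thr);
 *   // wthr now holds the grid size and thr the block size, with wthr.x * thr.x >= 100000
 */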
static void init_wrappers()
......@@ -215,39 +332,92 @@ static void init_wrappers()
#else
#define CUDA_LAUNCH(cuda_call,ite, ...) \
if (ite.wthr.x != 0)\
{cuda_call<<<ite.wthr,ite.thr>>>(__VA_ARGS__);}
template<typename... Args, typename ite_type>
void CUDA_LAUNCH(void (* _kernel)(Args...),ite_type ite,Args... args)
{
// std::cout << "DEMANGLE " << typeid(decltype(_kernel)).name() << " " << ite.wthr.x << " " << ite.wthr.y << " " << ite.wthr.z << "/" << ite.thr.x << " " << ite.thr.y << " " << ite.thr.z << std::endl;
#define CUDA_LAUNCH_DIM3(cuda_call,wthr,thr, ...) \
cuda_call<<<wthr,thr>>>(__VA_ARGS__);
#ifdef __NVCC__
FixConfigLaunch(_kernel,ite.wthr,ite.thr);
_kernel<<<ite.wthr,ite.thr>>>(args...);
#else
std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH not implemented for this compiler" << std::endl;
#endif
}
#define CUDA_LAUNCH_LAMBDA(ite,lambda_f, ...) \
kernel_launch_lambda<<<ite.wthr,ite.thr>>>(lambda_f);
template<typename... Args>
void CUDA_LAUNCH_DIM3(void (* _kernel)(Args...),dim3 wthr, dim3 thr,Args... args)
{
// std::cout << "DEMANGLE " << typeid(decltype(_kernel)).name() << " " << wthr.x << " " << wthr.y << " " << wthr.z << "/" << thr.x << " " << thr.y << " " << thr.z << std::endl;
#define CUDA_CHECK()
#ifdef __NVCC__
FixConfigLaunch(_kernel,wthr,thr);
_kernel<<<wthr,thr>>>(args...);
#else
std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH_DIM3 not implemented for this compiler" << std::endl;
#endif
}
#define CUDA_LAUNCH_LAMBDA_TLS(ite, lambda_f, ...) \
{\
if (ite.wthr.x != 0)\
{kernel_launch_lambda<<<ite.wthr,ite.thr>>>(lambda_f);}\
}
template<typename lambda_type, typename ite_type, typename... Args>
void CUDA_LAUNCH_LAMBDA(ite_type ite, lambda_type lambda_f, Args... args)
{
#ifdef __NVCC__
void (* _ker)(lambda_type) = kernel_launch_lambda;
FixConfigLaunch(_ker,ite.wthr,ite.thr);
kernel_launch_lambda<<<ite.wthr,ite.thr>>>(lambda_f);
#else
std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH_LAMBDA not implemented for this compiler" << std::endl;
#endif
}
#define CUDA_LAUNCH_LAMBDA_DIM3(wthr_,thr_, lambda_f, ...) \
{\
dim3 wthr__(wthr_);\
dim3 thr__(thr_);\
if (ite.wthr.x != 0)\
{kernel_launch_lambda<<<wthr__,thr__>>>(lambda_f);}\
}
static void CUDA_CHECK() {}
#define CUDA_LAUNCH_LAMBDA_DIM3_TLS(wthr_,thr_, lambda_f, ...) \
{\
dim3 wthr__(wthr_);\
dim3 thr__(thr_);\
if (ite.wthr.x != 0)\
{kernel_launch_lambda_tls<<<wthr__,thr__>>>(lambda_f);}\
}
template<typename lambda_type, typename ite_type, typename... Args>
void CUDA_LAUNCH_LAMBDA_TLS(ite_type ite, lambda_type lambda_f, Args... args)
{
#ifdef __NVCC__
void (* _ker)(lambda_type) = kernel_launch_lambda;
FixConfigLaunch(_ker,ite.wthr,ite.thr);
if (ite.wthr.x != 0)
{kernel_launch_lambda<<<ite.wthr,ite.thr>>>(lambda_f);}
#else
std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH_LAMBDA_TLS not implemented for this compiler" << std::endl;
#endif
}
template<typename lambda_type, typename... Args>
void CUDA_LAUNCH_LAMBDA_DIM3(dim3 wthr_, dim3 thr_, lambda_type lambda_f, Args... args)
{
#ifdef __NVCC__
void (* _ker)(lambda_type) = kernel_launch_lambda;
FixConfigLaunch(_ker,wthr_,thr_);
dim3 wthr__(wthr_);
dim3 thr__(thr_);
if (wthr__.x != 0)
{kernel_launch_lambda<<<wthr__,thr__>>>(lambda_f);}
#else
std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH_LAMBDA_DIM3 not implemented for this compiler" << std::endl;
#endif
}
template<typename lambda_type, typename... Args>
void CUDA_LAUNCH_LAMBDA_DIM3_TLS(dim3 wthr_, dim3 thr_, lambda_type lambda_f, Args... args)
{
#ifdef __NVCC__
void (* _ker)(lambda_type) = kernel_launch_lambda;
FixConfigLaunch(_ker,wthr_,thr_);
dim3 wthr__(wthr_);
dim3 thr__(thr_);
if (wthr__.x != 0)
{kernel_launch_lambda<<<wthr__,thr__>>>(lambda_f);}
#else
std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH_LAMBDA_DIM3_TLS not implemented for this compiler" << std::endl;
#endif
}
#endif
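/*
 * Usage sketch for the function-style launch wrappers above (illustrative only; the kernel,
 * the device pointer and the 'ite' object below are hypothetical). 'ite' can be any object
 * exposing dim3-like members wthr and thr, as expected by CUDA_LAUNCH:
 *
 *   __global__ void scale(float * v, float a);      // hypothetical kernel
 *
 *   struct launch_ite { dim3 wthr; dim3 thr; } ite;
 *   ite.wthr = dim3(n_items, 0, 0);
 *   ite.thr  = dim3(0xFFFFFFFF, 0, 0);              // let FixConfigLaunch pick the block size
 *
 *   CUDA_LAUNCH(scale, ite, v_device, 2.0f);        // kernel launch with auto configuration
 *   CUDA_LAUNCH_LAMBDA(ite, [=] __device__ () { }); // lambda variant
 */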
......
......@@ -2,10 +2,6 @@
#define BOOST_TEST_DYN_LINK
#include <boost/test/unit_test.hpp>
#include <type_traits>
std::is_trivially_copyable<int> b;
#include "util/cuda_util.hpp"
#include "memory/CudaMemory.cuh"
......
......@@ -818,6 +818,7 @@ static void exe_kernel_no_sync(lambda_f f, ite_type & ite)
#else
#define CUDA_LAUNCH(cuda_call,ite, ...) \
{\
gridDim.x = ite.wthr.x;\
......