/*! \file cudify_hip.hpp
 *
 * CUDA-to-HIP compatibility layer: maps the subset of the CUDA runtime API
 * and the kernel-launch macros used by the library onto their HIP
 * equivalents.
 */
#ifndef CUDIFY_HIP_HPP_
#define CUDIFY_HIP_HPP_
#include "config.h"
#define CUDA_ON_BACKEND CUDA_BACKEND_HIP
// The HIP headers misbehave when they see __NVCC__ defined by the
// surrounding build, so hide the macro while including them and restore it
// afterwards.
#ifdef __NVCC__
#undef __NVCC__
#include <hip/hip_runtime.h>
#define __NVCC__
#else
#include <hip/hip_runtime.h>
#endif
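// Default number of threads per work-group (block) used by kernel launches
// that do not specify one.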
constexpr int default_kernel_wg_threads_ = 256;
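// Map the CUDA runtime types and error codes used throughout the library
// onto their HIP equivalents, so CUDA-flavoured code compiles unchanged.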
typedef hipError_t cudaError_t;
typedef hipStream_t cudaStream_t;
typedef hipDeviceProp_t cudaDeviceProp_t;
typedef cudaDeviceProp_t cudaDeviceProp;
typedef hipEvent_t cudaEvent_t;
typedef hipFuncAttributes cudaFuncAttributes;
#define cudaSuccess hipSuccess
// No global state to initialize for the HIP backend; this hook exists so
// all cudify backends expose the same interface.
static void init_wrappers()
{}
/**
* CUDA memory copy types
*/
enum cudaMemcpyKind
{
cudaMemcpyHostToHost = 0, /**< Host -> Host */
cudaMemcpyHostToDevice = 1, /**< Host -> Device */
cudaMemcpyDeviceToHost = 2, /**< Device -> Host */
cudaMemcpyDeviceToDevice = 3, /**< Device -> Device */
cudaMemcpyDefault = 4 /**< Direction of the transfer is inferred from the pointer values. Requires unified virtual addressing */
};
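// Translate the cudaMemcpyKind defined above into the corresponding HIP
// enumerator. (Internal helper introduced here to avoid duplicating the
// switch in every memcpy wrapper; the name is local to this header.)
static hipMemcpyKind to_hip_memcpy_kind(cudaMemcpyKind opt)
{
    switch (opt)
    {
    case cudaMemcpyHostToHost:
        return hipMemcpyHostToHost;
    case cudaMemcpyHostToDevice:
        return hipMemcpyHostToDevice;
    case cudaMemcpyDeviceToHost:
        return hipMemcpyDeviceToHost;
    case cudaMemcpyDeviceToDevice:
        return hipMemcpyDeviceToDevice;
    default:
        return hipMemcpyDefault;
    }
}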
static cudaError_t cudaMemcpyToSymbol(unsigned char * global_cuda_error_array,const void * mem,size_t sz,int offset,cudaMemcpyKind opt)
{
    return hipMemcpyToSymbol(global_cuda_error_array,mem,sz,offset,to_hip_memcpy_kind(opt));
}
static cudaError_t cudaDeviceSynchronize()
{
return hipDeviceSynchronize();
}
static cudaError_t cudaMemcpyFromSymbol(void * dev_mem,const unsigned char * global_cuda_error_array,size_t sz)
{
return hipMemcpyFromSymbol(dev_mem,global_cuda_error_array,sz);
}
static const char* cudaGetErrorString ( cudaError_t error )
{
return hipGetErrorString(error);
}
static cudaError_t cudaGetDevice ( int* device )
{
return hipGetDevice(device);
}
static cudaError_t cudaSetDevice ( int device )
{
return hipSetDevice(device);
}
static cudaError_t cudaMemGetInfo ( size_t* free, size_t* total )
{
return hipMemGetInfo(free,total);
}
static cudaError_t cudaFuncGetAttributes ( cudaFuncAttributes* attr, const void* func )
{
return hipFuncGetAttributes(attr,func);
}
static cudaError_t cudaGetDeviceProperties ( cudaDeviceProp* prop, int device )
{
return hipGetDeviceProperties(prop,device);
}
static cudaError_t cudaEventCreate ( cudaEvent_t* event )
{
return hipEventCreate(event);
}
static cudaError_t cudaEventDestroy ( cudaEvent_t event )
{
return hipEventDestroy(event);
}
static cudaError_t cudaMalloc ( void** devPtr, size_t size )
{
return hipMalloc(devPtr,size);
}
static cudaError_t cudaMallocHost ( void** ptr, size_t size )
{
return hipHostMalloc(ptr,size);
}
static cudaError_t cudaFree ( void* devPtr )
{
return hipFree(devPtr);
}
static cudaError_t cudaFreeHost ( void* ptr )
{
return hipHostFree(ptr);
}
static cudaError_t cudaStreamSynchronize ( cudaStream_t stream )
{
return hipStreamSynchronize(stream);
}
static cudaError_t cudaEventRecord ( cudaEvent_t event, cudaStream_t stream = 0 )
{
return hipEventRecord(event,stream);
}
static cudaError_t cudaEventSynchronize ( cudaEvent_t event )
{
return hipEventSynchronize(event);
}
static cudaError_t cudaEventElapsedTime ( float* ms, cudaEvent_t start, cudaEvent_t end )
{
return hipEventElapsedTime(ms,start,end);
}
static cudaError_t cudaGetDeviceCount ( int* count )
{
return hipGetDeviceCount(count);
}
static cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind opt )
{
    return hipMemcpy(dst,src,count,to_hip_memcpy_kind(opt));
}
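// Illustrative use of the wrappers above (sketch only; h_buf, d_buf and n
// are hypothetical):
//
//   float *d_buf;
//   cudaError_t err = cudaMalloc((void **)&d_buf, n * sizeof(float));
//   if (err != cudaSuccess) {printf("%s\n", cudaGetErrorString(err));}
//   cudaMemcpy(d_buf, h_buf, n * sizeof(float), cudaMemcpyHostToDevice);
//   /* ... launch kernels, then copy back ... */
//   cudaMemcpy(h_buf, d_buf, n * sizeof(float), cudaMemcpyDeviceToHost);
//   cudaFree(d_buf);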
#ifdef __HIPCC__
#include "cudify_hardware_common.hpp"
#include "util/cuda_util.hpp"
#include <vector>
#include <string.h>
#include "hipcub/hipcub.hpp"
#include "hipcub/block/block_scan.hpp"
template<typename lambda_f>
__global__ void kernel_launch_lambda(lambda_f f)
{
dim3 bid = blockIdx;
dim3 tid = threadIdx;
f(bid,tid);
}
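// Variant that invokes the lambda with no arguments; used by the *_TLS
// launch macros below.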
template<typename lambda_f>
__global__ void kernel_launch_lambda_tls(lambda_f f)
{
f();
}
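// Alias hipcub's BlockScan into the cub namespace so device code written
// against CUB compiles unchanged on the HIP backend.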
namespace cub
{
template<typename T, unsigned int bd>
using BlockScan = hipcub::BlockScan<T,bd>;
}
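// Compute the linear (total) size of a launch dimension, whether it is
// given as a dim3-like object or as a plain integer. has_work_gpu_cl_ below
// uses this to detect empty grids so zero-work launches can be skipped.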
template<typename T>
struct has_work_gpu_cl_lin_blocks_
{
static unsigned int lin(const T & b)
{
return b.x * b.y * b.z;
}
};
template<>
struct has_work_gpu_cl_lin_blocks_<unsigned int>
{
static unsigned int lin(const unsigned int & b)
{
return b;
}
};
template<>
struct has_work_gpu_cl_lin_blocks_<unsigned long>
{
static unsigned int lin(const unsigned long & b)
{
return b;
}
};
template<>
struct has_work_gpu_cl_lin_blocks_<int>
{
static unsigned int lin(const int & b)
{
return b;
}
};
template<typename wthr_type, typename thr_type>
bool has_work_gpu_cl_(const wthr_type & wthr, const thr_type & thr)
{
return (has_work_gpu_cl_lin_blocks_<typename std::remove_const<wthr_type>::type>::lin(wthr) *
has_work_gpu_cl_lin_blocks_<typename std::remove_const<thr_type>::type>::lin(thr)) != 0;
}
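// The launch macros come in two flavours: when PRINT_CUDA_LAUNCHES is
// defined each launch is echoed to stdout; otherwise the kernel is launched
// only when the grid actually contains work.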
#ifdef PRINT_CUDA_LAUNCHES
#define CUDA_LAUNCH(cuda_call,ite, ...)\
{\
CHECK_SE_CLASS1_PRE\
\
std::cout << "Launching: " << #cuda_call << std::endl;\
\
hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(ite.wthr), dim3(ite.thr), 0, 0, __VA_ARGS__);\
\
CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
}
#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
{\
dim3 wthr__(wthr_);\
dim3 thr__(thr_);\
\
ite_gpu<1> itg;\
itg.wthr = wthr__;\
itg.thr = thr__;\
\
CHECK_SE_CLASS1_PRE\
std::cout << "Launching: " << #cuda_call << std::endl;\
\
hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(itg.wthr), dim3(itg.thr), 0, 0, __VA_ARGS__);\
\
CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
}
#define CUDA_CHECK()
#else
#define CUDA_LAUNCH(cuda_call,ite, ...) \
\
{\
CHECK_SE_CLASS1_PRE\
\
if (has_work_gpu_cl_(ite.wthr,ite.thr) == true)\
{hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(ite.wthr), dim3(ite.thr), 0, 0, __VA_ARGS__);}\
\
CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
}
#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
{\
\
CHECK_SE_CLASS1_PRE\
\
if (has_work_gpu_cl_(wthr_,thr_) == true)\
{hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call),wthr_,thr_, 0, 0, __VA_ARGS__);}\
\
CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
}
#define CUDA_LAUNCH_LAMBDA(ite,lambda_f, ...)\
{\
\
CHECK_SE_CLASS1_PRE\
\
if (has_work_gpu_cl_(ite.wthr,ite.thr) == true)\
{hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda),ite.wthr,ite.thr, 0, 0, lambda_f);}\
\
CHECK_SE_CLASS1_POST("kernel_launch_lambda",__VA_ARGS__)\
}
#define CUDA_LAUNCH_LAMBDA_TLS(ite, lambda_f, ...) \
{\
CHECK_SE_CLASS1_PRE\
if (ite.wthr.x != 0)\
{hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda_tls),ite.wthr,ite.thr,0,0,lambda_f);}\
CHECK_SE_CLASS1_POST("kernel_launch_lambda_tls",__VA_ARGS__)\
}
#define CUDA_LAUNCH_LAMBDA_DIM3_TLS(wthr_,thr_, lambda_f, ...) \
{\
dim3 wthr__(wthr_);\
dim3 thr__(thr_);\
CHECK_SE_CLASS1_PRE\
if (wthr__.x != 0)\
{hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda_tls),wthr__,thr__, 0, 0, lambda_f);}\
CHECK_SE_CLASS1_POST("kernel_launch_lambda_tls",__VA_ARGS__)\
}
#define CUDA_LAUNCH_LAMBDA_DIM3(wthr_,thr_, lambda_f, ...) \
{\
dim3 wthr__(wthr_);\
dim3 thr__(thr_);\
CHECK_SE_CLASS1_PRE\
if (wthr__.x != 0)\
{hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda),wthr__,thr__, 0, 0, lambda_f);}\
CHECK_SE_CLASS1_POST("kernel_launch_lambda",__VA_ARGS__)\
}
#define CUDA_CHECK()
#endif
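// Illustrative usage of the launch macros (sketch only; my_kernel and its
// arguments are hypothetical):
//
//   ite_gpu<1> ite;
//   ite.wthr = dim3(32);   // number of blocks
//   ite.thr  = dim3(256);  // threads per block
//   CUDA_LAUNCH(my_kernel, ite, my_arg1, my_arg2);
//
//   // Or with an explicit grid/block pair:
//   CUDA_LAUNCH_DIM3(my_kernel, 32, 256, my_arg1, my_arg2);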
#endif
#endif