Commit f831b228 authored by Pietro Incardona

Adding missing files

parent 0073fde6
#ifndef CUDIFY_HIP_HPP_
#define CUDIFY_HIP_HPP_
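/*! \brief HIP backend of the cudify layer.
 *
 * Maps the subset of the CUDA runtime API used by this project (types,
 * memory copies, events, device queries and kernel launches) onto the HIP
 * runtime, so the same CUDA-style source compiles on AMD GPUs.
 */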
#include "config.h"
#ifdef HIP_GPU
#include "cudify_hardware_common.hpp"
#define CUDIFY_ACTIVE
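// hip_runtime.h must not see __NVCC__ (it would select the CUDA code path),
// so temporarily hide the macro around the include and restore it (as an
// empty definition) afterwards.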
#ifdef __NVCC__
#undef __NVCC__
#include <hip/hip_runtime.h>
#define __NVCC__
#else
#include <hip/hip_runtime.h>
#endif
#include "util/cuda_util.hpp"
#include <vector>
#include <string.h>
#include "hipcub/hipcub.hpp"
#include "hipcub/block/block_scan.hpp"
constexpr int default_kernel_wg_threads_ = 256;
typedef hipError_t cudaError_t;
typedef hipStream_t cudaStream_t;
typedef hipDeviceProp_t cudaDeviceProp_t;
typedef cudaDeviceProp_t cudaDeviceProp;
typedef hipEvent_t cudaEvent_t;
typedef hipFuncAttributes cudaFuncAttributes;
#define cudaSuccess hipSuccess
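// Expose hipcub's BlockScan under the cub namespace so CUB-style code
// (cub::BlockScan<T,bd>) compiles unchanged against hipcub.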
namespace cub
{
template<typename T, unsigned int bd>
using BlockScan = hipcub::BlockScan<T,bd>;
}
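// Nothing to initialize for the HIP backend; kept so code written against
// the other cudify backends compiles unchanged.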
static void init_wrappers()
{}
/**
* CUDA memory copy types
*/
enum cudaMemcpyKind
{
cudaMemcpyHostToHost = 0, /**< Host -> Host */
cudaMemcpyHostToDevice = 1, /**< Host -> Device */
cudaMemcpyDeviceToHost = 2, /**< Device -> Host */
cudaMemcpyDeviceToDevice = 3, /**< Device -> Device */
cudaMemcpyDefault = 4 /**< Direction of the transfer is inferred from the pointer values. Requires unified virtual addressing */
};
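// Translate a CUDA memcpy kind to its HIP equivalent and forward to
// hipMemcpyToSymbol. The same translation is repeated in cudaMemcpy below.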
static cudaError_t cudaMemcpyToSymbol(unsigned char * global_cuda_error_array,const void * mem,size_t sz,int offset,cudaMemcpyKind opt)
{
hipMemcpyKind opt_;
switch (opt)
{
case cudaMemcpyHostToHost:
opt_ = hipMemcpyHostToHost;
break;
case cudaMemcpyHostToDevice:
opt_ = hipMemcpyHostToDevice;
break;
case cudaMemcpyDeviceToHost:
opt_ = hipMemcpyDeviceToHost;
break;
case cudaMemcpyDeviceToDevice:
opt_ = hipMemcpyDeviceToDevice;
break;
default:
opt_ = hipMemcpyDefault;
break;
}
return hipMemcpyToSymbol(global_cuda_error_array,mem,sz,offset,opt_);
}
static cudaError_t cudaDeviceSynchronize()
{
return hipDeviceSynchronize();
}
static cudaError_t cudaMemcpyFromSymbol(void * dev_mem,const unsigned char * global_cuda_error_array,size_t sz)
{
return hipMemcpyFromSymbol(dev_mem,global_cuda_error_array,sz);
}
static const char* cudaGetErrorString ( cudaError_t error )
{
return hipGetErrorString(error);
}
static cudaError_t cudaGetDevice ( int* device )
{
return hipGetDevice(device);
}
static cudaError_t cudaSetDevice ( int device )
{
return hipSetDevice(device);
}
static cudaError_t cudaMemGetInfo ( size_t* free, size_t* total )
{
return hipMemGetInfo(free,total);
}
static cudaError_t cudaFuncGetAttributes ( cudaFuncAttributes* attr, const void* func )
{
return hipFuncGetAttributes(attr,func);
}
static cudaError_t cudaGetDeviceProperties ( cudaDeviceProp* prop, int device )
{
return hipGetDeviceProperties(prop,device);
}
static cudaError_t cudaEventCreate ( cudaEvent_t* event )
{
return hipEventCreate(event);
}
static cudaError_t cudaEventDestroy ( cudaEvent_t event )
{
return hipEventDestroy(event);
}
static cudaError_t cudaMalloc ( void** devPtr, size_t size )
{
return hipMalloc(devPtr,size);
}
static cudaError_t cudaMallocHost ( void** ptr, size_t size )
{
return hipHostMalloc(ptr,size);
}
static cudaError_t cudaFree ( void* devPtr )
{
return hipFree(devPtr);
}
static cudaError_t cudaFreeHost ( void* ptr )
{
return hipHostFree(ptr);
}
static cudaError_t cudaStreamSynchronize ( cudaStream_t stream )
{
return hipStreamSynchronize(stream);
}
static cudaError_t cudaEventRecord ( cudaEvent_t event, cudaStream_t stream = 0 )
{
return hipEventRecord(event,stream);
}
static cudaError_t cudaEventSynchronize ( cudaEvent_t event )
{
return hipEventSynchronize(event);
}
static cudaError_t cudaEventElapsedTime ( float* ms, cudaEvent_t start, cudaEvent_t end )
{
return hipEventElapsedTime(ms,start,end);
}
static cudaError_t cudaGetDeviceCount ( int* count )
{
return hipGetDeviceCount(count);
}
static cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind opt )
{
hipMemcpyKind opt_;
switch (opt)
{
case cudaMemcpyHostToHost:
opt_ = hipMemcpyHostToHost;
break;
case cudaMemcpyHostToDevice:
opt_ = hipMemcpyHostToDevice;
break;
case cudaMemcpyDeviceToHost:
opt_ = hipMemcpyDeviceToHost;
break;
case cudaMemcpyDeviceToDevice:
opt_ = hipMemcpyDeviceToDevice;
break;
default:
opt_ = hipMemcpyDefault;
break;
}
return hipMemcpy(dst,src,count,opt_);
}
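/* Example (a minimal sketch, not part of the original file): with the
 * wrappers above, CUDA-style host code compiles unchanged on HIP:
 *
 *   int h_buf[64] = {0};
 *   int *d_buf;
 *   cudaMalloc((void **)&d_buf, sizeof(h_buf));
 *   cudaMemcpy(d_buf, h_buf, sizeof(h_buf), cudaMemcpyHostToDevice);
 *   cudaDeviceSynchronize();
 *   cudaFree(d_buf);
 */

// Linearize a grid/block specification: dim3-like types multiply x*y*z,
// plain integral types pass through unchanged.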
template<typename T>
struct has_work_gpu_cl_lin_blocks_
{
static unsigned int lin(const T & b)
{
return b.x * b.y * b.z;
}
};
template<>
struct has_work_gpu_cl_lin_blocks_<unsigned int>
{
static unsigned int lin(const unsigned int & b)
{
return b;
}
};
template<>
struct has_work_gpu_cl_lin_blocks_<unsigned long>
{
static unsigned int lin(const unsigned long & b)
{
return b;
}
};
template<>
struct has_work_gpu_cl_lin_blocks_<int>
{
static unsigned int lin(const int & b)
{
return b;
}
};
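// A launch has work only if both the grid and the block linearize to a
// nonzero size; used below to skip empty kernel launches.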
template<typename wthr_type, typename thr_type>
bool has_work_gpu_cl_(const wthr_type & wthr, const thr_type & thr)
{
return (has_work_gpu_cl_lin_blocks_<typename std::remove_const<wthr_type>::type>::lin(wthr) *
has_work_gpu_cl_lin_blocks_<typename std::remove_const<thr_type>::type>::lin(thr)) != 0;
}
#ifdef PRINT_CUDA_LAUNCHES
#define CUDA_LAUNCH(cuda_call,ite, ...)\
{\
CHECK_SE_CLASS1_PRE\
\
std::cout << "Launching: " << #cuda_call << std::endl;\
\
hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(ite.wthr), dim3(ite.thr), 0, 0, __VA_ARGS__);\
\
CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
}
#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
{\
dim3 wthr__(wthr_);\
dim3 thr__(thr_);\
\
ite_gpu<1> itg;\
itg.wthr = wthr__;\
itg.thr = thr__;\
\
CHECK_SE_CLASS1_PRE\
std::cout << "Launching: " << #cuda_call << std::endl;\
\
hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(itg.wthr), dim3(itg.thr), 0, 0, __VA_ARGS__);\
\
CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
}
#define CUDA_CHECK()
#else
#define CUDA_LAUNCH(cuda_call,ite, ...) \
\
{\
CHECK_SE_CLASS1_PRE\
\
if (has_work_gpu_cl_(ite.wthr,ite.thr) == true)\
{hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(ite.wthr), dim3(ite.thr), 0, 0, __VA_ARGS__);}\
\
CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
}
#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
{\
\
CHECK_SE_CLASS1_PRE\
\
if (has_work_gpu_cl_(wthr_,thr_) == true)\
{hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call),wthr_,thr_, 0, 0, __VA_ARGS__);}\
\
CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
}
#define CUDA_CHECK()
#endif
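/* Usage sketch (assumes an ite_gpu<1> launch descriptor "ite" with wthr/thr
 * members, as used above; the kernel name is illustrative):
 *
 *   CUDA_LAUNCH(my_kernel, ite, arg1, arg2);
 *
 * expands to a guarded hipLaunchKernelGGL call that is skipped when the grid
 * or block size is zero.
 */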
#endif // HIP_GPU
#endif // CUDIFY_HIP_HPP_