#ifndef CUDIFY_HIP_HPP_
#define CUDIFY_HIP_HPP_

#include "config.h"

#define CUDA_ON_BACKEND CUDA_BACKEND_HIP

#ifdef __NVCC__
    #undef __NVCC__
    #include <hip/hip_runtime.h>
    #define __NVCC__
#else
    #include <hip/hip_runtime.h>
#endif

constexpr int default_kernel_wg_threads_ = 256;

typedef hipError_t cudaError_t;
typedef hipStream_t cudaStream_t;
typedef hipDeviceProp_t cudaDeviceProp_t;
typedef cudaDeviceProp_t cudaDeviceProp;
typedef hipEvent_t cudaEvent_t;
typedef hipFuncAttributes cudaFuncAttributes;

#define cudaSuccess hipSuccess

static void init_wrappers() {}

/**
 * CUDA memory copy types
 */
enum cudaMemcpyKind
{
    cudaMemcpyHostToHost     = 0, /**< Host   -> Host */
    cudaMemcpyHostToDevice   = 1, /**< Host   -> Device */
    cudaMemcpyDeviceToHost   = 2, /**< Device -> Host */
    cudaMemcpyDeviceToDevice = 3, /**< Device -> Device */
    cudaMemcpyDefault        = 4  /**< Direction of the transfer is inferred from the pointer values. Requires unified virtual addressing */
};

static cudaError_t cudaMemcpyToSymbol(unsigned char * global_cuda_error_array, const void * mem, size_t sz, int offset, cudaMemcpyKind opt)
{
    hipMemcpyKind opt_;

    switch (opt)
    {
    case cudaMemcpyHostToHost:     opt_ = hipMemcpyHostToHost;     break;
    case cudaMemcpyHostToDevice:   opt_ = hipMemcpyHostToDevice;   break;
    case cudaMemcpyDeviceToHost:   opt_ = hipMemcpyDeviceToHost;   break;
    case cudaMemcpyDeviceToDevice: opt_ = hipMemcpyDeviceToDevice; break;
    default:                       opt_ = hipMemcpyDefault;        break;
    }

    return hipMemcpyToSymbol(global_cuda_error_array,mem,sz,offset,opt_);
}

static cudaError_t cudaDeviceSynchronize()
{
    return hipDeviceSynchronize();
}

static cudaError_t cudaMemcpyFromSymbol(void * dev_mem, const unsigned char * global_cuda_error_array, size_t sz)
{
    return hipMemcpyFromSymbol(dev_mem,global_cuda_error_array,sz);
}

static const char* cudaGetErrorString(cudaError_t error)
{
    return hipGetErrorString(error);
}

static cudaError_t cudaGetDevice(int* device)
{
    return hipGetDevice(device);
}

static cudaError_t cudaSetDevice(int device)
{
    return hipSetDevice(device);
}

static cudaError_t cudaMemGetInfo(size_t* free, size_t* total)
{
    return hipMemGetInfo(free,total);
}

static cudaError_t cudaFuncGetAttributes(cudaFuncAttributes* attr, const void* func)
{
    return hipFuncGetAttributes(attr,func);
}

static cudaError_t cudaGetDeviceProperties(cudaDeviceProp* prop, int device)
{
    return hipGetDeviceProperties(prop,device);
}

static cudaError_t cudaEventCreate(cudaEvent_t* event)
{
    return hipEventCreate(event);
}

static cudaError_t cudaEventDestroy(cudaEvent_t event)
{
    return hipEventDestroy(event);
}

static cudaError_t cudaMalloc(void** devPtr, size_t size)
{
    return hipMalloc(devPtr,size);
}

static cudaError_t cudaMallocHost(void** ptr, size_t size)
{
    return hipHostMalloc(ptr,size);
}

static cudaError_t cudaFree(void* devPtr)
{
    return hipFree(devPtr);
}

static cudaError_t cudaFreeHost(void* ptr)
{
    return hipHostFree(ptr);
}

static cudaError_t cudaStreamSynchronize(cudaStream_t stream)
{
    return hipStreamSynchronize(stream);
}

static cudaError_t cudaEventRecord(cudaEvent_t event, cudaStream_t stream = 0)
{
    return hipEventRecord(event,stream);
}

static cudaError_t cudaEventSynchronize(cudaEvent_t event)
{
    return hipEventSynchronize(event);
}

static cudaError_t cudaEventElapsedTime(float* ms, cudaEvent_t start, cudaEvent_t end)
{
    return hipEventElapsedTime(ms,start,end);
}

static cudaError_t cudaGetDeviceCount(int* count)
{
    return hipGetDeviceCount(count);
}
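// Usage sketch (illustrative only, not part of this header): with the
// wrappers above, CUDA-style host code compiles unchanged against HIP.
// The names d_buf and N below are hypothetical.
//
//   float * d_buf;
//   cudaError_t err = cudaMalloc((void **)&d_buf,N*sizeof(float)); // forwards to hipMalloc
//   if (err != cudaSuccess)
//   {printf("%s\n",cudaGetErrorString(err));}                      // forwards to hipGetErrorString
//   cudaDeviceSynchronize();                                       // forwards to hipDeviceSynchronize
//   cudaFree(d_buf);                                               // forwards to hipFree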
static cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind opt)
{
    hipMemcpyKind opt_;

    switch (opt)
    {
    case cudaMemcpyHostToHost:     opt_ = hipMemcpyHostToHost;     break;
    case cudaMemcpyHostToDevice:   opt_ = hipMemcpyHostToDevice;   break;
    case cudaMemcpyDeviceToHost:   opt_ = hipMemcpyDeviceToHost;   break;
    case cudaMemcpyDeviceToDevice: opt_ = hipMemcpyDeviceToDevice; break;
    default:                       opt_ = hipMemcpyDefault;        break;
    }

    return hipMemcpy(dst,src,count,opt_);
}

#ifdef __HIPCC__

#include "cudify_hardware_common.hpp"
#include "util/cuda_util.hpp"
#include <vector>
#include <string.h>
#include "hipcub/hipcub.hpp"
#include "hipcub/block/block_scan.hpp"

//! Trampoline kernel: forwards the block/thread indices to a device lambda
template<typename lambda_f>
__global__ void kernel_launch_lambda(lambda_f f)
{
    dim3 bid = blockIdx;
    dim3 tid = threadIdx;

    f(bid,tid);
}

//! Trampoline kernel for lambdas that read the indices themselves
template<typename lambda_f>
__global__ void kernel_launch_lambda_tls(lambda_f f)
{
    f();
}

namespace cub
{
    template<typename T, unsigned int bd>
    using BlockScan = hipcub::BlockScan<T,bd>;
}

//! Linearize a launch-size specification (dim3-like or integral) to a count
template<typename T>
struct has_work_gpu_cl_lin_blocks_
{
    static unsigned int lin(const T & b)
    {
        return b.x * b.y * b.z;
    }
};

template<>
struct has_work_gpu_cl_lin_blocks_<unsigned int>
{
    static unsigned int lin(const unsigned int & b)
    {
        return b;
    }
};

template<>
struct has_work_gpu_cl_lin_blocks_<unsigned long>
{
    static unsigned int lin(const unsigned long & b)
    {
        return b;
    }
};

template<>
struct has_work_gpu_cl_lin_blocks_<int>
{
    static unsigned int lin(const int & b)
    {
        return b;
    }
};

//! Return true when the launch configuration contains at least one thread
template<typename wthr_type, typename thr_type>
bool has_work_gpu_cl_(const wthr_type & wthr, const thr_type & thr)
{
    return (has_work_gpu_cl_lin_blocks_<typename std::remove_const<wthr_type>::type>::lin(wthr) *
            has_work_gpu_cl_lin_blocks_<typename std::remove_const<thr_type>::type>::lin(thr)) != 0;
}

#ifdef PRINT_CUDA_LAUNCHES

#define CUDA_LAUNCH(cuda_call,ite, ...)\
        {\
        CHECK_SE_CLASS1_PRE\
        \
        std::cout << "Launching: " << #cuda_call << std::endl;\
        \
        hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(ite.wthr), dim3(ite.thr), 0, 0, __VA_ARGS__);\
        \
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }

#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
        {\
        dim3 wthr__(wthr_);\
        dim3 thr__(thr_);\
        \
        ite_gpu<1> itg;\
        itg.wthr = wthr__;\
        itg.thr = thr__;\
        \
        CHECK_SE_CLASS1_PRE\
        std::cout << "Launching: " << #cuda_call << std::endl;\
        \
        hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(itg.wthr), dim3(itg.thr), 0, 0, __VA_ARGS__);\
        \
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }

#define CUDA_CHECK()

#else

#define CUDA_LAUNCH(cuda_call,ite, ...) \
        {\
        CHECK_SE_CLASS1_PRE\
        \
        if (has_work_gpu_cl_(ite.wthr,ite.thr) == true)\
        {hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call), dim3(ite.wthr), dim3(ite.thr), 0, 0, __VA_ARGS__);}\
        \
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }

#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
        {\
        CHECK_SE_CLASS1_PRE\
        \
        if (has_work_gpu_cl_(wthr_,thr_) == true)\
        {hipLaunchKernelGGL(HIP_KERNEL_NAME(cuda_call),wthr_,thr_, 0, 0, __VA_ARGS__);}\
        \
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }

#define CUDA_LAUNCH_LAMBDA(ite,lambda_f, ...)\
        {\
        CHECK_SE_CLASS1_PRE\
        \
        if (has_work_gpu_cl_(ite.wthr,ite.thr) == true)\
        {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda),ite.wthr,ite.thr, 0, 0, lambda_f);}\
        \
        CHECK_SE_CLASS1_POST("kernel_launch_lambda",__VA_ARGS__)\
        }

#define CUDA_LAUNCH_LAMBDA_TLS(ite, lambda_f, ...) \
        {\
        CHECK_SE_CLASS1_PRE\
        if (ite.wthr.x != 0)\
        {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda_tls),ite.wthr,ite.thr,0,0,lambda_f);}\
        CHECK_SE_CLASS1_POST("kernel_launch_lambda_tls",__VA_ARGS__)\
        }
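// Usage sketch (illustrative only, not part of this header): a typical
// CUDA_LAUNCH call site. The kernel name, buffer, and sizes below are
// hypothetical; ite_gpu comes from cudify_hardware_common.hpp.
//
//   ite_gpu<1> ite;
//   ite.wthr = dim3(n_blocks,1,1);
//   ite.thr = dim3(default_kernel_wg_threads_,1,1);
//   CUDA_LAUNCH(my_kernel,ite,dev_ptr,n_elements);
//
// The has_work_gpu_cl_ guard skips the launch entirely when the grid or
// block specification linearizes to zero threads.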
#define CUDA_LAUNCH_LAMBDA_DIM3_TLS(wthr_,thr_, lambda_f, ...) \
        {\
        dim3 wthr__(wthr_);\
        dim3 thr__(thr_);\
        CHECK_SE_CLASS1_PRE\
        if (wthr__.x != 0)\
        {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda_tls),wthr__,thr__, 0, 0, lambda_f);}\
        CHECK_SE_CLASS1_POST("kernel_launch_lambda_tls",__VA_ARGS__)\
        }

#define CUDA_LAUNCH_LAMBDA_DIM3(wthr_,thr_, lambda_f, ...) \
        {\
        dim3 wthr__(wthr_);\
        dim3 thr__(thr_);\
        CHECK_SE_CLASS1_PRE\
        if (wthr__.x != 0)\
        {hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_launch_lambda),wthr__,thr__, 0, 0, lambda_f);}\
        CHECK_SE_CLASS1_POST("kernel_launch_lambda",__VA_ARGS__)\
        }

#define CUDA_CHECK()

#endif // PRINT_CUDA_LAUNCHES

#endif // __HIPCC__

#endif // CUDIFY_HIP_HPP_