Skip to content
Snippets Groups Projects
Commit 6d8f28cc authored by Pietro Incardona's avatar Pietro Incardona
Browse files

Fixing DEBUG_SE1

parent 25c42428
No related branches found
No related tags found
No related merge requests found
Pipeline #4077 passed
......@@ -2,6 +2,7 @@
#include <cstddef>
#include "CudaMemory.cuh"
#include "cuda_macro.h"
#include "util/cudify/cudify.hpp"
#include <cstring>
#define CUDA_EVENT 0x1201
......@@ -19,10 +20,8 @@ bool CudaMemory::flush()
#ifdef __HIP__
CUDA_SAFE_CALL(hipMemcpy(dm,hm,sz,hipMemcpyHostToDevice));
#elif defined(CUDIFY_USE_CUDA)
CUDA_SAFE_CALL(cudaMemcpy(dm,hm,sz,cudaMemcpyHostToDevice));
#else
memcpy(dm,hm,sz);
CUDA_SAFE_CALL(cudaMemcpy(dm,hm,sz,cudaMemcpyHostToDevice));
#endif
}
......@@ -121,10 +120,8 @@ void CudaMemory::deviceToDevice(void * ptr, size_t start, size_t stop, size_t of
{
#ifdef __HIP__
CUDA_SAFE_CALL(hipMemcpy(((unsigned char *)dm)+offset,((unsigned char *)ptr)+start,(stop-start),hipMemcpyDeviceToDevice));
#elif defined(CUDIFY_USE_CUDA)
CUDA_SAFE_CALL(cudaMemcpy(((unsigned char *)dm)+offset,((unsigned char *)ptr)+start,(stop-start),cudaMemcpyDeviceToDevice));
#else
memcpy(((unsigned char *)dm)+offset,((unsigned char *)ptr)+start,(stop-start));
CUDA_SAFE_CALL(cudaMemcpy(((unsigned char *)dm)+offset,((unsigned char *)ptr)+start,(stop-start),cudaMemcpyDeviceToDevice));
#endif
}
......@@ -170,12 +167,10 @@ bool CudaMemory::copyFromPointer(const void * ptr)
CUDA_SAFE_CALL(hipHostGetDevicePointer(&dvp,hm,0));
// memory copy
memcpy(dvp,ptr,sz);
#elif defined(CUDIFY_USE_CUDA)
#else
CUDA_SAFE_CALL(cudaHostGetDevicePointer(&dvp,hm,0));
// memory copy
memcpy(dvp,ptr,sz);
#else
memcpy(hm,ptr,sz);
#endif
return true;
......@@ -202,10 +197,8 @@ bool CudaMemory::copyDeviceToDevice(const CudaMemory & m)
//! Copy the memory
#ifdef __HIP__
CUDA_SAFE_CALL(hipMemcpy(dm,m.dm,m.sz,hipMemcpyDeviceToDevice));
#elif defined(CUDIFY_USE_CUDA)
CUDA_SAFE_CALL(cudaMemcpy(dm,m.dm,m.sz,cudaMemcpyDeviceToDevice));
#else
memcpy(dm,m.dm,m.sz);
CUDA_SAFE_CALL(cudaMemcpy(dm,m.dm,m.sz,cudaMemcpyDeviceToDevice));
#endif
return true;
......@@ -307,10 +300,8 @@ bool CudaMemory::resize(size_t sz)
//! copy from the old buffer to the new one
#ifdef __HIP__
CUDA_SAFE_CALL(hipMemcpy(tdm,dm,CudaMemory::size(),hipMemcpyDeviceToDevice));
#elif defined(CUDIFY_USE_CUDA)
CUDA_SAFE_CALL(cudaMemcpy(tdm,dm,CudaMemory::size(),cudaMemcpyDeviceToDevice));
#else
memcpy(tdm,dm,CudaMemory::size());
CUDA_SAFE_CALL(cudaMemcpy(tdm,dm,CudaMemory::size(),cudaMemcpyDeviceToDevice));
#endif
}
......@@ -333,10 +324,8 @@ bool CudaMemory::resize(size_t sz)
//! copy from the old buffer to the new one
#ifdef __HIP__
CUDA_SAFE_CALL(hipMemcpy(thm,hm,CudaMemory::size(),hipMemcpyHostToHost));
#elif defined(CUDIFY_USE_CUDA)
CUDA_SAFE_CALL(cudaMemcpy(thm,hm,CudaMemory::size(),cudaMemcpyHostToHost));
#else
memcpy(thm,hm,CudaMemory::size());
CUDA_SAFE_CALL(cudaMemcpy(thm,hm,CudaMemory::size(),cudaMemcpyHostToHost));
#endif
}
......@@ -384,10 +373,8 @@ void CudaMemory::deviceToHost()
//! copy from device to host memory
#ifdef __HIP__
CUDA_SAFE_CALL(hipMemcpy(hm,dm,sz,hipMemcpyDeviceToHost));
#elif defined(CUDIFY_USE_CUDA)
CUDA_SAFE_CALL(cudaMemcpy(hm,dm,sz,cudaMemcpyDeviceToHost));
#else
memcpy(hm,dm,sz);
CUDA_SAFE_CALL(cudaMemcpy(hm,dm,sz,cudaMemcpyDeviceToHost));
#endif
}
......@@ -408,10 +395,8 @@ void CudaMemory::deviceToHost(CudaMemory & mem)
//! copy from device to host memory
#ifdef __HIP__
CUDA_SAFE_CALL(hipMemcpy(mem.hm,dm,mem.sz,hipMemcpyDeviceToHost));
#elif defined(CUDIFY_USE_CUDA)
CUDA_SAFE_CALL(cudaMemcpy(mem.hm,dm,mem.sz,cudaMemcpyDeviceToHost));
#else
memcpy(mem.hm,dm,mem.sz);
CUDA_SAFE_CALL(cudaMemcpy(mem.hm,dm,mem.sz,cudaMemcpyDeviceToHost));
#endif
}
......@@ -432,10 +417,8 @@ void CudaMemory::hostToDevice(CudaMemory & mem)
//! copy from device to host memory
#ifdef __HIP__
CUDA_SAFE_CALL(hipMemcpy(dm,mem.hm,mem.sz,hipMemcpyHostToDevice));
#elif defined(CUDIFY_USE_CUDA)
CUDA_SAFE_CALL(cudaMemcpy(dm,mem.hm,mem.sz,cudaMemcpyHostToDevice));
#else
memcpy(dm,mem.hm,mem.sz);
CUDA_SAFE_CALL(cudaMemcpy(dm,mem.hm,mem.sz,cudaMemcpyHostToDevice));
#endif
}
......@@ -448,10 +431,8 @@ void CudaMemory::hostToDevice(size_t start, size_t stop)
//! copy from device to host memory
#ifdef __HIP__
CUDA_SAFE_CALL(hipMemcpy(((unsigned char *)dm)+start,((unsigned char *)hm)+start,(stop-start),hipMemcpyHostToDevice));
#elif defined(CUDIFY_USE_CUDA)
CUDA_SAFE_CALL(cudaMemcpy(((unsigned char *)dm)+start,((unsigned char *)hm)+start,(stop-start),cudaMemcpyHostToDevice));
#else
memcpy(((unsigned char *)dm)+start,((unsigned char *)hm)+start,(stop-start));
CUDA_SAFE_CALL(cudaMemcpy(((unsigned char *)dm)+start,((unsigned char *)hm)+start,(stop-start),cudaMemcpyHostToDevice));
#endif
}
......@@ -469,10 +450,8 @@ void CudaMemory::deviceToHost(size_t start, size_t stop)
//! copy from device to host memory
#ifdef __HIP__
CUDA_SAFE_CALL(hipMemcpy(((unsigned char *)hm)+start,((unsigned char *)dm)+start,(stop-start),hipMemcpyDeviceToHost));
#elif defined(CUDIFY_USE_CUDA)
CUDA_SAFE_CALL(cudaMemcpy(((unsigned char *)hm)+start,((unsigned char *)dm)+start,(stop-start),cudaMemcpyDeviceToHost));
#else
memcpy(((unsigned char *)hm)+start,((unsigned char *)dm)+start,(stop-start));
CUDA_SAFE_CALL(cudaMemcpy(((unsigned char *)hm)+start,((unsigned char *)dm)+start,(stop-start),cudaMemcpyDeviceToHost));
#endif
}
......@@ -535,10 +514,8 @@ void CudaMemory::hostToDevice()
//! copy from device to host memory
#ifdef __HIP__
CUDA_SAFE_CALL(hipMemcpy(dm,hm,sz,hipMemcpyHostToDevice));
#elif defined(CUDIFY_USE_CUDA)
CUDA_SAFE_CALL(cudaMemcpy(dm,hm,sz,cudaMemcpyHostToDevice));
#else
memcpy(dm,hm,sz);
CUDA_SAFE_CALL(cudaMemcpy(dm,hm,sz,cudaMemcpyHostToDevice));
#endif
}
......
......@@ -207,11 +207,11 @@ static cudaError cudaMemcpyToSymbol(unsigned char * global_cuda_error_array, con
return cudaError::cudaSuccess;
}
//static cudaError cudaMemcpy( void* dst, const void* src, size_t count, cudaMemcpyKind kind)
//{
static cudaError cudaMemcpy( void* dst, const void* src, size_t count, cudaMemcpyKind kind)
{
// we copy with OpenMP (when available) the reason stay in avoiding NUMA across threads
// (in multi-socket environment)
/* {
{
char * dst_ = (char *)dst;
char * src_ = (char *)src;
for (int i = 0 ; i < count % 8 ; i++)
......@@ -224,25 +224,22 @@ static cudaError cudaMemcpyToSymbol(unsigned char * global_cuda_error_array, con
double * dst_ = (double *)(((char *)dst) + count % 8);
double * src_ = (double *)(((char *)src) + count % 8);
count /= 8;
#pragma omp parallel for
for (int i = 0 ; i < count ; i++)
{
dst_[i] = src_[i];
}
}*/
// memcpy(dst,src,count);
}
// return cudaError::cudaSuccess;
//}
return cudaError::cudaSuccess;
}
/*static cudaError cudaHostGetDevicePointer( void** pDevice, void* pHost, unsigned int flags)
static cudaError cudaHostGetDevicePointer( void** pDevice, void* pHost, unsigned int flags)
{
*pDevice = pHost;
return cudaError::cudaSuccess;
}*/
}
struct float3
{
......
......@@ -283,7 +283,7 @@ namespace mgpu
extern size_t n_workers;
static bool init_wrappers_call = false;
extern bool init_wrappers_call;
static void init_wrappers()
{
......@@ -660,6 +660,34 @@ static void exe_kernel_no_sync(lambda_f f, ite_type & ite)
}
#define CUDA_LAUNCH_DIM3_DEBUG_SE1(cuda_call,wthr_,thr_, ...)\
{\
dim3 wthr__(wthr_);\
dim3 thr__(thr_);\
\
ite_gpu<1> itg;\
itg.wthr = wthr_;\
itg.thr = thr_;\
\
gridDim.x = wthr__.x;\
gridDim.y = wthr__.y;\
gridDim.z = wthr__.z;\
\
blockDim.x = thr__.x;\
blockDim.y = thr__.y;\
blockDim.z = thr__.z;\
\
CHECK_SE_CLASS1_PRE\
std::cout << "Launching: " << #cuda_call << " (" << wthr__.x << "," << wthr__.y << "," << wthr__.z << ") (" << thr__.x << "," << thr__.y << "," << thr__.z << ")" << std::endl;\
\
exe_kernel([&]() -> void {\
\
cuda_call(__VA_ARGS__);\
\
},itg);\
\
}
#define CUDA_CHECK()
#else
......@@ -700,7 +728,7 @@ static void exe_kernel_no_sync(lambda_f f, ite_type & ite)
\
exe_kernel_lambda(lambda_f,ite);\
\
CHECK_SE_CLASS1_POST("lambda")\
CHECK_SE_CLASS1_POST("lambda",0)\
}
#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
......@@ -732,6 +760,34 @@ static void exe_kernel_no_sync(lambda_f f, ite_type & ite)
CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
}
#define CUDA_LAUNCH_DIM3_DEBUG_SE1(cuda_call,wthr_,thr_, ...)\
{\
dim3 wthr__(wthr_);\
dim3 thr__(thr_);\
\
ite_gpu<1> itg;\
itg.wthr = wthr_;\
itg.thr = thr_;\
\
gridDim.x = wthr__.x;\
gridDim.y = wthr__.y;\
gridDim.z = wthr__.z;\
\
blockDim.x = thr__.x;\
blockDim.y = thr__.y;\
blockDim.z = thr__.z;\
\
CHECK_SE_CLASS1_PRE\
\
\
exe_kernel([&]() -> void {\
\
cuda_call(__VA_ARGS__);\
\
},itg);\
\
}
#define CUDA_LAUNCH_NOSYNC(cuda_call,ite, ...) \
{\
gridDim.x = ite.wthr.x;\
......
......@@ -22,6 +22,7 @@ dim3 gridDim;
#endif
bool init_wrappers_call = false;
thread_local int vct_atomic_add;
thread_local int vct_atomic_rem;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment