From bb9266fa0731efc26118b7158ee40c9d1bc90fc5 Mon Sep 17 00:00:00 2001
From: Incardona Pietro <incardon@mpi-cbg.de>
Date: Sun, 19 Dec 2021 20:22:26 +0100
Subject: [PATCH] Memory bandwidth with lambda

---
 example/Performance/memBW/main.cu | 190 +++++++++++++++++++-----------
 1 file changed, 119 insertions(+), 71 deletions(-)

diff --git a/example/Performance/memBW/main.cu b/example/Performance/memBW/main.cu
index adb19adfc..0738bcf4f 100644
--- a/example/Performance/memBW/main.cu
+++ b/example/Performance/memBW/main.cu
@@ -5,7 +5,7 @@
 //! Memory bandwidth with small calculations
 
 template<typename vector_type, typename vector_type2>
-inline __global__ void translate_fill_prop_write(vector_type vd_out, vector_type2 vd_in)
+__global__ void translate_fill_prop_write(vector_type vd_out, vector_type2 vd_in)
 {
     auto p = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -25,7 +25,7 @@
 
 
 template<typename vector_type, typename vector_type2>
-inline __global__ void translate_fill_prop_read(vector_type vd_out, vector_type2 vd_in)
+__global__ void translate_fill_prop_read(vector_type vd_out, vector_type2 vd_in)
 {
     auto p = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -43,56 +43,9 @@ inline __global__ void translate_fill_prop_read(vector_type vd_out, vector_type2
     vd_in.template get<0>(p)[1] = e+f+g;
 }
 
-/////////////////////////////// Lambda based
-
-template<typename vector_type, typename vector_type2>
-inline __device__ void translate_fill_prop_write_notls(vector_type vd_out, vector_type2 vd_in, dim3 & blockIdx, dim3 & threadIdx)
-{
-    auto p = blockIdx.x * blockDim.x + threadIdx.x;
-
-    float a = vd_in.template get<0>(p)[0];
-    float b = vd_in.template get<0>(p)[1];
-
-    vd_out.template get<0>(p) = a + b;
-
-    vd_out.template get<1>(p)[0] = a;
-    vd_out.template get<1>(p)[1] = b;
-
-    vd_out.template get<2>(p)[0][0] = a;
-    vd_out.template get<2>(p)[0][1] = b;
-    vd_out.template get<2>(p)[1][0] = a + b;
-    vd_out.template get<2>(p)[1][1] = b - a;
-
-    vd_in.template get<0>(p)[0] = a;
-    vd_in.template get<0>(p)[1] = b;
-}
-
-
-template<typename vector_type, typename vector_type2>
-inline __device__ void translate_fill_prop_read_notls(vector_type vd_out, vector_type2 vd_in, dim3 & blockIdx, dim3 & threadIdx)
-{
-    auto p = blockIdx.x * blockDim.x + threadIdx.x;
-
-    float a = vd_out.template get<0>(p);
-
-    float b = vd_out.template get<1>(p)[0];
-    float c = vd_out.template get<1>(p)[1];
-
-    float d = vd_out.template get<2>(p)[0][0];
-    float e = vd_out.template get<2>(p)[0][1];
-    float f = vd_out.template get<2>(p)[1][0];
-    float g = vd_out.template get<2>(p)[1][1];
-
-    float h = vd_in.template get<0>(p)[0];
-    float i = vd_in.template get<0>(p)[1];
-
-    vd_in.template get<0>(p)[0] = a+b+c+d;
-    vd_in.template get<0>(p)[1] = e+f+g+h+i;
-}
-
 // Arrays
 
-inline __global__ void translate_fill_prop_write_array(float * vd_out_scal,
+__global__ void translate_fill_prop_write_array(float * vd_out_scal,
                                         float * vd_out_vec,
                                         float * vd_out_mat,
                                         float * vd_in_vec,
@@ -116,7 +69,7 @@ inline __global__ void translate_fill_prop_write_array(float * vd_out_scal,
 
 
 template<typename vector_type, typename vector_type2>
-inline __global__ void translate_fill_prop_read_array(vector_type vd_out, vector_type2 vd_in)
+__global__ void translate_fill_prop_read_array(vector_type vd_out, vector_type2 vd_in)
 {
     auto p = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -134,6 +87,66 @@ inline __global__ void translate_fill_prop_read_array(vector_type vd_out, vector
     vd_in.template get<0>(p)[1] = e+f+g;
 }
 
+template<typename in_type, typename out_type>
+void check_write(in_type & in, out_type & out)
+{
+    out.template deviceToHost<0,1,2>();
+    in.template deviceToHost<0>();
+
+    bool success = true;
+    for (int i = 0 ; i < 16777216 ; i++)
+    {
+        float a = in.template get<0>(i)[0];
+        float b = in.template get<0>(i)[1];
+
+        success &= out.template get<0>(i) == a + b;
+
+        success &= out.template get<1>(i)[0] == a;
+        success &= out.template get<1>(i)[1] == b;
+
+        success &= out.template get<2>(i)[0][0] == a;
+        success &= out.template get<2>(i)[0][1] == b;
+        success &= out.template get<2>(i)[1][0] == a + b;
+        success &= out.template get<2>(i)[1][1] == b - a;
+    }
+
+    if (success == false)
+    {
+        std::cout << "FAIL" << std::endl;
+        exit(1);
+    }
+}
+
+template<typename in_type, typename out_type>
+void check_read(in_type & in, out_type & out)
+{
+    out.template deviceToHost<0,1,2>();
+    in.template deviceToHost<0>();
+
+    bool success = true;
+    for (int i = 0 ; i < 16777216 ; i++)
+    {
+        float a = out.template get<0>(i);
+
+        float b = out.template get<1>(i)[0];
+        float c = out.template get<1>(i)[1];
+
+        float d = out.template get<2>(i)[0][0];
+        float e = out.template get<2>(i)[0][1];
+        float f = out.template get<2>(i)[1][0];
+        float g = out.template get<2>(i)[1][1];
+
+        success &= in.template get<0>(i)[0] == (a+b+c+d);
+        success &= in.template get<0>(i)[1] == (e+f+g);
+    }
+
+    if (success == false)
+    {
+        std::cout << "FAIL" << std::endl;
+        exit(1);
+    }
+}
+
 int main(int argc, char *argv[])
 {
     init_wrappers();
@@ -159,6 +172,8 @@ int main(int argc, char *argv[])
     openfpm::vector<double> res;
     res.resize(100);
 
+    in.hostToDevice<0>();
+
     for (int i = 0 ; i < 110 ; i++)
     {
         cudaDeviceSynchronize();
@@ -183,6 +198,8 @@ int main(int argc, char *argv[])
     double dev_write_tls = 0.0;
     standard_deviation(res,mean_write_tls,dev_write_tls);
 
+    check_write(in,out);
+
     for (int i = 0 ; i < 110 ; i++)
     {
         cudaDeviceSynchronize();
@@ -207,6 +224,10 @@ int main(int argc, char *argv[])
     double dev_read_tls = 0.0;
     standard_deviation(res,mean_read_tls,dev_read_tls);
 
+    check_read(in,out);
+
+    //////////////
+
     /////////////////////////////////////////// LAMBDA //////////////////////////////////////////
 
@@ -219,25 +240,22 @@ int main(int argc, char *argv[])
     auto vd_out = out.toKernel();
     auto vd_in = in.toKernel();
 
-    auto lamb = [&] __device__ (dim3 & blockIdx, dim3 & threadIdx)
+    auto lamb = [vd_out,vd_in] __device__ (dim3 & blockIdx, dim3 & threadIdx)
     {
         auto p = blockIdx.x * blockDim.x + threadIdx.x;
 
-        float a = vd_out.template get<0>(p);
-
-        float b = vd_out.template get<1>(p)[0];
-        float c = vd_out.template get<1>(p)[1];
-
-        float d = vd_out.template get<2>(p)[0][0];
-        float e = vd_out.template get<2>(p)[0][1];
-        float f = vd_out.template get<2>(p)[1][0];
-        float g = vd_out.template get<2>(p)[1][1];
-
-        float h = vd_in.template get<0>(p)[0];
-        float i = vd_in.template get<0>(p)[1];
-
-        vd_in.template get<0>(p)[0] = a+b+c+d;
-        vd_in.template get<0>(p)[1] = e+f+g+h+i;
+        float a = vd_in.template get<0>(p)[0];
+        float b = vd_in.template get<0>(p)[1];
+
+        vd_out.template get<0>(p) = a + b;
+
+        vd_out.template get<1>(p)[0] = a;
+        vd_out.template get<1>(p)[1] = b;
+
+        vd_out.template get<2>(p)[0][0] = a;
+        vd_out.template get<2>(p)[0][1] = b;
+        vd_out.template get<2>(p)[1][0] = a + b;
+        vd_out.template get<2>(p)[1][1] = b - a;
     };
 
     CUDA_LAUNCH_LAMBDA(ite, lamb);
@@ -280,12 +298,9 @@ int main(int argc, char *argv[])
         float e = vd_out.template get<2>(p)[0][1];
        float f = vd_out.template get<2>(p)[1][0];
        float g = vd_out.template get<2>(p)[1][1];
-
-        float h = vd_in.template get<0>(p)[0];
-        float i = vd_in.template get<0>(p)[1];
 
         vd_in.template get<0>(p)[0] = a+b+c+d;
-        vd_in.template get<0>(p)[1] = e+f+g+h+i;
+        vd_in.template get<0>(p)[1] = e+f+g;
     };
 
     CUDA_LAUNCH_LAMBDA(ite, lamb);
@@ -305,11 +320,44 @@ int main(int argc, char *argv[])
     double dev_read_lamb = 0.0;
     standard_deviation(res,mean_read_lamb,dev_read_lamb);
 
+    #ifdef CUDIFY_USE_CUDA
+
+    for (int i = 0 ; i < 110 ; i++)
+    {
+        cudaDeviceSynchronize();
+        timer t;
+        t.start();
+
+        float * a = (float *)in.getDeviceBuffer<0>();
+        float * b = (float *)out.getDeviceBuffer<1>();
+
+        cudaMemcpy(a,b,2*16777216*4,cudaMemcpyDeviceToDevice);
+
+        cudaDeviceSynchronize();
+
+        t.stop();
+
+        if (i >= 10)
+        {res.get(i-10) = nele*4*4 / t.getwct() * 1e-9;}
+
+        std::cout << "Time: " << t.getwct() << std::endl;
+        std::cout << "BW: " << nele*4*4 / t.getwct() * 1e-9 << " GB/s" << std::endl;
+    }
+
+    double mean_read_mes = 0.0;
+    double dev_read_mes = 0.0;
+    standard_deviation(res,mean_read_mes,dev_read_mes);
+
+    std::cout << "Average measured: " << mean_read_mes << " deviation: " << dev_read_mes << std::endl;
+
+    #endif
+
     std::cout << "Average READ with TLS: " << mean_read_tls << " deviation: " << dev_read_tls << std::endl;
     std::cout << "Average WRITE with TLS: " << mean_write_tls << " deviation: " << dev_write_tls << std::endl;
 
     std::cout << "Average READ with lamb: " << mean_read_lamb << " deviation: " << dev_read_lamb << std::endl;
     std::cout << "Average WRITE with lamb: " << mean_write_lamb << " deviation: " << dev_write_lamb << std::endl;
+
 }
 
 #else
-- 
GitLab
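
A minimal plain-CUDA sketch of the measurement pattern the patch benchmarks, for readers who want to reproduce it without the OpenFPM machinery (the toKernel() views, timer, and CUDA_LAUNCH_LAMBDA above). Everything in it is an assumption for illustration: fill_prop_write and n are hypothetical names, the kernel only mirrors the patch's 2-floats-read / 7-floats-written per element layout (scalar + 2-vector + 2x2 matrix), and only standard CUDA runtime calls are used. As in the patch, 110 iterations are run with the first 10 discarded as warm-up, and a device-to-device cudaMemcpy gives the attainable-bandwidth ceiling.

// bw_sketch.cu -- standalone sketch, not part of the patch above.
// Build: nvcc -O3 bw_sketch.cu -o bw_sketch
#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical kernel mirroring the write benchmark: per element it reads
// 2 floats and writes 7 (1 scalar + 2 vector + 4 matrix components).
__global__ void fill_prop_write(float * out_scal, float * out_vec,
                                float * out_mat, const float * in_vec, int n)
{
    int p = blockIdx.x * blockDim.x + threadIdx.x;
    if (p >= n) {return;}

    float a = in_vec[2*p];
    float b = in_vec[2*p+1];

    out_scal[p]    = a + b;
    out_vec[2*p]   = a;
    out_vec[2*p+1] = b;
    out_mat[4*p]   = a;
    out_mat[4*p+1] = b;
    out_mat[4*p+2] = a + b;
    out_mat[4*p+3] = b - a;
}

int main()
{
    const int n = 16777216;   // same element count as the benchmark above

    float *in_vec, *out_scal, *out_vec, *out_mat;
    cudaMalloc(&in_vec,  2*n*sizeof(float));
    cudaMalloc(&out_scal,  n*sizeof(float));
    cudaMalloc(&out_vec, 2*n*sizeof(float));
    cudaMalloc(&out_mat, 4*n*sizeof(float));
    cudaMemset(in_vec, 0, 2*n*sizeof(float));

    cudaEvent_t t0, t1;
    cudaEventCreate(&t0);
    cudaEventCreate(&t1);

    // 110 iterations, first 10 discarded as warm-up (as in the patch).
    for (int i = 0 ; i < 110 ; i++)
    {
        cudaEventRecord(t0);
        fill_prop_write<<<(n+1023)/1024,1024>>>(out_scal,out_vec,out_mat,in_vec,n);
        cudaEventRecord(t1);
        cudaEventSynchronize(t1);

        float ms = 0.0f;
        cudaEventElapsedTime(&ms, t0, t1);
        // 9 floats of traffic per element: 2 read + 7 written.
        if (i >= 10)
        {printf("BW: %f GB/s\n", 9.0*n*sizeof(float) / (ms*1e-3) * 1e-9);}
    }

    // Device-to-device copy as the bandwidth ceiling, like the
    // CUDIFY_USE_CUDA branch above: 2n floats read + 2n floats written.
    cudaEventRecord(t0);
    cudaMemcpy(out_vec, in_vec, 2*n*sizeof(float), cudaMemcpyDeviceToDevice);
    cudaEventRecord(t1);
    cudaEventSynchronize(t1);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, t0, t1);
    printf("memcpy BW: %f GB/s\n", 4.0*n*sizeof(float) / (ms*1e-3) * 1e-9);

    cudaFree(in_vec); cudaFree(out_scal); cudaFree(out_vec); cudaFree(out_mat);
    cudaEventDestroy(t0); cudaEventDestroy(t1);
    return 0;
}

If the write kernel is fully coalesced, its figure should sit close to the memcpy one; a large gap usually points at the access pattern rather than the device.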