From a232b73e68bc0786631277aa65a883b56e439e1c Mon Sep 17 00:00:00 2001
From: Incardona Pietro <incardon@mpi-cbg.de>
Date: Sun, 19 Dec 2021 16:02:36 +0100
Subject: [PATCH] Fixing memBW

---
 example/Performance/memBW/main.cu             | 252 +++++++++++++++---
 src/Grid/grid_dist_id.hpp                     |   2 +-
 .../tests/sgrid_dist_id_gpu_unit_tests.cu     |  58 ++++
 3 files changed, 279 insertions(+), 33 deletions(-)

diff --git a/example/Performance/memBW/main.cu b/example/Performance/memBW/main.cu
index dfa89210d..adb19adfc 100644
--- a/example/Performance/memBW/main.cu
+++ b/example/Performance/memBW/main.cu
@@ -21,9 +21,6 @@ inline __global__ void translate_fill_prop_write(vector_type vd_out, vector_type
 	vd_out.template get<2>(p)[0][1] = b;
 	vd_out.template get<2>(p)[1][0] = a + b;
 	vd_out.template get<2>(p)[1][1] = b - a;
-
-	vd_in.template get<0>(p)[0] += a;
-	vd_in.template get<0>(p)[1] += b;
 }
 
 
@@ -37,6 +34,50 @@ inline __global__ void translate_fill_prop_read(vector_type vd_out, vector_type2
 	float b = vd_out.template get<1>(p)[0];
 	float c = vd_out.template get<1>(p)[1];
 
+	float d = vd_out.template get<2>(p)[0][0];
+	float e = vd_out.template get<2>(p)[0][1];
+	float f = vd_out.template get<2>(p)[1][0];
+	float g = vd_out.template get<2>(p)[1][1];
+
+	vd_in.template get<0>(p)[0] = a+b+c+d;
+	vd_in.template get<0>(p)[1] = e+f+g;
+}
+
+/////////////////////////////// Lambda based
+
+template<typename vector_type, typename vector_type2>
+inline __device__ void translate_fill_prop_write_notls(vector_type vd_out, vector_type2 vd_in, dim3 & blockIdx, dim3 & threadIdx)
+{
+	auto p = blockIdx.x * blockDim.x + threadIdx.x;
+
+	float a = vd_in.template get<0>(p)[0];
+	float b = vd_in.template get<0>(p)[1];
+
+	vd_out.template get<0>(p) = a + b;
+
+	vd_out.template get<1>(p)[0] = a;
+	vd_out.template get<1>(p)[1] = b;
+
+	vd_out.template get<2>(p)[0][0] = a;
+	vd_out.template get<2>(p)[0][1] = b;
+	vd_out.template get<2>(p)[1][0] = a + b;
+	vd_out.template get<2>(p)[1][1] = b - a;
+
+	vd_in.template get<0>(p)[0] = a;
+	vd_in.template get<0>(p)[1] = b;
+}
+
+
+template<typename vector_type, typename vector_type2>
+inline __device__ void translate_fill_prop_read_notls(vector_type vd_out, vector_type2 vd_in, dim3 & blockIdx, dim3 & threadIdx)
+{
+	auto p = blockIdx.x * blockDim.x + threadIdx.x;
+
+	float a = vd_out.template get<0>(p);
+
+	float b = vd_out.template get<1>(p)[0];
+	float c = vd_out.template get<1>(p)[1];
+
 	float d = vd_out.template get<2>(p)[0][0];
 	float e = vd_out.template get<2>(p)[0][1];
 	float f = vd_out.template get<2>(p)[1][0];
@@ -49,6 +90,50 @@ inline __global__ void translate_fill_prop_read(vector_type vd_out, vector_type2
 	vd_in.template get<0>(p)[1] = e+f+g+h+i;
 }
 
+// Arrays
+
+inline __global__ void translate_fill_prop_write_array(float * vd_out_scal,
+                                                        float * vd_out_vec,
+                                                        float * vd_out_mat,
+                                                        float * vd_in_vec,
+                                                        int stride)
+{
+	auto p = blockIdx.x * blockDim.x + threadIdx.x;
+
+	float a = vd_in_vec[p + 0*stride];
+	float b = vd_in_vec[p + 1*stride];
+
+	vd_out_scal[p] = a + b;
+
+	vd_out_vec[p + 0*stride] = a;
+	vd_out_vec[p + 1*stride] = b;
+
+	vd_out_mat[p + 0*2*stride + 0*stride ] = a;
+	vd_out_mat[p + 0*2*stride + 1*stride ] = b;
+	vd_out_mat[p + 1*2*stride + 0*stride ] = a + b;
+	vd_out_mat[p + 1*2*stride + 1*stride ] = b - a;
+}
+
+
+template<typename vector_type, typename vector_type2>
+inline __global__ void translate_fill_prop_read_array(vector_type vd_out, vector_type2 vd_in)
+{
+	auto p = blockIdx.x * blockDim.x + threadIdx.x;
+
+	float a = vd_out.template get<0>(p);
+
+	float b = vd_out.template get<1>(p)[0];
+	float c = vd_out.template get<1>(p)[1];
+
+	float d = vd_out.template get<2>(p)[0][0];
+	float e = vd_out.template get<2>(p)[0][1];
+	float f = vd_out.template get<2>(p)[1][0];
+	float g = vd_out.template get<2>(p)[1][1];
+
+	vd_in.template get<0>(p)[0] = a+b+c+d;
+	vd_in.template get<0>(p)[1] = e+f+g;
+}
+
 int main(int argc, char *argv[])
 {
 	init_wrappers();
@@ -67,61 +152,164 @@ int main(int argc, char *argv[])
 		in.template get<0>(i)[1] = i+100.0;
 	}
 
+	// Read write test with TLS
+
 	auto ite = out.getGPUIterator(256);
 
 	openfpm::vector<double> res;
 	res.resize(100);
 
-	for (int i = 0 ; i < 101 ; i++)
+	for (int i = 0 ; i < 110 ; i++)
+	{
+	        cudaDeviceSynchronize();
+	        timer t;
+	        t.start();
+
+
+	        CUDA_LAUNCH(translate_fill_prop_write,ite,out.toKernel(),in.toKernel());
+
+	        cudaDeviceSynchronize();
+
+	        t.stop();
+
+	        if (i >=10)
+	        {res.get(i-10) = nele*4*9 / t.getwct() * 1e-9;}
+
+	        std::cout << "Time: " << t.getwct() << std::endl;
+	        std::cout << "BW: " << nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
+	}
+
+	double mean_write_tls = 0.0;
+	double dev_write_tls = 0.0;
+	standard_deviation(res,mean_write_tls,dev_write_tls);
+
+	for (int i = 0 ; i < 110 ; i++)
 	{
 		cudaDeviceSynchronize();
-		timer t;
-		t.start();
+	        timer t;
+	        t.start();
 
-		CUDA_LAUNCH(translate_fill_prop_write,ite,out.toKernel(),in.toKernel());
+	        CUDA_LAUNCH(translate_fill_prop_read,ite,out.toKernel(),in.toKernel());
 
-		cudaDeviceSynchronize();
+	        cudaDeviceSynchronize();
 
-		t.stop();
+	        t.stop();
 
-		if (i >=1)
-		{res.get(i-1) = nele*4*13 / t.getwct() * 1e-9;}
+	        if (i >=10)
+	        {res.get(i-10) = nele*4*9 / t.getwct() * 1e-9;}
 
-		std::cout << "Time: " << t.getwct() << std::endl;
-		std::cout << "BW: " << nele*4*13 / t.getwct() * 1e-9 << " GB/s" << std::endl;
+	        std::cout << "Time: " << t.getwct() << std::endl;
+	        std::cout << "BW: " << nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
 	}
 
-	double mean_write = 0.0;
-	double dev_write = 0.0;
-	standard_deviation(res,mean_write,dev_write);
+	double mean_read_tls = 0.0;
+	double dev_read_tls = 0.0;
+	standard_deviation(res,mean_read_tls,dev_read_tls);
+
+	/////////////////////////////////////////// LAMBDA //////////////////////////////////////////
+
 
-	for (int i = 0 ; i < 101 ; i++)
+	for (int i = 0 ; i < 110 ; i++)
 	{
-		cudaDeviceSynchronize();
-		timer t;
-		t.start();
+	        cudaDeviceSynchronize();
+	        timer t;
+	        t.start();
+
+		auto vd_out = out.toKernel();
+		auto vd_in = in.toKernel();
+
+		auto lamb = [vd_out,vd_in] __device__ (dim3 & blockIdx, dim3 & threadIdx)
+		{
+			auto p = blockIdx.x * blockDim.x + threadIdx.x;
+
+			float a = vd_in.template get<0>(p)[0];
+			float b = vd_in.template get<0>(p)[1];
+
+			vd_out.template get<0>(p) = a + b;
+
+			vd_out.template get<1>(p)[0] = a;
+			vd_out.template get<1>(p)[1] = b;
+
+			vd_out.template get<2>(p)[0][0] = a;
+			vd_out.template get<2>(p)[0][1] = b;
+			vd_out.template get<2>(p)[1][0] = a + b;
+			vd_out.template get<2>(p)[1][1] = b - a;
+		};
+
+		CUDA_LAUNCH_LAMBDA(ite, lamb);
+
+		cudaDeviceSynchronize();
+		t.stop();
 
-		CUDA_LAUNCH(translate_fill_prop_read,ite,out.toKernel(),in.toKernel());
+		if (i >=10)
+		{res.get(i-10) = nele*4*9 / t.getwct() * 1e-9;}
 
-		cudaDeviceSynchronize();
+		std::cout << "Time: " << t.getwct() << std::endl;
+		std::cout << "BW: " << nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
+	}
 
-		t.stop();
+	double mean_write_lamb = 0.0;
+	double dev_write_lamb = 0.0;
+	standard_deviation(res,mean_write_lamb,dev_write_lamb);
+
-		if (i >=1)
-		{res.get(i-1) = nele*4*11 / t.getwct() * 1e-9;}
+	for (int i = 0 ; i < 110 ; i++)
+	{
+	        cudaDeviceSynchronize();
+	        timer t;
+	        t.start();
+
+
+		auto vd_out = out.toKernel();
+		auto vd_in = in.toKernel();
+
+		auto lamb = [vd_out,vd_in] __device__ (dim3 & blockIdx, dim3 & threadIdx)
+		{
+			auto p = blockIdx.x * blockDim.x + threadIdx.x;
+
+			float a = vd_out.template get<0>(p);
+
+			float b = vd_out.template get<1>(p)[0];
+			float c = vd_out.template get<1>(p)[1];
+
+			float d = vd_out.template get<2>(p)[0][0];
+			float e = vd_out.template get<2>(p)[0][1];
+			float f = vd_out.template get<2>(p)[1][0];
+			float g = vd_out.template get<2>(p)[1][1];
+
+			float h = vd_in.template get<0>(p)[0];
+			float i = vd_in.template get<0>(p)[1];
+
+			vd_in.template get<0>(p)[0] = a+b+c+d;
+			vd_in.template get<0>(p)[1] = e+f+g+h+i;
+		};
+
+		CUDA_LAUNCH_LAMBDA(ite, lamb);
 
-		std::cout << "Time: " << t.getwct() << std::endl;
-		std::cout << "BW: " << nele*4*11 / t.getwct() * 1e-9 << " GB/s" << std::endl;
+		cudaDeviceSynchronize();
+
+		t.stop();
+
+		if (i >=10)
+		{res.get(i-10) = nele*4*9 / t.getwct() * 1e-9;}
+
+		std::cout << "Time: " << t.getwct() << std::endl;
+		std::cout << "BW: " << nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl;
 	}
 
-	double mean_read = 0.0;
-	double dev_read = 0.0;
-	standard_deviation(res,mean_read,dev_read);
+	double mean_read_lamb = 0.0;
+	double dev_read_lamb = 0.0;
+	standard_deviation(res,mean_read_lamb,dev_read_lamb);
+
+	std::cout << "Average READ with TLS: " << mean_read_tls << " deviation: " << dev_read_tls << std::endl;
+	std::cout << "Average WRITE with TLS: " << mean_write_tls << " deviation: " << dev_write_tls << std::endl;
 
-	std::cout << "Average READ: " << mean_read << " deviation: " << dev_read << std::endl;
-	std::cout << "Average WRITE: " << mean_write << " deviation: " << dev_write << std::endl;
+	std::cout << "Average READ with lamb: " << mean_read_lamb << " deviation: " << dev_read_lamb << std::endl;
+	std::cout << "Average WRITE with lamb: " << mean_write_lamb << " deviation: " << dev_write_lamb << std::endl;
 }
 
 #else
diff --git a/src/Grid/grid_dist_id.hpp b/src/Grid/grid_dist_id.hpp
index 05536b537..6007fec2f 100644
--- a/src/Grid/grid_dist_id.hpp
+++ b/src/Grid/grid_dist_id.hpp
@@ -3248,7 +3248,7 @@ public:
 			for (int j = 0 ; j < dim ; j++)
 			{key_dst.set_d(j,key.get(j) + orig.get(j) + kp1.get(j));}
 
-			dg.get_o(key_dst) = lg.get_o(key);
+			dg.insert_o(key_dst) = lg.get_o(key);
 
 			++it_src;
 		}
diff --git a/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu b/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
index 824f10713..cb5761cf1 100644
--- a/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
+++ b/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
@@ -26,6 +26,8 @@ struct insert_kernel2D
 	}
 };
 
+
+
 template<unsigned int p>
 struct insert_kernel3D
 {
@@ -184,6 +186,62 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_output )
 #endif
 }
 
+
+BOOST_AUTO_TEST_CASE( sgrid_gpu_test_save_and_load )
+{
+/*	auto & v_cl = create_vcluster();
+
+	if (v_cl.size() > 3){return;}
+
+	size_t sz[2] = {17,17};
+	periodicity<2> bc = {PERIODIC,PERIODIC};
+
+	Ghost<2,long int> g(1);
+
+	Box<2,float> domain({0.0,0.0},{1.0,1.0});
+
+	sgrid_dist_id_gpu<2,float,aggregate<float,float,float[2]>> gdist(sz,domain,g,bc);
+
+	gdist.template setBackgroundValue<0>(666);
+
+	/////// GPU insert + flush
+
+	Box<2,size_t> box({1,1},{15,15});
+	auto it = gdist.getGridIterator(box.getKP1(),box.getKP2());
+
+	/////// GPU Run kernel
+
+	typedef typename GetAddBlockType<decltype(gdist)>::type InsertBlockT;
+
+	float c = 5.0;
+
+	gdist.addPoints([] __device__ (int i, int j)
+			{
+				return true;
+			},
+			[c] __device__ (InsertBlockT & data, int i, int j)
+			{
+				data.template get<0>() = c + i + j;
+				data.template get<1>() = c + 1000 + i + j;
+
+				data.template get<2>()[0] = i;
+				data.template get<2>()[1] = j;
+			}
+			);
+
+	gdist.template flush<smax_<0>,smax_<1>,smax_<2>>(flush_type::FLUSH_ON_DEVICE);
+
+	gdist.template deviceToHost<0>();
+
+	gdist.save("sgrid_gpu_output_hdf5");
+
+	// Now load
+
+	sgrid_dist_id_gpu<2,float,aggregate<float,float,float[2]>> gdist2(sz,domain,g,bc);
+
+	gdist2.load("sgrid_gpu_output_hdf5");*/
+}
+
 void sgrid_ghost_get(size_t (& sz)[2],size_t (& sz2)[2])
 {
 	periodicity<2> bc = {PERIODIC,PERIODIC};
-- 
GitLab