From ef72e544ea778b0b043fc10baced7a1e05a1a88a Mon Sep 17 00:00:00 2001 From: Incardona Pietro <incardon@mpi-cbg.de> Date: Thu, 16 Dec 2021 20:34:15 +0100 Subject: [PATCH] Separate read and write --- example/Performance/memBW/main.cu | 79 ++++++++++++++++++++++++------- 1 file changed, 63 insertions(+), 16 deletions(-) diff --git a/example/Performance/memBW/main.cu b/example/Performance/memBW/main.cu index eb50be832..124fcae6d 100644 --- a/example/Performance/memBW/main.cu +++ b/example/Performance/memBW/main.cu @@ -5,7 +5,7 @@ //! Memory bandwidth with small calculations template<typename vector_type, typename vector_type2> -inline __global__ void translate_fill_prop(vector_type vd_out, vector_type2 vd_in) +inline __global__ void translate_fill_prop_write(vector_type vd_out, vector_type2 vd_in) { auto p = blockIdx.x * blockDim.x + threadIdx.x; @@ -27,12 +27,34 @@ inline __global__ void translate_fill_prop(vector_type vd_out, vector_type2 vd_i } +template<typename vector_type, typename vector_type2> +inline __global__ void translate_fill_prop_read(vector_type vd_out, vector_type2 vd_in) +{ + auto p = blockIdx.x * blockDim.x + threadIdx.x; + + float a = vd_out.template get<0>(p); + + float b = vd_out.template get<1>(p)[0]; + float c = vd_out.template get<1>(p)[1]; + + float d = vd_out.template get<2>(p)[0][0]; + float e = vd_out.template get<2>(p)[0][1]; + float f = vd_out.template get<2>(p)[1][0]; + float g = vd_out.template get<2>(p)[1][1]; + + float h = vd_in.template get<0>(p)[0]; + float i = vd_in.template get<0>(p)[1]; + + vd_in.template get<0>(p)[0] += a+b+c+d; + vd_in.template get<0>(p)[1] += e+f+g+h+i; +} + int main(int argc, char *argv[]) { init_wrappers(); - openfpm::vector_gpu<aggregate<double,double[2],double[2][2]>> out; - openfpm::vector_gpu<aggregate<double[2]>> in; + openfpm::vector_gpu<aggregate<float,float[2],float[2][2]>> out; + openfpm::vector_gpu<aggregate<float[2]>> in; int nele = 16777216; @@ -52,29 +74,54 @@ int main(int argc, char *argv[]) for (int i = 0 ; i < 101 ; i++) { - cudaDeviceSynchronize(); - timer t; - t.start(); + cudaDeviceSynchronize(); + timer t; + t.start(); + + CUDA_LAUNCH(translate_fill_prop_write,ite,out.toKernel(),in.toKernel()); - CUDA_LAUNCH(translate_fill_prop,ite,out.toKernel(),in.toKernel()); + cudaDeviceSynchronize(); + t.stop(); + + if (i >=1) + {res.get(i-1) = nele*4*13 / t.getwct() * 1e-9;} + + std::cout << "Time: " << t.getwct() << std::endl; + std::cout << "BW: " << nele*4*13 / t.getwct() * 1e-9 << " GB/s" << std::endl; + } + + double mean_write = 0.0; + double dev_write = 0.0; + standard_deviation(res,mean_write,dev_write); + + for (int i = 0 ; i < 101 ; i++) + { cudaDeviceSynchronize(); + timer t; + t.start(); + + + CUDA_LAUNCH(translate_fill_prop_read,ite,out.toKernel(),in.toKernel()); + + cudaDeviceSynchronize(); - t.stop(); + t.stop(); - if (i >=1) - {res.get(i-1) = nele*8*11 / t.getwct() * 1e-9;} + if (i >=1) + {res.get(i-1) = nele*4*13 / t.getwct() * 1e-9;} - std::cout << "Time: " << t.getwct() << std::endl; - std::cout << "BW: " << nele*8*11 / t.getwct() * 1e-9 << " GB/s" << std::endl; + std::cout << "Time: " << t.getwct() << std::endl; + std::cout << "BW: " << nele*4*13 / t.getwct() * 1e-9 << " GB/s" << std::endl; } - double mean = 0.0; - double dev = 0.0; - standard_deviation(res,mean,dev); + double mean_read = 0.0; + double dev_read = 0.0; + standard_deviation(res,mean_read,dev_read); - std::cout << "Average: " << mean << " deviation: " << dev << std::endl; + std::cout << "Average READ: " << mean_read << " deviation: " << dev_read << std::endl; + std::cout << "Average WRITE: " << mean_write << " deviation: " << dev_write << std::endl; } #else -- GitLab