From ef72e544ea778b0b043fc10baced7a1e05a1a88a Mon Sep 17 00:00:00 2001
From: Incardona Pietro <incardon@mpi-cbg.de>
Date: Thu, 16 Dec 2021 20:34:15 +0100
Subject: [PATCH] Separate read and write

---
 example/Performance/memBW/main.cu | 79 ++++++++++++++++++++++++-------
 1 file changed, 63 insertions(+), 16 deletions(-)

diff --git a/example/Performance/memBW/main.cu b/example/Performance/memBW/main.cu
index eb50be832..124fcae6d 100644
--- a/example/Performance/memBW/main.cu
+++ b/example/Performance/memBW/main.cu
@@ -5,7 +5,7 @@
 
 //! Memory bandwidth with small calculations
 template<typename vector_type, typename vector_type2>
-inline __global__ void translate_fill_prop(vector_type vd_out, vector_type2 vd_in)
+inline __global__ void translate_fill_prop_write(vector_type vd_out, vector_type2 vd_in)
 {
 	auto p = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -27,12 +27,34 @@ inline __global__ void translate_fill_prop(vector_type vd_out, vector_type2 vd_i
 }
 
 
+template<typename vector_type, typename vector_type2>
+inline __global__ void translate_fill_prop_read(vector_type vd_out, vector_type2 vd_in)
+{	// Read-dominated kernel: each thread loads 7 values from vd_out and 2 from vd_in, then accumulates their sums into vd_in
+	auto p = blockIdx.x * blockDim.x + threadIdx.x;	// flat 1D global thread index, used directly as the element index
+	// NOTE(review): there is no "p < number of elements" guard here - assumes the launch covers exactly the vector length; confirm against the caller
+	float a = vd_out.template get<0>(p);	// property 0 of vd_out: a single value
+	// Property 1 of vd_out: two components
+	float b = vd_out.template get<1>(p)[0];
+	float c = vd_out.template get<1>(p)[1];
+	// Property 2 of vd_out: 2x2 components
+	float d = vd_out.template get<2>(p)[0][0];
+	float e = vd_out.template get<2>(p)[0][1];
+	float f = vd_out.template get<2>(p)[1][0];
+	float g = vd_out.template get<2>(p)[1][1];
+	// Current content of property 0 of vd_in (two components)
+	float h = vd_in.template get<0>(p)[0];
+    float i = vd_in.template get<0>(p)[1];
+	// Fold every loaded value into vd_in (read-modify-write) - presumably so that none of the loads above can be discarded as unused
+	vd_in.template get<0>(p)[0] += a+b+c+d;
+	vd_in.template get<0>(p)[1] += e+f+g+h+i;
+}
+
 int main(int argc, char *argv[])
 {
     init_wrappers();
 
-    openfpm::vector_gpu<aggregate<double,double[2],double[2][2]>> out;
-    openfpm::vector_gpu<aggregate<double[2]>> in;
+    openfpm::vector_gpu<aggregate<float,float[2],float[2][2]>> out;
+    openfpm::vector_gpu<aggregate<float[2]>> in;
 
     int nele = 16777216;
 
@@ -52,29 +74,54 @@ int main(int argc, char *argv[])
 
     for (int i = 0 ; i < 101 ; i++)
     {
-	cudaDeviceSynchronize();
-        timer t;
-        t.start();
+        cudaDeviceSynchronize();
+            timer t;
+            t.start();
+
 
+            CUDA_LAUNCH(translate_fill_prop_write,ite,out.toKernel(),in.toKernel());
 
-        CUDA_LAUNCH(translate_fill_prop,ite,out.toKernel(),in.toKernel());
+            cudaDeviceSynchronize();
 
+            t.stop();
+
+        if (i >=1)
+        {res.get(i-1) = nele*4*13 / t.getwct() * 1e-9;}
+
+            std::cout << "Time: " << t.getwct() << std::endl;
+            std::cout << "BW: " << nele*4*13 / t.getwct() * 1e-9 << " GB/s"  << std::endl;
+    }
+
+    double mean_write = 0.0;
+    double dev_write = 0.0;
+    standard_deviation(res,mean_write,dev_write);
+
+    for (int i = 0 ; i < 101 ; i++)
+    {
         cudaDeviceSynchronize();
+            timer t;
+            t.start();
+
+
+            CUDA_LAUNCH(translate_fill_prop_read,ite,out.toKernel(),in.toKernel());
+
+            cudaDeviceSynchronize();
 
-        t.stop();
+            t.stop();
 
-	if (i >=1)
-	{res.get(i-1) = nele*8*11 / t.getwct() * 1e-9;}
+        if (i >=1)
+        {res.get(i-1) = nele*4*13 / t.getwct() * 1e-9;}
 
-        std::cout << "Time: " << t.getwct() << std::endl;
-        std::cout << "BW: " << nele*8*11 / t.getwct() * 1e-9 << " GB/s"  << std::endl;
+            std::cout << "Time: " << t.getwct() << std::endl;
+            std::cout << "BW: " << nele*4*13 / t.getwct() * 1e-9 << " GB/s"  << std::endl;
     }
 
-    double mean = 0.0;
-    double dev = 0.0;
-    standard_deviation(res,mean,dev);
+    double mean_read = 0.0;
+    double dev_read = 0.0;
+    standard_deviation(res,mean_read,dev_read);
 
-    std::cout << "Average: " << mean << "  deviation: " << dev << std::endl;
+    std::cout << "Average READ: " << mean_read << "  deviation: " << dev_read << std::endl;
+    std::cout << "Average WRITE: " << mean_write << "  deviation: " << dev_write << std::endl;
 }
 
 #else
-- 
GitLab