From a232b73e68bc0786631277aa65a883b56e439e1c Mon Sep 17 00:00:00 2001
From: Incardona Pietro <incardon@mpi-cbg.de>
Date: Sun, 19 Dec 2021 16:02:36 +0100
Subject: [PATCH] Fixing memBW

---
 example/Performance/memBW/main.cu             | 252 +++++++++++++++---
 src/Grid/grid_dist_id.hpp                     |   2 +-
 .../tests/sgrid_dist_id_gpu_unit_tests.cu     |  58 ++++
 3 files changed, 279 insertions(+), 33 deletions(-)

diff --git a/example/Performance/memBW/main.cu b/example/Performance/memBW/main.cu
index dfa89210d..adb19adfc 100644
--- a/example/Performance/memBW/main.cu
+++ b/example/Performance/memBW/main.cu
@@ -21,9 +21,6 @@ inline __global__ void translate_fill_prop_write(vector_type vd_out, vector_type
 	vd_out.template get<2>(p)[0][1] = b;
 	vd_out.template get<2>(p)[1][0] = a + b;
 	vd_out.template get<2>(p)[1][1] = b - a;
-
-	vd_in.template get<0>(p)[0] += a;
-	vd_in.template get<0>(p)[1] += b;
 }
 
 
@@ -37,6 +34,50 @@ inline __global__ void translate_fill_prop_read(vector_type vd_out, vector_type2
 	float b = vd_out.template get<1>(p)[0];
 	float c = vd_out.template get<1>(p)[1];
 
+	float d = vd_out.template get<2>(p)[0][0];
+	float e = vd_out.template get<2>(p)[0][1];
+	float f = vd_out.template get<2>(p)[1][0];
+	float g = vd_out.template get<2>(p)[1][1];
+    
+	vd_in.template get<0>(p)[0] = a+b+c+d;
+	vd_in.template get<0>(p)[1] = e+f+g;
+}
+
+/////////////////////////////// Lambda based
+
+template<typename vector_type, typename vector_type2>
+inline __device__ void translate_fill_prop_write_notls(vector_type vd_out, vector_type2 vd_in, dim3 & blockIdx, dim3 & threadIdx)
+{
+	auto p = blockIdx.x * blockDim.x + threadIdx.x;
+
+	float a = vd_in.template get<0>(p)[0];
+	float b = vd_in.template get<0>(p)[1];
+
+	vd_out.template get<0>(p) = a + b;
+
+	vd_out.template get<1>(p)[0] = a;
+	vd_out.template get<1>(p)[1] = b;
+
+	vd_out.template get<2>(p)[0][0] = a;
+	vd_out.template get<2>(p)[0][1] = b;
+	vd_out.template get<2>(p)[1][0] = a + b;
+	vd_out.template get<2>(p)[1][1] = b - a;
+
+	vd_in.template get<0>(p)[0] = a;
+	vd_in.template get<0>(p)[1] = b;
+}
+
+
+template<typename vector_type, typename vector_type2>
+inline __device__ void translate_fill_prop_read_notls(vector_type vd_out, vector_type2 vd_in, dim3 & blockIdx, dim3 & threadIdx)
+{
+	auto p = blockIdx.x * blockDim.x + threadIdx.x;
+
+	float a = vd_out.template get<0>(p);
+
+	float b = vd_out.template get<1>(p)[0];
+	float c = vd_out.template get<1>(p)[1];
+
 	float d = vd_out.template get<2>(p)[0][0];
 	float e = vd_out.template get<2>(p)[0][1];
 	float f = vd_out.template get<2>(p)[1][0];
@@ -49,6 +90,50 @@ inline __global__ void translate_fill_prop_read(vector_type vd_out, vector_type2
 	vd_in.template get<0>(p)[1] = e+f+g+h+i;
 }
 
+// Arrays
+
+inline __global__ void translate_fill_prop_write_array(float * vd_out_scal,
+                                                       float * vd_out_vec,
+                                                       float * vd_out_mat,
+                                                       float * vd_in_vec,
+                                                       int stride)
+{
+	auto p = blockIdx.x * blockDim.x + threadIdx.x;
+
+	float a = vd_in_vec[p* + 0*stride];
+	float b = vd_in_vec[p* + 1*stride];
+
+	vd_out_scal[p] = a + b;
+
+	vd_out_vec[p + 0*stride] = a;
+	vd_out_vec[p + 1*stride] = b;
+
+	vd_out_mat[p + 0*2*stride + 0*stride ] = a;
+	vd_out_mat[p + 0*2*stride + 1*stride ] = b;
+	vd_out_mat[p + 1*2*stride + 0*stride ] = a + b;
+	vd_out_mat[p + 1*2*stride + 1*stride ] = b - a;
+}
+
+
+template<typename vector_type, typename vector_type2>
+inline __global__ void translate_fill_prop_read_array(vector_type vd_out, vector_type2 vd_in)
+{
+	auto p = blockIdx.x * blockDim.x + threadIdx.x;
+
+	float a = vd_out.template get<0>(p);
+
+	float b = vd_out.template get<1>(p)[0];
+	float c = vd_out.template get<1>(p)[1];
+
+	float d = vd_out.template get<2>(p)[0][0];
+	float e = vd_out.template get<2>(p)[0][1];
+	float f = vd_out.template get<2>(p)[1][0];
+	float g = vd_out.template get<2>(p)[1][1];
+    
+	vd_in.template get<0>(p)[0] = a+b+c+d;
+	vd_in.template get<0>(p)[1] = e+f+g;
+}
+
 int main(int argc, char *argv[])
 {
     init_wrappers();
@@ -67,61 +152,164 @@ int main(int argc, char *argv[])
         in.template get<0>(i)[1] = i+100.0;
     }
 
+    // Read write test with TLS
+
     auto ite = out.getGPUIterator(256);
 
     openfpm::vector<double> res;
     res.resize(100);
 
-    for (int i = 0 ; i < 101 ; i++)
+    for (int i = 0 ; i < 110 ; i++)
+    {
+        cudaDeviceSynchronize();
+        timer t;
+        t.start();
+
+
+        CUDA_LAUNCH(translate_fill_prop_write,ite,out.toKernel(),in.toKernel());
+
+        cudaDeviceSynchronize();
+
+        t.stop();
+
+        if (i >=10)
+        {res.get(i-10) = nele*4*9 / t.getwct() * 1e-9;}
+
+        std::cout << "Time: " << t.getwct() << std::endl;
+        std::cout << "BW: " << nele*4*9 / t.getwct() * 1e-9 << " GB/s"  << std::endl;
+    }
+
+    double mean_write_tls = 0.0;
+    double dev_write_tls = 0.0;
+    standard_deviation(res,mean_write_tls,dev_write_tls);
+
+    for (int i = 0 ; i < 110 ; i++)
     {
         cudaDeviceSynchronize();
-            timer t;
-            t.start();
+        timer t;
+        t.start();
 
 
-            CUDA_LAUNCH(translate_fill_prop_write,ite,out.toKernel(),in.toKernel());
+        CUDA_LAUNCH(translate_fill_prop_read,ite,out.toKernel(),in.toKernel());
 
-            cudaDeviceSynchronize();
+        cudaDeviceSynchronize();
 
-            t.stop();
+        t.stop();
 
-        if (i >=1)
-        {res.get(i-1) = nele*4*13 / t.getwct() * 1e-9;}
+        if (i >=10)
+        {res.get(i-10) = nele*4*9 / t.getwct() * 1e-9;}
 
-            std::cout << "Time: " << t.getwct() << std::endl;
-            std::cout << "BW: " << nele*4*13 / t.getwct() * 1e-9 << " GB/s"  << std::endl;
+        std::cout << "Time: " << t.getwct() << std::endl;
+        std::cout << "BW: " << nele*4*9 / t.getwct() * 1e-9 << " GB/s"  << std::endl;
     }
 
-    double mean_write = 0.0;
-    double dev_write = 0.0;
-    standard_deviation(res,mean_write,dev_write);
+    double mean_read_tls = 0.0;
+    double dev_read_tls = 0.0;
+    standard_deviation(res,mean_read_tls,dev_read_tls);
+
+    /////////////////////////////////////////// LAMBDA //////////////////////////////////////////
+
 
-    for (int i = 0 ; i < 101 ; i++)
+    for (int i = 0 ; i < 110 ; i++)
     {
-            cudaDeviceSynchronize();
-            timer t;
-            t.start();
+        cudaDeviceSynchronize();
+        timer t;
+        t.start();
+
+        auto vd_out = out.toKernel();
+        auto vd_in = in.toKernel();
+
+        auto lamb = [&] __device__ (dim3 & blockIdx, dim3 & threadIdx)
+        {
+            auto p = blockIdx.x * blockDim.x + threadIdx.x;
+
+            float a = vd_out.template get<0>(p);
+        
+            float b = vd_out.template get<1>(p)[0];
+            float c = vd_out.template get<1>(p)[1];
+        
+            float d = vd_out.template get<2>(p)[0][0];
+            float e = vd_out.template get<2>(p)[0][1];
+            float f = vd_out.template get<2>(p)[1][0];
+            float g = vd_out.template get<2>(p)[1][1];
+        
+            float h = vd_in.template get<0>(p)[0];
+            float i = vd_in.template get<0>(p)[1];
+            
+            vd_in.template get<0>(p)[0] = a+b+c+d;
+            vd_in.template get<0>(p)[1] = e+f+g+h+i;
+        };
+
+        CUDA_LAUNCH_LAMBDA(ite, lamb);
+
+        cudaDeviceSynchronize();
 
+        t.stop();
 
-            CUDA_LAUNCH(translate_fill_prop_read,ite,out.toKernel(),in.toKernel());
+        if (i >=10)
+        {res.get(i-10) = nele*4*9 / t.getwct() * 1e-9;}
 
-            cudaDeviceSynchronize();
+        std::cout << "Time: " << t.getwct() << std::endl;
+        std::cout << "BW: " << nele*4*9 / t.getwct() * 1e-9 << " GB/s"  << std::endl;
+    }
 
-            t.stop();
+    double mean_write_lamb = 0.0;
+    double dev_write_lamb = 0.0;
+    standard_deviation(res,mean_write_lamb,dev_write_lamb);
 
-        if (i >=1)
-        {res.get(i-1) = nele*4*11 / t.getwct() * 1e-9;}
+    for (int i = 0 ; i < 110 ; i++)
+    {
+        cudaDeviceSynchronize();
+        timer t;
+        t.start();
+
+
+        auto vd_out = out.toKernel();
+        auto vd_in = in.toKernel();
+
+        auto lamb = [vd_out,vd_in] __device__ (dim3 & blockIdx, dim3 & threadIdx)
+                            {
+                                auto p = blockIdx.x * blockDim.x + threadIdx.x;
+
+                                float a = vd_out.template get<0>(p);
+                            
+                                float b = vd_out.template get<1>(p)[0];
+                                float c = vd_out.template get<1>(p)[1];
+                            
+                                float d = vd_out.template get<2>(p)[0][0];
+                                float e = vd_out.template get<2>(p)[0][1];
+                                float f = vd_out.template get<2>(p)[1][0];
+                                float g = vd_out.template get<2>(p)[1][1];
+                            
+                                float h = vd_in.template get<0>(p)[0];
+                                float i = vd_in.template get<0>(p)[1];
+                                
+                                vd_in.template get<0>(p)[0] = a+b+c+d;
+                                vd_in.template get<0>(p)[1] = e+f+g+h+i;
+                            };
+
+        CUDA_LAUNCH_LAMBDA(ite, lamb);
 
-            std::cout << "Time: " << t.getwct() << std::endl;
-            std::cout << "BW: " << nele*4*11 / t.getwct() * 1e-9 << " GB/s"  << std::endl;
+        cudaDeviceSynchronize();
+
+        t.stop();
+
+        if (i >=10)
+        {res.get(i-10) = nele*4*9 / t.getwct() * 1e-9;}
+
+        std::cout << "Time: " << t.getwct() << std::endl;
+        std::cout << "BW: " << nele*4*9 / t.getwct() * 1e-9 << " GB/s"  << std::endl;
     }
 
-    double mean_read = 0.0;
-    double dev_read = 0.0;
-    standard_deviation(res,mean_read,dev_read);
+    double mean_read_lamb = 0.0;
+    double dev_read_lamb = 0.0;
+    standard_deviation(res,mean_read_lamb,dev_read_lamb);
+
+    std::cout << "Average READ with TLS: " << mean_read_tls << "  deviation: " << dev_read_tls << std::endl;
+    std::cout << "Average WRITE with TLS: " << mean_write_tls << "  deviation: " << dev_write_tls << std::endl;
 
-    std::cout << "Average READ: " << mean_read << "  deviation: " << dev_read << std::endl;
-    std::cout << "Average WRITE: " << mean_write << "  deviation: " << dev_write << std::endl;
+    std::cout << "Average READ with lamb: " << mean_read_lamb << "  deviation: " << dev_read_lamb << std::endl;
+    std::cout << "Average WRITE with lamb: " << mean_write_lamb << "  deviation: " << dev_write_lamb << std::endl;
 }
 
 #else
diff --git a/src/Grid/grid_dist_id.hpp b/src/Grid/grid_dist_id.hpp
index 05536b537..6007fec2f 100644
--- a/src/Grid/grid_dist_id.hpp
+++ b/src/Grid/grid_dist_id.hpp
@@ -3248,7 +3248,7 @@ public:
 							for (int j = 0 ; j < dim ; j++)
 							{key_dst.set_d(j,key.get(j) + orig.get(j) + kp1.get(j));}
 
-							dg.get_o(key_dst) = lg.get_o(key);
+							dg.insert_o(key_dst) = lg.get_o(key);
 
 							++it_src;
 					}
diff --git a/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu b/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
index 824f10713..cb5761cf1 100644
--- a/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
+++ b/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
@@ -26,6 +26,8 @@ struct insert_kernel2D
 	}
 };
 
+
+
 template<unsigned int p>
 struct insert_kernel3D
 {
@@ -184,6 +186,62 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_output )
 	#endif
 }
 
+
+BOOST_AUTO_TEST_CASE( sgrid_gpu_test_save_and_load )
+{
+/*	auto & v_cl = create_vcluster();
+
+	if (v_cl.size() > 3){return;}
+
+	size_t sz[2] = {17,17};
+	periodicity<2> bc = {PERIODIC,PERIODIC};
+
+	Ghost<2,long int> g(1);
+
+	Box<2,float> domain({0.0,0.0},{1.0,1.0});
+
+	sgrid_dist_id_gpu<2,float,aggregate<float,float,float[2]>> gdist(sz,domain,g,bc);
+
+	gdist.template setBackgroundValue<0>(666);
+
+	/////// GPU insert + flush
+
+	Box<2,size_t> box({1,1},{15,15});
+	auto it = gdist.getGridIterator(box.getKP1(),box.getKP2());
+
+	/////// GPU Run kernel
+
+	typedef typename GetAddBlockType<decltype(gdist)>::type InsertBlockT;
+
+	float c = 5.0;
+
+	gdist.addPoints([] __device__ (int i, int j)
+			        {
+						return true;
+			        },
+			        [c] __device__ (InsertBlockT & data, int i, int j)
+			        {
+			        	data.template get<0>() = c + i + j;
+						data.template get<1>() = c + 1000 + i + j;
+						
+						data.template get<2>()[0] = i;
+						data.template get<2>()[1] = j;
+			        }
+			        );
+
+	gdist.template flush<smax_<0>,smax_<1>,smax_<2>>(flush_type::FLUSH_ON_DEVICE);
+
+	gdist.template deviceToHost<0>();
+
+	gdist.save("sgrid_gpu_output_hdf5");
+
+	// Now load
+
+	sgrid_dist_id_gpu<2,float,aggregate<float,float,float[2]>> gdist2(sz,domain,g,bc);
+
+	gdist2.load("sgrid_gpu_output_hdf5");*/
+}
+
 void sgrid_ghost_get(size_t (& sz)[2],size_t (& sz2)[2])
 {
 	periodicity<2> bc = {PERIODIC,PERIODIC};
-- 
GitLab