From be6e77320265aaed7f56cf9e420d8cfa0fe6571f Mon Sep 17 00:00:00 2001
From: Incardona Pietro <incardon@mpi-cbg.de>
Date: Tue, 28 Dec 2021 23:21:15 +0100
Subject: [PATCH] Optimizing TLS version

---
 example/Performance/memBW/main.cu | 22 ++++++++++------------
 openfpm_devices                   |  2 +-
 2 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/example/Performance/memBW/main.cu b/example/Performance/memBW/main.cu
index 9baf892f1..ac82536fa 100644
--- a/example/Performance/memBW/main.cu
+++ b/example/Performance/memBW/main.cu
@@ -227,18 +227,16 @@ int main(int argc, char *argv[])
         {
             auto p = blockIdx.x * blockDim.x + threadIdx.x;
 
-            float a = vd_in.template get<0>(p)[0];
-            float b = vd_in.template get<0>(p)[1];
-
-	    vd_out.template get<0>(p) = a + b;
+            vd_out.template get<0>(p) = a;
 
             vd_out.template get<1>(p)[0] = a;
-            vd_out.template get<1>(p)[1] = b;
-
+            vd_out.template get<1>(p)[1] = a;
+        
             vd_out.template get<2>(p)[0][0] = a;
-            vd_out.template get<2>(p)[0][1] = b;
-            vd_out.template get<2>(p)[1][0] = a + b;
-            vd_out.template get<2>(p)[1][1] = b - a;
+            vd_out.template get<2>(p)[0][1] = a;
+            vd_out.template get<2>(p)[1][0] = a;
+            vd_out.template get<2>(p)[1][1] = a;
+            vd_in.template get<0>(p)[1] = a;
         };
 
         CUDA_LAUNCH_LAMBDA(ite, lamb);
@@ -275,7 +273,7 @@ int main(int argc, char *argv[])
                                 auto p = blockIdx.x * blockDim.x + threadIdx.x;
 
                                 float a = vd_out.template get<0>(p);
-                            
+
                                 float b = vd_out.template get<1>(p)[0];
                                 float c = vd_out.template get<1>(p)[1];
                             
@@ -284,8 +282,8 @@ int main(int argc, char *argv[])
                                 float f = vd_out.template get<2>(p)[1][0];
                                 float g = vd_out.template get<2>(p)[1][1];
                                 
-                                vd_in.template get<0>(p)[0] = a+b+c+d;
-                                vd_in.template get<0>(p)[1] = e+f+g;
+                                float h = vd_in.template get<0>(p)[0];
+                                vd_in.template get<0>(p)[1] = a+b+c+d+e+f+g+h;
                             };
 
         CUDA_LAUNCH_LAMBDA(ite, lamb);
diff --git a/openfpm_devices b/openfpm_devices
index 6d8f28cc1..c4dd3cffa 160000
--- a/openfpm_devices
+++ b/openfpm_devices
@@ -1 +1 @@
-Subproject commit 6d8f28cc101c2350975ff1a3e82127319c03e392
+Subproject commit c4dd3cffacb260372766b760bcbcab6815167b73
-- 
GitLab