Optimizing TLS version

be6e7732 · Pietro Incardona · 74309603 · be6e7732 · c4dd3cff
Commit be6e7732 authored 3 years ago by Pietro Incardona
--- a/example/Performance/memBW/main.cu
+++ b/example/Performance/memBW/main.cu
@@ -227,18 +227,16 @@ int main(int argc, char *argv[])
        {
            auto p = blockIdx.x * blockDim.x + threadIdx.x;
-            float a = vd_in.template get<0>(p)[0];
+            vd_out.template get<0>(p) = a;
-            float b = vd_in.template get<0>(p)[1];
-	    vd_out.template get<0>(p) = a + b;
            vd_out.template get<1>(p)[0] = a;
-            vd_out.template get<1>(p)[1] = b;
+            vd_out.template get<1>(p)[1] = a;
            vd_out.template get<2>(p)[0][0] = a;
-            vd_out.template get<2>(p)[0][1] = b;
+            vd_out.template get<2>(p)[0][1] = a;
-            vd_out.template get<2>(p)[1][0] = a + b;
+            vd_out.template get<2>(p)[1][0] = a;
-            vd_out.template get<2>(p)[1][1] = b - a;
+            vd_out.template get<2>(p)[1][1] = a;
+            vd_in.template get<0>(p)[1] = a;
        };
        CUDA_LAUNCH_LAMBDA(ite, lamb);
@@ -275,7 +273,7 @@ int main(int argc, char *argv[])
                                auto p = blockIdx.x * blockDim.x + threadIdx.x;
                                float a = vd_out.template get<0>(p);
                                float b = vd_out.template get<1>(p)[0];
                                float c = vd_out.template get<1>(p)[1];
@@ -284,8 +282,8 @@ int main(int argc, char *argv[])
                                float f = vd_out.template get<2>(p)[1][0];
                                float g = vd_out.template get<2>(p)[1][1];
-                                vd_in.template get<0>(p)[0] = a+b+c+d;
+                                float h = vd_in.template get<0>(p)[0];
-                                vd_in.template get<0>(p)[1] = e+f+g;
+                                vd_in.template get<0>(p)[1] = a+b+c+d+e+f+g+h;
                            };
        CUDA_LAUNCH_LAMBDA(ite, lamb);

--- a/openfpm_devices @ c4dd3cff
+++ b/openfpm_devices @ c4dd3cff
-Subproject commit 6d8f28cc101c2350975ff1a3e82127319c03e392
+Subproject commit c4dd3cffacb260372766b760bcbcab6815167b73