diff --git a/example/Performance/memBW/main.cu b/example/Performance/memBW/main.cu
index b38b643bf1d40e75e154568935c5325311068953..2cc424229cb2c1be4981ed8a23de9474ae837e5b 100644
--- a/example/Performance/memBW/main.cu
+++ b/example/Performance/memBW/main.cu
@@ -143,7 +143,7 @@ void check_read(in_type & in, out_type & out)
 
         if (success == false)
         {
-            std::cout << "FAIL READ " << i << in.template get<0>(i)[1] << " != " << a+b+c+d+e+f+g+h << std::endl;
+            std::cout << "FAIL READ " << i << "   " << in.template get<0>(i)[1] << " != " << a+b+c+d+e+f+g+h << std::endl;
             exit(1);
         }
     }
@@ -202,6 +202,12 @@ int main(int argc, char *argv[])
 
     check_write(in,out);
 
+    for (int i = 0 ; i < 16777216 ; i++)
+    {
+        in.template get<0>(i)[0] = i;
+        in.template get<0>(i)[1] = i+100.0;
+    }
+
     for (int i = 0 ; i < 110 ; i++)
     {
         cudaDeviceSynchronize();
@@ -277,6 +283,12 @@ int main(int argc, char *argv[])
     double dev_write_lamb = 0.0;
     standard_deviation(res,mean_write_lamb,dev_write_lamb);
 
+    for (int i = 0 ; i < 16777216 ; i++)
+    {
+        in.template get<0>(i)[0] = i;
+        in.template get<0>(i)[1] = i+100.0;
+    }
+
     for (int i = 0 ; i < 110 ; i++)
     {
         cudaDeviceSynchronize();