diff --git a/example/Performance/memBW/Makefile b/example/Performance/memBW/Makefile
index 329fcab0e90a66c497577d5ae2d29dbae2bebb40..68543ae85b33abcd98e407e817bed52d5933e94b 100644
--- a/example/Performance/memBW/Makefile
+++ b/example/Performance/memBW/Makefile
@@ -42,7 +42,7 @@ OBJ = main.o
 memBW:
 
 %.o: %.cu
-	$(CUDA_CC) -g -O3 $(CUDA_OPTIONS) $(OPT) -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)
+	$(CUDA_CC) -O3 $(CUDA_OPTIONS) $(OPT) -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)
 
 %.o: %.cpp
 	$(CC) -g -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
diff --git a/example/Performance/memBW/main.cu b/example/Performance/memBW/main.cu
index 87550d9847cfc610141b7a94fad92eebd73f5a3f..d7a1be5664aead3304fb5bee060187d322cb7c56 100644
--- a/example/Performance/memBW/main.cu
+++ b/example/Performance/memBW/main.cu
@@ -53,18 +53,18 @@ __global__ void translate_fill_prop_write_array(float * vd_out_scal,
 {
 	auto p = blockIdx.x * blockDim.x + threadIdx.x;
 
-	float a = vd_in_vec[p* + 0*stride];
-	float b = vd_in_vec[p* + 1*stride];
+	float a = vd_in_vec[p + 0*stride]; // the only load left in the kernel: one float at offset p (the 1*stride load into b was removed)
 
-	vd_out_scal[p] = a + b;
+	vd_out_scal[p] = a; // store 1 of 8: the loaded value is written unchanged (previously a + b)
 
 	vd_out_vec[p + 0*stride] = a;
-	vd_out_vec[p + 1*stride] = b;
+	vd_out_vec[p + 1*stride] = a; // slot at offset 1*stride now receives a instead of b
 
 	vd_out_mat[p + 0*2*stride + 0*stride ] = a;
-	vd_out_mat[p + 0*2*stride + 1*stride ] = b;
-	vd_out_mat[p + 1*2*stride + 0*stride ] = a + b;
-	vd_out_mat[p + 1*2*stride + 1*stride ] = b - a;
+	vd_out_mat[p + 0*2*stride + 1*stride ] = a; // all four vd_out_mat slots (offsets 0..3 times stride) now receive the same value a
+	vd_out_mat[p + 1*2*stride + 0*stride ] = a; // no arithmetic remains: the kernel only moves data
+	vd_out_mat[p + 1*2*stride + 1*stride ] = a;
+	vd_in_vec[p + 1*stride] = a; // NOTE(review): this stores into the *input* buffer; together with the load above it gives 1 load + 8 stores = 9 float accesses per thread, matching the nele*4*9 byte count used for the "BW ARR" figure. No guard on p is visible in this hunk - confirm the launch never covers more threads than the buffers hold.
 }
 
 
@@ -184,6 +184,18 @@ int main(int argc, char *argv[])
 
     initialize_buf(in,out);
 
+
+for (int j = 0 ; j < 100 ; j++) // NOTE(review): plain loop in main(), repeated 100 times, placed before the GPU iterator and every benchmark that follows
+{
+
+    for (int i = 0 ; i < 16777216; i++) // NOTE(review): 16777216 (2^24) is hard-coded - presumably the element count of in/out (nele?); confirm and use the existing size variable
+    {
+	    out.get<2>(i)[1][0] = in.get<0>(i)[1]; // one float per iteration: in.get<0>(i)[1] is copied into out.get<2>(i)[1][0]
+    }
+}
+
+    return 0; // NOTE(review): unconditional return - every statement after this point in main() is unreachable, including all benchmarks below; looks like leftover experiment scaffolding, confirm before merging
+
     // Read write test with TLS
 
     auto ite = out.getGPUIterator(256);
@@ -344,6 +356,38 @@ int main(int argc, char *argv[])
     double dev_read_lamb = 0.0;
     standard_deviation(res,mean_read_lamb,dev_read_lamb);
 
+    // Array benchmark: launches translate_fill_prop_write_array on float pointers obtained from getDeviceBuffer<>() and records the achieved bandwidth
+
+    for (int i = 0 ; i < 110 ; i++) // 110 timed launches; only iterations with i >= 10 are stored in res (see below)
+    {
+        cudaDeviceSynchronize(); // wait for previously queued device work so it is not counted in this iteration's time
+        timer t;
+        t.start();
+
+	float * out_s = (float *)out.getDeviceBuffer<0>(); // NOTE(review): these four pointer fetches happen inside the timed region on every iteration; presumably loop-invariant - confirm and consider hoisting
+	float * out_v = (float *)out.getDeviceBuffer<1>();
+	float * out_m = (float *)out.getDeviceBuffer<2>();
+	float * in_v = (float *)in.getDeviceBuffer<0>();
+
+        CUDA_LAUNCH(translate_fill_prop_write_array,ite,out_s,out_v,out_m,in_v,out.capacity()); // out.capacity() is presumably received as the kernel's stride (signature not fully visible here); it is also applied to in_v, so this assumes in and out share the same capacity - TODO confirm
+
+        cudaDeviceSynchronize(); // wait for the kernel to finish so the wall-clock interval covers its execution
+
+        t.stop();
+
+        if (i >=10) // the first 10 iterations are treated as warm-up and not recorded
+        {res.get(i-10) = nele*4*9 / t.getwct() * 1e-9;} // nele*4*9: 9 float accesses of 4 bytes each per element (1 load + 8 stores in the kernel); scaled by 1e-9 and reported as GB/s
+
+        std::cout << "Time ARR: " << t.getwct() << std::endl;
+        std::cout << "BW ARR: " << nele*4*9 / t.getwct() * 1e-9 << " GB/s"  << std::endl; // same formula as stored in res, printed for every iteration including warm-up
+    }
+
+    double mean_write_arr = 0.0; // filled by standard_deviation below (presumably mean of the values in res)
+    double dev_write_arr = 0.0; // filled by standard_deviation below (presumably their deviation)
+    standard_deviation(res,mean_write_arr,dev_write_arr);
+
+    ///////////////////
+
     #ifdef CUDIFY_USE_CUDA
 
     for (int i = 0 ; i < 110 ; i++)
@@ -382,6 +426,7 @@ int main(int argc, char *argv[])
     std::cout << "Average READ with lamb: " << mean_read_lamb << "  deviation: " << dev_read_lamb << std::endl;
     std::cout << "Average WRITE with lamb: " << mean_write_lamb << "  deviation: " << dev_write_lamb << std::endl;
 
+    std::cout << "Average WRITE with array: " << mean_write_arr << "  deviation: " << dev_write_arr << std::endl; // summary of the array benchmark: the values computed by standard_deviation(res, mean_write_arr, dev_write_arr)
 }
 
 #else
diff --git a/openfpm_devices b/openfpm_devices
index f95b31b6d0af0a3c60a2643840854feba90abb1e..ccdd64f9dee09116426e685ec44debfab255aa0a 160000
--- a/openfpm_devices
+++ b/openfpm_devices
@@ -1 +1 @@
-Subproject commit f95b31b6d0af0a3c60a2643840854feba90abb1e
+Subproject commit ccdd64f9dee09116426e685ec44debfab255aa0a
diff --git a/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu b/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
index cb5761cf1f5c299ed7623f22116d96f186fb58c3..e2485f2451594448e57e1cba218fce7812888d71 100644
--- a/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
+++ b/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
@@ -5,6 +5,7 @@
 #include <boost/test/unit_test.hpp>
 #include "Grid/grid_dist_id.hpp"
 
+
 BOOST_AUTO_TEST_SUITE( sgrid_gpu_test_suite )
 
 template<unsigned int p>