diff --git a/example/Performance/memBW/main.cu b/example/Performance/memBW/main.cu
index 2ea20b24ddd9bfba136bf3fb286b37aa9bd9a22e..0738bcf4f463785ef505655633888a8156b6a5d2 100644
--- a/example/Performance/memBW/main.cu
+++ b/example/Performance/memBW/main.cu
@@ -5,7 +5,7 @@
 
 //! Memory bandwidth with small calculations
 template<typename vector_type, typename vector_type2>
-inline __global__ void translate_fill_prop_write(vector_type vd_out, vector_type2 vd_in)
+__global__ void translate_fill_prop_write(vector_type vd_out, vector_type2 vd_in)
 {
 	auto p = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -25,7 +25,7 @@ inline __global__ void translate_fill_prop_write(vector_type vd_out, vector_type
 
 
 template<typename vector_type, typename vector_type2>
-inline __global__ void translate_fill_prop_read(vector_type vd_out, vector_type2 vd_in)
+__global__ void translate_fill_prop_read(vector_type vd_out, vector_type2 vd_in)
 {
 	auto p = blockIdx.x * blockDim.x + threadIdx.x;