diff --git a/example/Performance/memBW/Makefile b/example/Performance/memBW/Makefile
index a5198f1860831028b6b5edcf6332dd03f03a0e51..c44bd35ded8d5919b5fbb70d5064a79054a71872 100644
--- a/example/Performance/memBW/Makefile
+++ b/example/Performance/memBW/Makefile
@@ -47,12 +47,12 @@ miniBUDE:
 %.o: %.cpp
 	$(CC) -g -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
-miniBUDE: $(OBJ)
+memBW: $(OBJ)
 	$(CUDA_CC_LINK) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT)
 
-all: miniBUDE
+all: memBW
 
-run: miniBUDE
-	mpirun --oversubscribe -np 2 ./miniBUDE
+run: memBW
+	mpirun --oversubscribe -np 2 ./memBW
 
 .PHONY: clean all run
diff --git a/example/Performance/memBW/main.cu b/example/Performance/memBW/main.cu
index dd08ed890ae6ba734af24dc549f6697ed19728a8..07c516f7e2b08eaa5035ffae4c4d640a13cf1699 100644
--- a/example/Performance/memBW/main.cu
+++ b/example/Performance/memBW/main.cu
@@ -3,7 +3,7 @@
 
 
 template<typename vector_type, typename vector_type2>
-__attribute__((always_inline)) inline __global__ void translate_fill_prop(vector_type & vd_out, vector_type2 & vd_in)
+inline __global__ void translate_fill_prop(vector_type vd_out, vector_type2 vd_in)
 {
 	auto p = blockIdx.x * blockDim.x + threadIdx.x;