diff --git a/example/Performance/miniBUDE/Makefile b/example/Performance/miniBUDE/Makefile
index 7a1ecea66f950659d8f0c1b0973846e577dca282..f08386940b49300f35953c4a857f5ea7e8480048 100644
--- a/example/Performance/miniBUDE/Makefile
+++ b/example/Performance/miniBUDE/Makefile
@@ -4,68 +4,62 @@ include ../../example.mk
 ### internally the example disable with the preprocessor its code if not compiled with nvcc 
 CUDA_CC=
 CUDA_CC_LINK=
+
+CC=mpic++
 ifdef HIP
         CUDA_CC=hipcc
-        CUDA_OPTIONS=-D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0
+        CUDA_OPTIONS= -D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0
         LIBS_SELECT=$(LIBS)
         CC=hipcc
-        CUDA_CC_LINK=hipcc
+	CUDA_CC_LINK=hipcc
 else
 	ifdef CUDA_ON_CPU
         	CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
         	INCLUDE_PATH_NVCC=
         	CUDA_CC_LINK=mpic++
-		CUDA_OPTIONS=-D__NVCC__ -DCUDART_VERSION=11000 -fopenmp -O3 -ffast-math -march=native -mavx
-		LIBS_SELECT=$(LIBS)
+        	CUDA_OPTIONS=-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000
+        	LIBS_SELECT=$(LIBS)
 	else
         	ifeq (, $(shell which nvcc))
                 	CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
                 	INCLUDE_PATH_NVCC=
                 	CUDA_CC_LINK=mpic++
-			CUDA_OPTIONS=
 			LIBS_SELECT=$(LIBS)
         	else
                 	CUDA_CC=nvcc -ccbin=mpic++
                 	CUDA_CC_LINK=nvcc -ccbin=mpic++
-			CUDA_OPTIONS=-use_fast_math  -arch=sm_61 -lineinfo
 			LIBS_SELECT=$(LIBS_NVCC)
         	endif
 	endif
 endif
 
-
-ifeq ($(PROFILE),ON)
-        CUDA_CC=scorep --nocompiler  --cuda --mpp=mpi nvcc -ccbin=mpic++
-        CUDA_CC_LINK=scorep --nocompiler  --cuda --mpp=mpi nvcc -ccbin=mpic++
-else
-	CUDA_CC:=$(CUDA_CC)
-	CUDA_CC_LINK:=$(CUDA_CC_LINK)
-endif
+# CC is already chosen above (mpic++ by default, hipcc when HIP is set); resetting it here would undo the HIP choice
 
 LDIR =
 OPT=
 
 OBJ = main.o
 
-miniBUDE:
+sph_dlb:
+sph_dlb_test: OPT += -DTEST_RUN
+sph_dlb_test: sph_dlb
 
 %.o: %.cu
-	$(CUDA_CC) $(OPT) $(CUDA_OPTIONS) -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)
+	$(CUDA_CC) $(CUDA_OPTIONS) $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)
 
 %.o: %.cpp
-	$(CC) $(OPT) -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
-
-miniBUDE: $(OBJ)
-	$(CUDA_CC_LINK) -o  $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT)
+	$(CC) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
 
+sph_dlb: $(OBJ)
+	$(CUDA_CC_LINK) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT)
 
-all: miniBUDE
+all: sph_dlb
 
-run: miniBUDE
-	mpirun --oversubscribe -np 2 ./miniBUDE
+run: sph_dlb_test
+	mpirun --oversubscribe -np 2 ./sph_dlb
 
 .PHONY: clean all run
 
 clean:
-	rm -f *.o *~ core miniBUDE
+	rm -f *.o *~ core sph_dlb
 
diff --git a/example/Performance/miniBUDE/config.cfg b/example/Performance/miniBUDE/config.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..699be429e147cd40187be6ce345ef2f060f59fbc
--- /dev/null
+++ b/example/Performance/miniBUDE/config.cfg
@@ -0,0 +1,2 @@
+[pack]
+files = main.cu Makefile
diff --git a/example/Performance/miniBUDE/main.cu b/example/Performance/miniBUDE/main.cu
index 5ec7ddb6b55ed70388eb8975f613c3a5a35ec783..c092e680a9910efb9e5a77ba9a8dc3d70af03469 100644
--- a/example/Performance/miniBUDE/main.cu
+++ b/example/Performance/miniBUDE/main.cu
@@ -75,7 +75,7 @@ double getTimestamp()
     return tv.tv_usec + tv.tv_sec*1e6;
 }
 
-void printTimings(double start, double end, double poses_per_wi)
+void printTimings(double start, double end, double poses_per_wi, openfpm::vector<double> & gflops_data)
 {
     double ms = ((end-start)/params.iterations)*1e-3;
 
@@ -88,6 +88,8 @@ void printTimings(double start, double end, double poses_per_wi)
     double flops      = total_ops / runtime;
     double gflops     = flops / 1e9;
 
+    gflops_data.add(gflops);
+
     double interactions         =
         (double)params.nposes
         * (double)params.natlig
@@ -608,41 +610,45 @@ int main(int argc, char *argv[])
   printf("Deck      : %s\n", params.deckDir);
   float *resultsRef = (float *)malloc(params.nposes*sizeof(float));
 
-  runCUDA(_openfpm);
+  // We run the benchmark 30 times to get mean and variance
+  for (int i = 0 ; i < 30 ; i++)
+  {
 
+    runCUDA(_openfpm);
 
-  // Load reference results from file
-  FILE* ref_energies = openFile(params.deckDir, FILE_REF_ENERGIES, "r", NULL);
-  size_t n_ref_poses = params.nposes;
-  if (params.nposes > REF_NPOSES) {
-    printf("Only validating the first %d poses.\n", REF_NPOSES);
-    n_ref_poses = REF_NPOSES;
-  }
 
-  for (size_t i = 0; i < n_ref_poses; i++)
-    fscanf(ref_energies, "%f", &resultsRef[i]);
+    // Load reference results from file
+    FILE* ref_energies = openFile(params.deckDir, FILE_REF_ENERGIES, "r", NULL);
+    size_t n_ref_poses = params.nposes;
+    if (params.nposes > REF_NPOSES) {
+      printf("Only validating the first %d poses.\n", REF_NPOSES);
+      n_ref_poses = REF_NPOSES;
+    }
 
-  fclose(ref_energies);
+    for (size_t i = 0; i < n_ref_poses; i++)
+      fscanf(ref_energies, "%f", &resultsRef[i]);
 
-  float maxdiff = -100.0f;
-  printf("\n Reference        CUDA   (diff)\n");
-  for (int i = 0; i < n_ref_poses; i++)
-  {
-    if (fabs(resultsRef[i]) < 1.f && fabs(_openfpm.d_results.template get<0>(i)) < 1.f) continue;
+    fclose(ref_energies);
 
-    float diff = fabs(resultsRef[i] - _openfpm.d_results.template get<0>(i)) / _openfpm.d_results.template get<0>(i);
-    if (diff > maxdiff) {
-      maxdiff = diff;
-      // printf ("Maxdiff: %.2f (%.3f vs %.3f)\n", maxdiff, resultsRef[i], resultsCUDA[i]);
-    }
+    float maxdiff = -100.0f;
+    printf("\n Reference        CUDA   (diff)\n");
+    for (int i = 0; i < n_ref_poses; i++)
+    {
+      if (fabs(resultsRef[i]) < 1.f && fabs(_openfpm.d_results.template get<0>(i)) < 1.f) continue;
 
-    if (i < 8)
-      printf("%7.2f    vs   %7.2f  (%5.2f%%)\n", resultsRef[i], _openfpm.d_results.template get<0>(i), 100*diff);
-  }
-  printf("\nLargest difference was %.3f%%\n\n", maxdiff*100);
+      float diff = fabs(resultsRef[i] - _openfpm.d_results.template get<0>(i)) / _openfpm.d_results.template get<0>(i);
+      if (diff > maxdiff) {
+        maxdiff = diff;
+        // printf ("Maxdiff: %.2f (%.3f vs %.3f)\n", maxdiff, resultsRef[i], resultsCUDA[i]);
+      }
 
-  free(resultsRef);
+      if (i < 8)
+        printf("%7.2f    vs   %7.2f  (%5.2f%%)\n", resultsRef[i], _openfpm.d_results.template get<0>(i), 100*diff);
+    }
+    printf("\nLargest difference was %.3f%%\n\n", maxdiff*100);
 
+  }  // end of the repeated benchmark/validation loop
+  free(resultsRef);  // allocated once before the loop, so it must be released once, after the last iteration
 }