diff --git a/example/Vector/7_SPH_dlb_gpu_opt/Makefile b/example/Vector/7_SPH_dlb_gpu_opt/Makefile
index dc1e101e100bf680bf39213e620bc199b6d5bc0a..24d51560f8247cbc23768c84358db11d9d91a5c1 100644
--- a/example/Vector/7_SPH_dlb_gpu_opt/Makefile
+++ b/example/Vector/7_SPH_dlb_gpu_opt/Makefile
@@ -8,7 +8,7 @@ ifdef CUDA_ON_CPU
         CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
         INCLUDE_PATH_NVCC=
         CUDA_CC_LINK=mpic++
-	CUDA_OPTIONS=
+	CUDA_OPTIONS=-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000
 else
         ifeq (, $(shell which nvcc))
                 CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
diff --git a/example/Vector/7_SPH_dlb_gpu_opt/main.cu b/example/Vector/7_SPH_dlb_gpu_opt/main.cu
index 17f6f7f3d70a6113a315b34f6e333085798fe91c..140ca52b6fc60b2a16485c77024c6d15560775c9 100644
--- a/example/Vector/7_SPH_dlb_gpu_opt/main.cu
+++ b/example/Vector/7_SPH_dlb_gpu_opt/main.cu
@@ -51,7 +51,6 @@
 #include "Vector/vector_dist.hpp"
 #include <math.h>
 #include "Draw/DrawParticles.hpp"
-#include <cuda_profiler_api.h>
 
 
 
@@ -313,19 +312,19 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap
 	Point<3,real_number> xa = vd.getPos(a);
 
 	// Type of the particle
-	unsigned int typea = vd.getProp<type>(a);
+	unsigned int typea = vd.template getProp<type>(a);
 
 	// Take the mass of the particle dependently if it is FLUID or BOUNDARY
 	//real_number massa = (typea == FLUID)?MassFluid:MassBound;
 
 	// Get the density of the of the particle a
-	real_number rhoa = vd.getProp<rho>(a);
+	real_number rhoa = vd.template getProp<rho>(a);
 
 	// Get the pressure of the particle a
-	real_number Pa = vd.getProp<Pressure>(a);
+	real_number Pa = vd.template getProp<Pressure>(a);
 
 	// Get the Velocity of the particle a
-	Point<3,real_number> va = vd.getProp<velocity>(a);
+	Point<3,real_number> va = vd.template getProp<velocity>(a);
 
 	Point<3,real_number> force_;
 	force_.get(0) = 0.0f;
@@ -348,12 +347,12 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap
 		// if (p == q) skip this particle this condition should be done in the r^2 = 0
 		if (a == b)	{++Np; continue;};
 
-        unsigned int typeb = vd.getProp<type>(b);
+        unsigned int typeb = vd.template getProp<type>(b);
 
         real_number massb = (typeb == FLUID)?MassFluid:MassBound;
-        Point<3,real_number> vb = vd.getProp<velocity>(b);
-        real_number Pb = vd.getProp<Pressure>(b);
-        real_number rhob = vd.getProp<rho>(b);
+        Point<3,real_number> vb = vd.template getProp<velocity>(b);
+        real_number Pb = vd.template getProp<Pressure>(b);
+        real_number rhob = vd.template getProp<rho>(b);
 
 		// Get the distance between p and q
 		Point<3,real_number> dr = xa - xb;
@@ -387,7 +386,7 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap
 		++Np;
 	}
 
-	vd.getProp<red>(a) = max_visc;
+	vd.template getProp<red>(a) = max_visc;
 
 	vd.template getProp<force>(a)[0] = force_.get(0);
 	vd.template getProp<force>(a)[1] = force_.get(1);
@@ -414,11 +413,11 @@ __global__ void max_acceleration_and_velocity_gpu(vector_type vd)
 {
 	auto a = GET_PARTICLE(vd);
 
-	Point<3,real_number> acc(vd.getProp<force>(a));
-	vd.getProp<red>(a) = norm(acc);
+	Point<3,real_number> acc(vd.template getProp<force>(a));
+	vd.template getProp<red>(a) = norm(acc);
 
-	Point<3,real_number> vel(vd.getProp<velocity>(a));
-	vd.getProp<red2>(a) = norm(vel);
+	Point<3,real_number> vel(vd.template getProp<velocity>(a));
+	vd.template getProp<red2>(a) = norm(vel);
 }
 
 void max_acceleration_and_velocity(particles & vd, real_number & max_acc, real_number & max_vel)
@@ -426,7 +425,7 @@ void max_acceleration_and_velocity(particles & vd, real_number & max_acc, real_n
 	// Calculate the maximum acceleration
 	auto part = vd.getDomainIteratorGPU();
 
-	max_acceleration_and_velocity_gpu<<<part.wthr,part.thr>>>(vd.toKernel());
+	CUDA_LAUNCH(max_acceleration_and_velocity_gpu,part,vd.toKernel());
 
 	max_acc = reduce_local<red,_max_>(vd);
 	max_vel = reduce_local<red2,_max_>(vd);
@@ -529,7 +528,7 @@ void verlet_int(particles & vd, real_number dt)
 	real_number dt205 = dt*dt*0.5;
 	real_number dt2 = dt*2.0;
 
-	verlet_int_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),dt,dt2,dt205);
+	CUDA_LAUNCH(verlet_int_gpu,part,vd.toKernel(),dt,dt2,dt205);
 
 	// remove the particles marked
 	remove_marked<red>(vd);
@@ -605,7 +604,7 @@ void euler_int(particles & vd, real_number dt)
 
 	real_number dt205 = dt*dt*0.5;
 
-	euler_int_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),dt,dt205);
+	CUDA_LAUNCH(euler_int_gpu,part,vd.toKernel(),dt,dt205);
 
 	// remove the particles
 	remove_marked<red>(vd);
@@ -681,7 +680,7 @@ inline void sensor_pressure(Vector & vd,
         // if the probe is inside the processor domain
 		if (vd.getDecomposition().isLocal(probes.get(i)) == true)
 		{
-			sensor_pressure_gpu<<<1,1>>>(vd.toKernel_sorted(),NN.toKernel(),probes.get(i),(real_number *)press_tmp_.toKernel());
+			CUDA_LAUNCH_DIM3(sensor_pressure_gpu,1,1,vd.toKernel_sorted(),NN.toKernel(),probes.get(i),(real_number *)press_tmp_.toKernel());
 
 			vd.merge<Pressure>(NN);
 
@@ -720,7 +719,9 @@ int main(int argc, char* argv[])
     // initialize the library
 	openfpm_init(&argc,&argv);
 
+#ifndef CUDA_ON_CPU
 	cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
+#endif
 
 	// It contain for each time-step the value detected by the probes
 	openfpm::vector<openfpm::vector<real_number>> press_t;
diff --git a/install b/install
index 27a6a8d7192eebdb3973c91553ae3f2ee94d177b..88d0da947c9807f008ece653815ae1469c38e078 100755
--- a/install
+++ b/install
@@ -433,10 +433,9 @@ if [ x"$cuda_on_cpu" == x"YES" ]; then
 fi
 if [ x"$gpu_support" == x"1" ]; then
     echo "LIBS=-lvcluster -lofpm_pdata -lofpmmemory -lparmetis -lmetis -lboost_iostreams -lboost_program_options -lhdf5 -llibhilbert -lVc  $(cat cuda_lib) $lin_alg_lib -ldl -lboost_filesystem -lboost_system" >> example.mk
-    echo "LIBS_SE2=-lvcluster -lofpmmemory_se2 -lparmetis -lmetis -lboost_iostreams -lboost_program_options -lhdf5 -llibhilbert -lVc  $(cat cuda_lib) $lin_alg_lib -lboost_filesystem -lboost_system" >> example.mk
+    echo "LIBS_CUDA_ON_CPU=-lvcluster_cuda_on_cpu -lofpmmemory_cuda_on_cpu -lparmetis -lmetis -lboost_iostreams -lboost_program_options -lhdf5 -llibhilbert -lVc  $(cat cuda_lib) $lin_alg_lib -lboost_filesystem -lboost_system" >> example.mk
 else
     echo "LIBS=-lvcluster -lofpm_pdata -lofpmmemory -lparmetis -lmetis -lboost_iostreams -lboost_program_options -lhdf5 -llibhilbert -lVc  $lin_alg_lib -ldl -lboost_filesystem -lboost_system $optional_boost" >> example.mk
-    echo "LIBS_SE2=-lvcluster -lofpmmemory_se2 -lparmetis -lmetis -lboost_iostreams -lboost_program_options -lhdf5 -llibhilbert -lVc  $lin_alg_lib -lboost_filesystem -lboost_system $optional_boost" >> example.mk
 fi
 echo "INCLUDE_PATH_NVCC=-Xcompiler="-Wno-deprecated-declarations" $(cat openmp_flags) "$(cat cuda_options)" -I. -I$install_base/openfpm_numerics/include -I$install_base/openfpm_pdata/include/config -I$install_base/openfpm_pdata/include -I$install_base/openfpm_data/include -I$install_base/openfpm_vcluster/include -I$install_base/openfpm_io/include -I$install_base/openfpm_devices/include -I$i_dir/METIS/include -I$i_dir/PARMETIS/include -I$i_dir/BOOST/include -I$i_dir/HDF5/include -I$i_dir/LIBHILBERT/include  $lin_alg_inc" >> example.mk
 cp example.mk src/example.mk