diff --git a/openfpm_data b/openfpm_data
index 5e00d1f0f7fbbd4e4649c41284dc83d3df393222..e91db651a61479ae33ef2a66c3cfce70315e16b6 160000
--- a/openfpm_data
+++ b/openfpm_data
@@ -1 +1 @@
-Subproject commit 5e00d1f0f7fbbd4e4649c41284dc83d3df393222
+Subproject commit e91db651a61479ae33ef2a66c3cfce70315e16b6
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 018f8d6d34e738313c01915237e49250c7271ae2..189910a18f1094a5213a283fd1a9901ab97b135d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,5 +1,6 @@
 cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
 
+add_definitions(-DSCAN_WITH_CUB)
 
 ########################### Executables
 
diff --git a/src/Vector/cuda/vector_dist_comm_util_funcs.cuh b/src/Vector/cuda/vector_dist_comm_util_funcs.cuh
index 1ae16b0d563a60a19de853058f069cc9af3723f5..1809b44a607f5c68effe72221576a0972d2c88e8 100644
--- a/src/Vector/cuda/vector_dist_comm_util_funcs.cuh
+++ b/src/Vector/cuda/vector_dist_comm_util_funcs.cuh
@@ -92,7 +92,7 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,tru
 			// scan
 			//sc.scan_(proc_id_out,starts);
 			starts.resize(proc_id_out.size());
-			mgpu::scan((unsigned int *)proc_id_out.template getDeviceBuffer<0>(), proc_id_out.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());
+			openfpm::scan((unsigned int *)proc_id_out.template getDeviceBuffer<0>(), proc_id_out.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());
 			starts.template deviceToHost<0>(starts.size()-1,starts.size()-1);
 			size_t sz = starts.template get<0>(starts.size()-1);
 
@@ -265,7 +265,7 @@ struct local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,true>
 		box_f_dev.toKernel(),box_f_sv.toKernel(),v_pos.toKernel(),o_part_loc.toKernel(),g_m);
 
 		starts.resize(o_part_loc.size());
-		mgpu::scan((unsigned int *)o_part_loc.template getDeviceBuffer<0>(), o_part_loc.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());
+		openfpm::scan((unsigned int *)o_part_loc.template getDeviceBuffer<0>(), o_part_loc.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());
 
 		starts.template deviceToHost<0>(starts.size()-1,starts.size()-1);
 		size_t total = starts.template get<0>(starts.size()-1);
diff --git a/src/Vector/cuda/vector_dist_cuda_func_test.cu b/src/Vector/cuda/vector_dist_cuda_func_test.cu
index 4088971a30f670fbf5fdf2be09261b6351433db9..e50a9c46792449e88d97b607eb1fffa6d6e97134 100644
--- a/src/Vector/cuda/vector_dist_cuda_func_test.cu
+++ b/src/Vector/cuda/vector_dist_cuda_func_test.cu
@@ -129,7 +129,7 @@ BOOST_AUTO_TEST_CASE( vector_ghost_process_local_particles )
 	starts.resize(o_part_loc.size());
 
 	auto & v_cl = create_vcluster();
-	mgpu::scan((unsigned int *)o_part_loc.template getDeviceBuffer<0>(), o_part_loc.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());
+	openfpm::scan((unsigned int *)o_part_loc.template getDeviceBuffer<0>(), o_part_loc.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());
 
 	starts.deviceToHost<0>(starts.size()-1,starts.size()-1);
 	size_t tot = starts.template get<0>(o_part_loc.size()-1);
diff --git a/src/Vector/vector_dist_comm.hpp b/src/Vector/vector_dist_comm.hpp
index 6694ac553e107c2ca61861a46e561800acfad3fe..b08cf1b11a97a19643ffe8ce0d07eb36312ca94f 100644
--- a/src/Vector/vector_dist_comm.hpp
+++ b/src/Vector/vector_dist_comm.hpp
@@ -1473,7 +1473,7 @@ class vector_dist_comm
 			#else
 
 			starts.resize(v_cl.size());
-			mgpu::scan((unsigned int *)prc_sz.template getDeviceBuffer<0>(), prc_sz.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());
+			openfpm::scan((unsigned int *)prc_sz.template getDeviceBuffer<0>(), prc_sz.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());
 
 			// move prc_sz to host
 			prc_sz.template deviceToHost<0>();