diff --git a/example/Vector/3_molecular_dynamic_gpu/main.cu b/example/Vector/3_molecular_dynamic_gpu/main.cu
index f0b28e427ca111fa376bfd0a69248d2a1389de40..10d9a63313320df0496db754d8fd9bc1dbccfe99 100644
--- a/example/Vector/3_molecular_dynamic_gpu/main.cu
+++ b/example/Vector/3_molecular_dynamic_gpu/main.cu
@@ -235,7 +235,7 @@ template<typename CellList> void calc_forces(vector_dist_gpu<3,real_number, aggr
 	// Get an iterator over particles
 	auto it2 = vd.getDomainIteratorGPU();
 
-	calc_force_gpu<<<it2.wthr,it2.thr>>>(vd.toKernel(),NN.toKernel(),sigma12,sigma6,r_cut2);
+	CUDA_LAUNCH(calc_force_gpu,it2,vd.toKernel(),NN.toKernel(),sigma12,sigma6,r_cut2);
 }
 
 //! \cond [calc_forces] \endcond
@@ -249,7 +249,7 @@ template<typename CellList> real_number calc_energy(vector_dist_gpu<3,real_numbe
 
 	auto it2 = vd.getDomainIteratorGPU();
 
-	particle_energy<<<it2.wthr,it2.thr>>>(vd.toKernel(),NN.toKernel(),sigma12,sigma6,shift,r_cut2);
+	CUDA_LAUNCH(particle_energy,it2,vd.toKernel(),NN.toKernel(),sigma12,sigma6,shift,r_cut2);
 
 	//! \cond [calc_energy_red] \endcond
 
@@ -343,7 +343,7 @@ int main(int argc, char* argv[])
 		// Get the iterator
 		auto it3 = vd.getDomainIteratorGPU();
 
-		update_velocity_position<<<it3.wthr,it3.thr>>>(vd.toKernel(),dt);
+		CUDA_LAUNCH(update_velocity_position,it3,vd.toKernel(),dt);
 
 		//! \cond [run_on_device] \endcond
 
@@ -360,7 +360,7 @@ int main(int argc, char* argv[])
 		// Integrate the velocity Step 3
 		auto it4 = vd.getDomainIteratorGPU();
 
-		update_velocity<<<it4.wthr,it4.thr>>>(vd.toKernel(),dt);
+		CUDA_LAUNCH(update_velocity,it4,vd.toKernel(),dt);
 
 		// After every iteration collect some statistic about the configuration
 		if (i % 100 == 0)
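
The hunks above all follow one pattern: a raw triple-chevron launch driven by it.wthr/it.thr becomes a CUDA_LAUNCH call that receives the GPU domain iterator itself. A minimal sketch of that pattern, assuming the real_number typedef and the GET_PARTICLE helper from the OpenFPM GPU examples, plus a hypothetical kernel shift_pos_gpu:

    // Hypothetical kernel: shift every particle along x by dx.
    template<typename vector_type>
    __global__ void shift_pos_gpu(vector_type vd, real_number dx)
    {
        // particle index, with the out-of-range early return the macro provides
        auto p = GET_PARTICLE(vd);

        vd.getPos(p)[0] += dx;
    }

    // host side
    auto it = vd.getDomainIteratorGPU();

    // old form:  shift_pos_gpu<<<it.wthr,it.thr>>>(vd.toKernel(),dx);
    CUDA_LAUNCH(shift_pos_gpu,it,vd.toKernel(),dx);

The macro form gives OpenFPM a single place to hook launch checks and alternative backends, which is presumably why the iterator is passed whole rather than its wthr/thr members.
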
diff --git a/example/Vector/3_molecular_dynamic_gpu_opt/main_gpu.cu b/example/Vector/3_molecular_dynamic_gpu_opt/main_gpu.cu
index 2d22c56cc33b49464389deb92c331cbac26e7fdb..8d20134e5772b9ff99171c246059cf54d45988fe 100644
--- a/example/Vector/3_molecular_dynamic_gpu_opt/main_gpu.cu
+++ b/example/Vector/3_molecular_dynamic_gpu_opt/main_gpu.cu
@@ -262,7 +262,7 @@ template<typename CellList> void calc_forces(vector_dist_gpu<3,real_number, aggr
 
 	//! \cond [calc_force_sorted] \endcond
 
-	calc_force_gpu<<<it2.wthr,it2.thr>>>(vd.toKernel_sorted(),NN.toKernel(),sigma12,sigma6,r_cut2);
+	CUDA_LAUNCH(calc_force_gpu,it2,vd.toKernel_sorted(),NN.toKernel(),sigma12,sigma6,r_cut2);
 
 	//! \cond [calc_force_sorted] \endcond
 
@@ -282,7 +282,7 @@ template<typename CellList> real_number calc_energy(vector_dist_gpu<3,real_numbe
 
 	auto it2 = vd.getDomainIteratorGPU();
 
-	particle_energy<<<it2.wthr,it2.thr>>>(vd.toKernel_sorted(),NN.toKernel(),sigma12,sigma6,shift,r_cut2);
+	CUDA_LAUNCH(particle_energy,it2,vd.toKernel_sorted(),NN.toKernel(),sigma12,sigma6,shift,r_cut2);
 
 	vd.merge_sort<energy>(NN);
 
@@ -378,7 +378,7 @@ int main(int argc, char* argv[])
 		// Get the iterator
 		auto it3 = vd.getDomainIteratorGPU();
 
-		update_velocity_position<<<it3.wthr,it3.thr>>>(vd.toKernel(),dt);
+		CUDA_LAUNCH(update_velocity_position,it3,vd.toKernel(),dt);
 
 		// Because we moved the particles in space we have to map them and re-sync the ghost
 		vd.map(RUN_ON_DEVICE);
@@ -390,7 +390,7 @@ int main(int argc, char* argv[])
 		// Integrate the velocity Step 3
 		auto it4 = vd.getDomainIteratorGPU();
 
-		update_velocity<<<it4.wthr,it4.thr>>>(vd.toKernel(),dt);
+		CUDA_LAUNCH(update_velocity,it4,vd.toKernel(),dt);
 
 		// After every iteration collect some statistic about the configuration
 		if (i % 1000 == 0)
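
This file differs from the previous one mainly in running the force and energy kernels on the cell-list-sorted copy of the particles. A short sketch of that variant, assuming the force property index and the merge_sort call from the original example:

    auto it = vd.getDomainIteratorGPU();

    // compute on the sorted particle view that matches the cell-list layout
    CUDA_LAUNCH(calc_force_gpu,it,vd.toKernel_sorted(),NN.toKernel(),sigma12,sigma6,r_cut2);

    // copy the property computed on the sorted copy back to the unsorted particles
    vd.merge_sort<force>(NN);
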
diff --git a/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu b/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu
index 4535301905776407eb225b9398c7240a1a1b4b0c..7e7eab9d99f8c2b10a2aa04e124e098e5cbe0891 100644
--- a/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu
+++ b/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu
@@ -531,7 +531,7 @@ void max_acceleration_and_velocity(particles & vd, real_number & max_acc, real_n
 	// Calculate the maximum acceleration
 	auto part = vd.getDomainIteratorGPU();
 
-	max_acceleration_and_velocity_gpu<<<part.wthr,part.thr>>>(vd.toKernel());
+	CUDA_LAUNCH(max_acceleration_and_velocity_gpu,part,vd.toKernel());
 
 	max_acc = reduce_local<red,_max_>(vd);
 	max_vel = reduce_local<red2,_max_>(vd);
@@ -714,7 +714,7 @@ void euler_int(particles & vd, real_number dt)
 
 	real_number dt205 = dt*dt*0.5;
 
-	euler_int_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),dt,dt205);
+	CUDA_LAUNCH(euler_int_gpu,part,vd.toKernel(),dt,dt205);
 
 	// remove the particles
 	remove_marked<red>(vd);
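
The SPH hunks pair each launch with a device-side reduction: the kernel writes a per-particle scalar into a scratch property, and reduce_local collapses it to a single value. A sketch of that pattern, assuming a hypothetical kernel store_speed_gpu and the velocity/red2 property indices of the SPH example:

    // Hypothetical kernel: store each particle's speed in the red2 scratch property.
    template<typename vector_type>
    __global__ void store_speed_gpu(vector_type vd)
    {
        auto a = GET_PARTICLE(vd);

        Point<3,real_number> v(vd.template getProp<velocity>(a));

        vd.template getProp<red2>(a) = sqrtf(v.get(0)*v.get(0) + v.get(1)*v.get(1) + v.get(2)*v.get(2));
    }

    auto part = vd.getDomainIteratorGPU();
    CUDA_LAUNCH(store_speed_gpu,part,vd.toKernel());

    // maximum speed over the local particles
    real_number max_vel = reduce_local<red2,_max_>(vd);
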
diff --git a/src/Vector/vector_dist_comm.hpp b/src/Vector/vector_dist_comm.hpp
index f31f4538a454f85851b1a2c221a033e8190b6dfe..8dc777dba8a2fe8ffbe68d49d5a9a10c6ec4dba1 100644
--- a/src/Vector/vector_dist_comm.hpp
+++ b/src/Vector/vector_dist_comm.hpp
@@ -1269,42 +1269,6 @@ class vector_dist_comm
 
 			// The first part of m_opart and prc_sz contain the local particles
 
-			#ifndef TEST1
-
-			v_pos_tmp.resize(prc_sz.template get<0>(0));
-			v_prp_tmp.resize(prc_sz.template get<0>(0));
-
-			auto ite = v_pos_tmp.getGPUIterator();
-
-			// fill v_pos_tmp and v_prp_tmp with local particles
-			process_map_particles<decltype(m_opart.toKernel()),decltype(v_pos_tmp.toKernel()),decltype(v_prp_tmp.toKernel()),
-					                                           decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>
-			<<<ite.wthr,ite.thr>>>
-			(m_opart.toKernel(),v_pos_tmp.toKernel(), v_prp_tmp.toKernel(),
-					            v_pos.toKernel(),v_prp.toKernel(),0);
-
-			size_t offset = prc_sz.template get<0>(0);
-
-			// Fill the sending buffers
-			for (size_t i = 0 ; i < m_pos.size() ; i++)
-			{
-				auto ite = m_pos.get(i).getGPUIterator();
-
-				process_map_particles<decltype(m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()),
-						                                           decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>
-				<<<ite.wthr,ite.thr>>>
-				(m_opart.toKernel(),m_pos.get(i).toKernel(), m_prp.get(i).toKernel(),
-						            v_pos.toKernel(),v_prp.toKernel(),offset);
-
-				offset += prc_sz_r.size();
-			}
-
-			// old local particles with the actual local particles
-			v_pos_tmp.swap(v_pos);
-			v_prp_tmp.swap(v_prp);
-
-			#else
-
 			int rank = v_cl.rank();
 
 			v_pos_tmp.resize(prc_sz.template get<0>(rank));
@@ -1350,7 +1314,6 @@ class vector_dist_comm
 			v_pos_tmp.swap(v_pos);
 			v_prp_tmp.swap(v_prp);
 
-			#endif
 #else
 
 			std::cout << __FILE__ << ":" << __LINE__ << " error RUN_ON_DEVICE require that you compile with NVCC, but it seem compiled with a normal compiler" << std::endl;
@@ -1478,35 +1441,6 @@ class vector_dist_comm
 			ite,
 			dec.toKernel(),v_pos.toKernel(),lbl_p.toKernel(),prc_sz.toKernel(),v_cl.rank());
 
-
-			#ifndef TEST1
-
-			// sort particles
-			mergesort((int *)lbl_p.template getDeviceBuffer<1>(),(int *)lbl_p.template getDeviceBuffer<0>(), lbl_p.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());
-
-			mem.allocate(sizeof(int));
-			mem.fill(0);
-
-			// Find the buffer bases
-			find_buffer_offsets<1,decltype(lbl_p.toKernel()),decltype(prc_sz.toKernel())><<<ite.wthr,ite.thr>>>
-					           (lbl_p.toKernel(),(int *)mem.getDevicePointer(),prc_sz.toKernel());
-
-#error "should not be here"
-
-			// Trasfer the number of offsets on CPU
-			mem.deviceToHost();
-			prc_sz.template deviceToHost<0,1>();
-			// get also the last element from lbl_p;
-			lbl_p.template deviceToHost<1>(lbl_p.size()-1,lbl_p.size()-1);
-
-			mem.deviceToHost();
-			int noff = *(int *)mem.getPointer();
-			prc_sz.resize(noff+1);
-			prc_sz.template get<0>(prc_sz.size()-1) = lbl_p.size();
-			prc_sz.template get<1>(prc_sz.size()-1) = lbl_p.template get<1>(lbl_p.size()-1);
-
-			#else
-
 			starts.resize(v_cl.size());
 			openfpm::scan((unsigned int *)prc_sz.template getDeviceBuffer<0>(), prc_sz.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());
 
@@ -1518,7 +1452,6 @@ class vector_dist_comm
 			// we order lbl_p
 			CUDA_LAUNCH((reorder_lbl<decltype(lbl_p.toKernel()),decltype(starts.toKernel())>),ite,lbl_p.toKernel(),starts.toKernel());
 
-			#endif
 
 #else
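
The block removed in the last two hunks was the older mergesort-based labelling path guarded by #ifndef TEST1; what survives is the scan-based path, where per-processor particle counts are exclusive-scanned into write offsets (openfpm::scan) and the reorder_lbl kernel places each labelled particle using those offsets. A toy, host-only illustration of that counting-sort idea, with made-up data:

    #include <cstdio>
    #include <vector>

    int main()
    {
        std::vector<int> label  = {2,0,1,0,2,1};   // destination processor of each particle
        std::vector<int> counts = {2,2,2};         // particles per processor (from the labelling kernel)

        // exclusive scan of the counts gives the base offset of each processor's block
        std::vector<int> starts(counts.size());
        int acc = 0;
        for (size_t i = 0 ; i < counts.size() ; i++)
        {   starts[i] = acc; acc += counts[i];   }

        // scatter: particle p goes to the next free slot of its processor's block
        std::vector<int> order(label.size());
        for (size_t p = 0 ; p < label.size() ; p++)
        {   order[starts[label[p]]++] = (int)p;   }

        for (size_t i = 0 ; i < order.size() ; i++)
        {   printf("%zu -> %d\n", i, order[i]);   }

        return 0;
    }

The device path kept above computes the offsets the same way with openfpm::scan on prc_sz, and reorder_lbl then scatters the labels with those offsets, so the per-processor send blocks come out contiguous.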