diff --git a/example/Vector/7_SPH_dlb_gpu_opt/Makefile b/example/Vector/7_SPH_dlb_gpu_opt/Makefile
index ce1336a5d73aa5906577a50b5c12e4a8e8f9a76d..cca7077fdc1c3c4229ec9ad16db9200ea740a8f1 100644
--- a/example/Vector/7_SPH_dlb_gpu_opt/Makefile
+++ b/example/Vector/7_SPH_dlb_gpu_opt/Makefile
@@ -12,10 +12,12 @@ sph_dlb_test: OPT += -DTEST_RUN
 sph_dlb_test: sph_dlb
 
+# Compile CUDA sources to objects; keep -O3, this is the optimized example.
 %.o: %.cu
 	nvcc -O3 -g -c -isystem=/home/i-bird/MPI/include --std=c++11 -o $@ $< $(INCLUDE_PATH_NVCC)
 
+# Compile host C++ sources to objects; $(OPT) carries extra defines such as -DTEST_RUN.
 %.o: %.cpp
 	$(CC) -O3 $(OPT) -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH)
 
 sph_dlb: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)
diff --git a/example/Vector/7_SPH_dlb_gpu_opt/main.cu b/example/Vector/7_SPH_dlb_gpu_opt/main.cu
index b0177e4612e5b1ee5f98fe17c8acbe0cfbd54ea6..ba92a1a6cead3018a089ffccf9f514c52ca809b3 100644
--- a/example/Vector/7_SPH_dlb_gpu_opt/main.cu
+++ b/example/Vector/7_SPH_dlb_gpu_opt/main.cu
@@ -390,7 +390,7 @@ template<typename CellList> inline void calc_forces(particles & vd, CellList & N
 
 	vd.merge_sort<force,drho,red>(NN);
 
-	max_visc = reduce<red,_max_>(vd);
+	max_visc = reduce_local<red,_max_>(vd);
 }
 
 template<typename vector_type>
@@ -412,8 +412,8 @@ void max_acceleration_and_velocity(particles & vd, real_number & max_acc, real_n
 
 	max_acceleration_and_velocity_gpu<<<part.wthr,part.thr>>>(vd.toKernel());
 
-	max_acc = reduce<red,_max_>(vd);
-	max_vel = reduce<red2,_max_>(vd);
+	max_acc = reduce_local<red,_max_>(vd);
+	max_vel = reduce_local<red2,_max_>(vd);
 
 	Vcluster<> & v_cl = create_vcluster();
 	v_cl.max(max_acc);
diff --git a/src/Vector/cuda/vector_dist_cuda_func_test.cu b/src/Vector/cuda/vector_dist_cuda_func_test.cu
index d78b4a294d7d3f6b7f59421893a2e4db9ecd213d..63bf0b0aa7ffb5cf967e3483292586dac15d64a1 100644
--- a/src/Vector/cuda/vector_dist_cuda_func_test.cu
+++ b/src/Vector/cuda/vector_dist_cuda_func_test.cu
@@ -1094,10 +1094,28 @@ void vector_dist_remove_marked_type()
 	// Boundary conditions
 	size_t bc[3]={PERIODIC,PERIODIC,PERIODIC};
 
-	vector_dist_gpu<3,float,aggregate<float,float,int,int>> vd(5000*v_cl.size(),domain,bc,g);
+	vector_dist_gpu<3,float,aggregate<float,float,int,int>> vd(50000*v_cl.size(),domain,bc,g);
+
+	// Fill the position
 
 	auto it = vd.getDomainIterator();
 
+	while(it.isNext())
+	{
+		auto p = it.get();
+
+		vd.getPos(p)[0] = (float)rand() / RAND_MAX;
+		vd.getPos(p)[1] = (float)rand() / RAND_MAX;
+		vd.getPos(p)[2] = (float)rand() / RAND_MAX;
+
+		++it;
+	}
+
+	vd.map();
+	vd.template ghost_get<>();
+
+	it = vd.getDomainIterator();
+
 	float fc = 1.0;
 	float dc = 1.0;
 	int ic = 1;
diff --git a/src/Vector/cuda/vector_dist_cuda_funcs.cuh b/src/Vector/cuda/vector_dist_cuda_funcs.cuh
index 76a52e4805e194cab4b6143ac262598397c278ae..3e119645e6e1aa4564cf0f8212e94c74b5e599d5 100644
--- a/src/Vector/cuda/vector_dist_cuda_funcs.cuh
+++ b/src/Vector/cuda/vector_dist_cuda_funcs.cuh
@@ -104,11 +104,11 @@ __global__  void find_buffer_offsets(vector_type vd, int * cnt, vector_type_offs
 }
 
 template<unsigned int prp_off, typename vector_type,typename vector_type_offs>
-__global__  void find_buffer_offsets_no_prc(vector_type vd, int * cnt, vector_type_offs offs)
+__global__  void find_buffer_offsets_no_prc(vector_type vd, int * cnt, vector_type_offs offs, int g_m)
 {
     int p = threadIdx.x + blockIdx.x * blockDim.x;
 
-    if (p >= (int)vd.size() - 1) return;
+    if (p >= (int)g_m - 1) return;
 
     if (vd.template get<prp_off>(p) != vd.template get<prp_off>(p+1))
 	{
@@ -367,12 +367,12 @@ void remove_marked(vector_type & vd)
 
 	// mark point, particle that stay and to remove
 	find_buffer_offsets_no_prc<prp,decltype(vd.getPropVector().toKernel()),decltype(mark.toKernel())><<<ite.wthr,ite.thr>>>
-			           (vd.getPropVector().toKernel(),(int *)mem.getDevicePointer(),mark.toKernel());
+			           (vd.getPropVector().toKernel(),(int *)mem.getDevicePointer(),mark.toKernel(),vd.size_local());
 
 	mem.deviceToHost();
 
 	// we have no particles to remove
-	if (*(int *)mem.getPointer() == 0)
+	if (*(int *)mem.getPointer() != 1)
 	{return;}
 
 	// Get the mark point