diff --git a/example/Grid/3_gray_scott_3d_vectorization/Makefile b/example/Grid/3_gray_scott_3d_vectorization/Makefile
index b92942d4611f3563c1ba1850d53cfea737c1a87a..9488bea01f7cb9eb49d5570ea692d8091ff9cfcb 100644
--- a/example/Grid/3_gray_scott_3d_vectorization/Makefile
+++ b/example/Grid/3_gray_scott_3d_vectorization/Makefile
@@ -10,7 +10,7 @@ OBJ = main.o update_new.o
 	mpif90 -ffree-line-length-none -fno-range-check -fno-second-underscore  -fimplicit-none  -mavx -O3 -c -g -o $@ $<
 
 %.o: %.cpp
-	$(CC) -O0 -mavx  -g -c --std=c++11 -Wno-ignored-attributes  -o  $@ $< $(INCLUDE_PATH) -I/home/i-bird/VC/include
+	$(CC) -O3 -mavx  -g -c --std=c++11 -Wno-ignored-attributes  -o  $@ $< $(INCLUDE_PATH) -I/home/i-bird/VC/include
 
 gray_scott: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) -L/home/i-bird/VC/lib  -lVc
diff --git a/example/Grid/3_gray_scott_3d_vectorization/main.cpp b/example/Grid/3_gray_scott_3d_vectorization/main.cpp
index db8f5ea04780b615b31b21e3cadbd28ef6d806aa..f153be0dcdf63e7403b9570a72f96d68317c532c 100644
--- a/example/Grid/3_gray_scott_3d_vectorization/main.cpp
+++ b/example/Grid/3_gray_scott_3d_vectorization/main.cpp
@@ -120,7 +120,7 @@ void step(grid_dist_id<3, double, aggregate<double>> & OldU,
 
 			auto & U_new = GET_GRID_M(NewU);
 			auto & V_new = GET_GRID_M(NewV);
-	ITERATE_3D_M
+	ITERATE_3D_M(Vc::double_v::Size)
 
 			// center point
 			auto Cp = it.getStencil<0>();
@@ -224,7 +224,7 @@ int main(int argc, char* argv[])
 	Box<3,double> domain({0.0,0.0},{2.5,2.5,2.5});
 	
 	// grid size
-        size_t sz[3] = {128,128,128};
+        size_t sz[3] = {256,256,256};
 
 	// Define periodicity of the grid
 	periodicity<3> bc = {PERIODIC,PERIODIC,PERIODIC};
diff --git a/openfpm_numerics b/openfpm_numerics
index a8e821cff01dd39434b7e32495dbbe4b4544e013..7b9aff6deec78f5db1be1035a7c253927ea477bf 160000
--- a/openfpm_numerics
+++ b/openfpm_numerics
@@ -1 +1 @@
-Subproject commit a8e821cff01dd39434b7e32495dbbe4b4544e013
+Subproject commit 7b9aff6deec78f5db1be1035a7c253927ea477bf
diff --git a/src/Grid/Iterators/grid_dist_id_iterator.hpp b/src/Grid/Iterators/grid_dist_id_iterator.hpp
index 612477712a0368ae3fca5c561a15c6f83b343e26..60cb2e0c4d3573cc86a0ce95b8995a09225b93c8 100644
--- a/src/Grid/Iterators/grid_dist_id_iterator.hpp
+++ b/src/Grid/Iterators/grid_dist_id_iterator.hpp
@@ -71,8 +71,6 @@ class grid_dist_iterator<dim,device_grid,FREE,stencil>
 		while (g_c < gList.size() && (gList.get(g_c).size() == 0 || gdb_ext.get(g_c).Dbox.isValid() == false ) )
 		{g_c++;}
 
-		dg = &gList.get(g_c);
-
 		// get the next grid iterator
 		if (g_c < gList.size())
 		{
diff --git a/src/Grid/Iterators/grid_dist_id_iterator_sub.hpp b/src/Grid/Iterators/grid_dist_id_iterator_sub.hpp
index e4e86b41b53af67d4bab16f7ac6da48a6856f9cf..d0742e1118adce6bcac476e47193f6cf3ab00871 100644
--- a/src/Grid/Iterators/grid_dist_id_iterator_sub.hpp
+++ b/src/Grid/Iterators/grid_dist_id_iterator_sub.hpp
@@ -302,20 +302,21 @@ class grid_dist_iterator_sub
 										int sx = uhi[0]+1;\
 										int sxsy = (uhi[0]+1)*(uhi[1]+1);
 
-#define ITERATE_3D_M			int i = lo[2];\
-								for ( ; i <= hi[2] ; i+=1)\
-								{\
-									int j = lo[1];\
-									for ( ; j <= hi[1] ; j+=1)\
+#define ITERATE_3D_M(n_pt)				int i = lo[2];\
+									for ( ; i <= hi[2] ; i+=1)\
 									{\
-										int k = lo[0];\
-										for ( ; k <= hi[0] ; k+=Vc::double_v::Size)\
-										{
+										int j = lo[1];\
+										for ( ; j <= hi[1] ; j+=1)\
+										{\
+											int k = lo[0];\
+											for ( ; k <= hi[0] ; k+=n_pt)\
+											{
+
 
 #define GET_GRID_M(grid)	grid.get_loc_grid(s);
 
 
-#define END_LOOP_M 					it.private_sum<Vc::double_v::Size>();\
+#define END_LOOP_M(n_pt) 					it.private_sum<n_pt>();\
 								}\
 								it.private_adjust( - k + sx + lo[0]);\
 							}\
diff --git a/src/Grid/Iterators/grid_dist_id_iterators_unit_tests.hpp b/src/Grid/Iterators/grid_dist_id_iterators_unit_tests.hpp
index a1ffcb613b6ef23fd04cb95c2c475e19fff7fa6f..0c62f40e589ca8ca8555a16be9d72ff83ffc4440 100644
--- a/src/Grid/Iterators/grid_dist_id_iterators_unit_tests.hpp
+++ b/src/Grid/Iterators/grid_dist_id_iterators_unit_tests.hpp
@@ -320,7 +320,7 @@ void Test3D_stencil(const Box<3,float> & domain, long int k)
 			Ghost<3,long int> g(1);
 
 			// Distributed grid with id decomposition
-			grid_dist_id<3, float, aggregate<long int>, CartDecomposition<3,float>> g_dist(sz,domain,g);
+			grid_dist_id<3, float, aggregate<long int, long int, double>, CartDecomposition<3,float>> g_dist(sz,domain,g);
 
 			// fill the grid with values
 
@@ -382,6 +382,97 @@ void Test3D_stencil(const Box<3,float> & domain, long int k)
 	}
 }
 
+void Test3D_fast_vect(const Box<3,float> & domain, long int k)
+{
+	grid_key_dx<3> star_stencil_3D[7] = {{0,0,0},
+	                                         {0,0,-1},
+											 {0,0,1},
+											 {0,-1,0},
+											 {0,1,0},
+											 {-1,0,0},
+											 {1,0,0}};
+
+	{
+		Vcluster & v_cl = create_vcluster();
+
+		if ( v_cl.getProcessingUnits() > 32 )
+			return;
+
+		long int big_step = k / 30;
+		big_step = (big_step == 0)?1:big_step;
+		long int small_step = 21;
+
+		print_test( "Testing grid 3D fast stencil k<=",k);
+
+		// 3D test
+		for ( ; k >= 2 ; k-= (k > 2*big_step)?big_step:small_step )
+		{
+			BOOST_TEST_CHECKPOINT( "Testing grid skin iterator from decomposition k<=" << k );
+
+			// grid size
+			size_t sz[3];
+			sz[0] = k;
+			sz[1] = k;
+			sz[2] = k;
+
+			if (k <= 9)
+				continue;
+
+			Ghost<3,long int> g(1);
+
+			// Distributed grid with id decomposition
+			grid_dist_id<3, float, aggregate<long int>, CartDecomposition<3,float>> g_dist(sz,domain,g);
+
+			// fill the grid with values
+
+			auto it = g_dist.getDomainGhostIterator();
+
+			while (it.isNext())
+			{
+				auto p = it.get();
+				auto gkey = it.getGKey(p);
+
+				g_dist.template get<0>(p) = gkey.get(0)*gkey.get(0) + gkey.get(1)*gkey.get(1) + gkey.get(2)*gkey.get(2);
+
+				++it;
+			}
+
+			g_dist.ghost_get<0>();
+
+			size_t ret = true;
+
+			WHILE_M(g_dist,star_stencil_3D)
+				auto & gstl = GET_GRID_M(g_dist);
+			ITERATE_3D_M(1)
+				// center point
+				auto Cp = it.getStencil<0>();
+
+				// plus,minus X,Y,Z
+				auto mx = it.getStencil<1>();
+				auto px = it.getStencil<2>();
+				auto my = it.getStencil<3>();
+				auto py = it.getStencil<4>();
+				auto mz = it.getStencil<5>();
+				auto pz = it.getStencil<6>();
+
+				long int sum = -6*gstl.template get<0>(Cp) +
+						     gstl.template get<0>(mx) +
+							 gstl.template get<0>(px) +
+							 gstl.template get<0>(my) +
+							 gstl.template get<0>(py) +
+							 gstl.template get<0>(mz) +
+							 gstl.template get<0>(pz);
+
+				ret &= (sum == 6);
+
+			END_LOOP_M(1)
+
+			BOOST_REQUIRE_EQUAL(ret,true);
+		}
+
+	}
+}
+
 // Test decomposition grid iterator
 
 void Test3D_decskinit(const Box<3,float> & domain, long int k)
@@ -517,6 +608,16 @@ BOOST_AUTO_TEST_CASE( grid_dist_it_iterators_skin_test )
 	Test3D_decskinit(domain3,k);
 }
 
+BOOST_AUTO_TEST_CASE( grid_dist_it_iterators_3D_fast )
+{
+	// Domain
+	Box<3,float> domain3({0.0,0.0,0.0},{1.0,1.0,1.0});
+
+	size_t k = 128*128*128*create_vcluster().getProcessingUnits();
+	k = std::pow(k, 1/3.);
+	Test3D_fast_vect(domain3,k);
+}
+
 BOOST_AUTO_TEST_SUITE_END()
 
 #endif /* SRC_GRID_ITERATORS_GRID_DIST_ID_ITERATORS_UNIT_TESTS_HPP_ */
diff --git a/src/Grid/grid_dist_id_comm.hpp b/src/Grid/grid_dist_id_comm.hpp
index da94e89a1ce970dd92dd77d991b35735a0619d61..fe1c9ac8933ef974c9506b5243000f59028c1c85 100644
--- a/src/Grid/grid_dist_id_comm.hpp
+++ b/src/Grid/grid_dist_id_comm.hpp
@@ -247,18 +247,27 @@ class grid_dist_id_comm
 				if (bx_dst.isValid() == false)
 					continue;
 
+				const auto & gs = loc_grid.get(i);
+				auto & gd = loc_grid.get(sub_id_dst);
+
 #ifdef SE_CLASS1
 
 				if (loc_eg_box.get(sub_id_dst).bid.get(k).sub != i)
-					std::cerr << "Error " << __FILE__ << ":" << __LINE__ << " source and destination are not correctly linked" << "\n";
+				{std::cerr << "Error " << __FILE__ << ":" << __LINE__ << " source and destination are not correctly linked" << "\n";}
 
-				if (sub_src.getVolume() != sub_dst.getVolume())
-					std::cerr << "Error " << __FILE__ << ":" << __LINE__ << " source and destination does not match in size" << "\n";
+				if (bx_src.getVolumeKey() != bx_dst.getVolumeKey())
+				{std::cerr << "Error " << __FILE__ << ":" << __LINE__ << " source and destination does not match in size" << "\n";}
 
-#endif
+				auto bxs = gs.getGrid().getBoxKey();
+				auto bxd = gd.getGrid().getBoxKey();
 
-				const auto & gs = loc_grid.get(i);
-				auto & gd = loc_grid.get(sub_id_dst);
+				if (bxs.isContained(bx_src) == false)
+				{std::cerr << "Error " << __FILE__ << ":" << __LINE__ << " the source box is out of bound of the local grid" << "\n";}
+
+				if (bxd.isContained(bx_dst) == false)
+				{std::cerr << "Error " << __FILE__ << ":" << __LINE__ << " the destination box is out of bound of the local grid" << "\n";}
+
+#endif
 
 				typedef typename std::remove_reference<decltype(gd)>::type grid_cp;
 				typedef typename std::remove_reference<decltype(loc_grid.get(i).getGrid())>::type grid_info_cp;
diff --git a/src/Grid/grid_dist_id_unit_test.cpp b/src/Grid/grid_dist_id_unit_test.cpp
index 364cc338b65fc42de6f25126c34f55cfa4707622..763ca8f5f957384f28247eb3a72fc4d2cebbb88d 100644
--- a/src/Grid/grid_dist_id_unit_test.cpp
+++ b/src/Grid/grid_dist_id_unit_test.cpp
@@ -1871,6 +1871,95 @@ BOOST_AUTO_TEST_CASE ( grid_ghost_correction )
 	Test_ghost_correction(domain,k,4);
 }
 
+BOOST_AUTO_TEST_CASE ( grid_overflow_round_off_error )
+{
+    size_t numGridPoint     =   100;
+    const double domainSize =   20851.7;
+    double domainLength = sqrt(domainSize);
+
+    Box<2,double> domain({0.0,0.0},{domainLength,domainLength});
+
+    size_t sz[2] = {numGridPoint,numGridPoint};
+
+    periodicity<2> bc = {PERIODIC,PERIODIC};
+
+    Ghost<2,double> g(3.0*(domain.getHigh(0) - domain.getLow(0))/numGridPoint + 0.001);
+
+    grid_dist_id<2, double, aggregate<double, double, double, double, double>> grid(sz,domain,g,bc);
+
+    auto & gs = grid.getGridInfo();
+
+    auto it = grid.getDomainIterator();
+
+    while (it.isNext())
+    {
+    	auto p = it.get();
+    	auto gp = it.getGKey(p);
+
+    	grid.get<0>(p) = gs.LinId(gp);
+
+    	++it;
+    }
+
+    grid.ghost_get<0>();
+
+    // Now we check
+
+    auto it2 = grid.getDomainIterator();
+
+    bool match = true;
+
+    while (it2.isNext())
+    {
+    	auto p = it2.get();
+    	auto gp = it.getGKey(p);
+
+    	if (gs.LinId(gp) != grid.get<0>(p))
+    	{match = false;}
+
+    	// look around
+
+    	auto px = p.move(0,1);
+    	auto gpx = it.getGKey(px);
+    	auto mx = p.move(0,-1);
+    	auto gmx = it.getGKey(mx);
+
+    	auto py = p.move(1,1);
+    	auto gpy = it.getGKey(py);
+    	auto my = p.move(1,-1);
+    	auto gmy = it.getGKey(my);
+
+    	gpx.set_d(0,gpx.get(0) % gs.size(0));
+    	gpx.set_d(1,gpx.get(1) % gs.size(1));
+
+    	if (grid.template get<0>(px) != gs.LinId(gpx))
+    	{match = false;}
+
+    	gmx.set_d(0,(gmx.get(0) + gs.size(0)) % gs.size(0));
+    	gmx.set_d(1,(gmx.get(1) + gs.size(1)) % gs.size(1));
+
+    	if (grid.template get<0>(mx) != gs.LinId(gmx))
+    	{match = false;}
+
+    	gpy.set_d(0,gpy.get(0) % gs.size(0));
+    	gpy.set_d(1,gpy.get(1) % gs.size(1));
+
+    	if (grid.template get<0>(py) != gs.LinId(gpy))
+    	{match = false;}
+
+    	gmy.set_d(0,(gmy.get(0) + gs.size(0)) % gs.size(0));
+    	gmy.set_d(1,(gmy.get(1) + gs.size(1)) % gs.size(1));
+
+    	if (grid.template get<0>(my) != gs.LinId(gmy))
+    	{match = false;}
+
+    	++it2;
+    }
+
+    BOOST_REQUIRE_EQUAL(match,true);
+}
+
+
 BOOST_AUTO_TEST_SUITE_END()
 
 #endif
diff --git a/src/Grid/grid_dist_util.hpp b/src/Grid/grid_dist_util.hpp
index ed84a123e6c92160f1c6c84d9c7ec3a3e307303d..0d38b73a7eceb01b58777da73179323cfd1b8ab9 100644
--- a/src/Grid/grid_dist_util.hpp
+++ b/src/Grid/grid_dist_util.hpp
@@ -80,6 +80,15 @@ template<int dim, typename Decomposition> inline void create_gdb_ext(openfpm::ve
 		SpaceBox<Decomposition::dims, typename Decomposition::stype> sp = dec.getSubDomain(i);
 		SpaceBox<Decomposition::dims, typename Decomposition::stype> sp_g = dec.getSubDomainWithGhost(i);
 
+		// Because of round off we expand for safety the ghost area
+		// std::nextafter return the next bigger or smaller representable floating
+		// point number
+		for (size_t i = 0 ; i < Decomposition::dims ; i++)
+		{
+			sp_g.setLow(i,std::nextafter(sp_g.getLow(i),sp_g.getLow(i) - 1.0));
+			sp_g.setHigh(i,std::nextafter(sp_g.getHigh(i),sp_g.getHigh(i) + 1.0));
+		}
+
 		// Convert from SpaceBox<dim,St> to SpaceBox<dim,long int>
 		SpaceBox<Decomposition::dims,long int> sp_t = cd_sm.convertDomainSpaceIntoGridUnits(sp,dec.periodicity());
 		SpaceBox<Decomposition::dims,long int> sp_tg = cd_sm.convertDomainSpaceIntoGridUnits(sp_g,dec.periodicity());