diff --git a/openfpm_data b/openfpm_data
index e91db651a61479ae33ef2a66c3cfce70315e16b6..66a3a95d0f1868b244555f70dfdd579ed3f1ee1f 160000
--- a/openfpm_data
+++ b/openfpm_data
@@ -1 +1 @@
-Subproject commit e91db651a61479ae33ef2a66c3cfce70315e16b6
+Subproject commit 66a3a95d0f1868b244555f70dfdd579ed3f1ee1f
diff --git a/src/Grid/Iterators/grid_dist_id_iterator_sub.hpp b/src/Grid/Iterators/grid_dist_id_iterator_sub.hpp
index 54682ac72e2b5f6394144e2354cd66dc0a89904f..417902851405306098156eb95cc5296eefccb313 100644
--- a/src/Grid/Iterators/grid_dist_id_iterator_sub.hpp
+++ b/src/Grid/Iterators/grid_dist_id_iterator_sub.hpp
@@ -209,9 +209,9 @@ class grid_dist_iterator_sub
 	 * \return the actual key
 	 *
 	 */
-	inline grid_dist_key_dx<dim> get()
+	inline grid_dist_key_dx<dim,typename device_grid::base_key> get()
 	{
-		return grid_dist_key_dx<dim>(g_c,a_it.get());
+		return grid_dist_key_dx<dim,typename device_grid::base_key>(g_c,a_it.get());
 	}
 
 	/*! \brief Convert a g_dist_key_dx into a global key
diff --git a/src/Grid/grid_dist_id.hpp b/src/Grid/grid_dist_id.hpp
index ea381b032e10d06f044944e03cf9ec8d87afe3c2..1ee96dab7969da57c4848e3c3db5e7280ba401f9 100644
--- a/src/Grid/grid_dist_id.hpp
+++ b/src/Grid/grid_dist_id.hpp
@@ -2551,7 +2551,7 @@ public:
 	 *
 	 */
 	template<unsigned int prop_src1, unsigned int prop_src2, unsigned int prop_dst1, unsigned int prop_dst2, unsigned int stencil_size, unsigned int N, typename lambda_f, typename ... ArgsT >
-	void conv2(int (& stencil)[N][dim], grid_key_dx<3> start, grid_key_dx<3> stop , lambda_f func, ArgsT ... args)
+	void conv2(int (& stencil)[N][dim], grid_key_dx<dim> start, grid_key_dx<dim> stop , lambda_f func, ArgsT ... args)
 	{
 		for (int i = 0 ; i < loc_grid.size() ; i++)
 		{
@@ -2575,6 +2575,44 @@ public:
 		}
 	}
 
+	/*! \brief apply a convolution using the stencil N
+	 *
+	 *
+	 */
+	template<unsigned int prop_src1, unsigned int prop_src2, unsigned int prop_dst1, unsigned int prop_dst2, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
+	void conv2(grid_key_dx<dim> start, grid_key_dx<dim> stop , lambda_f func, ArgsT ... args)
+	{
+		for (int i = 0 ; i < loc_grid.size() ; i++)
+		{
+			Box<dim,long int> inte;
+
+			Box<dim,long int> base;
+			for (int j = 0 ; j < dim ; j++)
+			{
+				base.setLow(j,(long int)start.get(j) - (long int)gdb_ext.get(i).origin.get(j));
+				base.setHigh(j,(long int)stop.get(j) - (long int)gdb_ext.get(i).origin.get(j));
+			}
+
+			Box<dim,long int> dom = gdb_ext.get(i).Dbox;
+
+			bool overlap = dom.Intersect(base,inte);
+
+			if (overlap == true)
+			{
+				loc_grid.get(i).template conv2<prop_src1,prop_src2,prop_dst1,prop_dst2,stencil_size>(inte.getKP1(),inte.getKP2(),func,args...);
+			}
+		}
+	}
+
+    template<typename NNtype>
+    void findNeighbours()
+    {
+		for (int i = 0 ; i < loc_grid.size() ; i++)
+		{
+			loc_grid.get(i).findNeighbours();
+		}
+    }
+
 	/*! \brief apply a convolution using the stencil N
 	 *
 	 *
diff --git a/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu b/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
index 9caabf1c993653619d6e3a19be20617c9876ea3b..cc8899be5345679495e498c6bf589a654de801ca 100644
--- a/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
+++ b/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
@@ -287,9 +287,6 @@ void sgrid_ghost_get(size_t (& sz)[2],size_t (& sz2)[2])
 		++it2;
 	}
 
-	gdist.write("after_ghost");
-
-	gdist.getDecomposition().write("sgrid_dec");
 
 	BOOST_REQUIRE_EQUAL(match,true);
 }
@@ -309,5 +306,82 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_ghost_get )
 }
 
 
+BOOST_AUTO_TEST_CASE( sgrid_gpu_test_conv2_test )
+{
+	size_t sz[2] = {164,164};
+	periodicity<2> bc = {PERIODIC,PERIODIC};
+
+	Ghost<2,long int> g(1);
+
+	Box<2,float> domain({0.0,0.0},{1.0,1.0});
+
+	sgrid_dist_id_gpu<2,float,aggregate<float,float,float,float>> gdist(sz,domain,g,bc);
+
+	gdist.template setBackgroundValue<0>(666);
+	gdist.template setBackgroundValue<1>(666);
+	gdist.template setBackgroundValue<2>(666);
+	gdist.template setBackgroundValue<3>(666);
+
+	/////// GPU insert + flush
+
+	Box<2,size_t> box({1,1},{sz[0],sz[1]});
+
+	/////// GPU Run kernel
+
+	float c = 5.0;
+
+	auto it = gdist.getGridIterator(box.getKP1(),box.getKP2());
+	gdist.template iterateGridGPU<insert_kernel2D<0>>(it,c);
+	gdist.template flush<smax_<0>>(flush_type::FLUSH_ON_DEVICE);
+
+	auto it2 = gdist.getGridIterator(box.getKP1(),box.getKP2());
+	gdist.template iterateGridGPU<insert_kernel2D<1>>(it2,c+1000);
+	gdist.template flush<smax_<0>,smax_<1>>(flush_type::FLUSH_ON_DEVICE);
+
+	gdist.template ghost_get<0,1>(RUN_ON_DEVICE);
+
+	// Now run the convolution
+
+	typedef typename GetCpBlockType<decltype(gdist),0,1>::type CpBlockType;
+
+	gdist.template conv2<0,1,2,3,1>({2,2},{(int)sz[0]-2,(int)sz[1]-2},[] __device__ (float & u_out, float & v_out, CpBlockType & u, CpBlockType & v,int i, int j){
+		u_out = u(i+1,j) - u(i-1,j) + u(i,j+1) - u(i,j-1);
+		v_out = v(i+1,j) - v(i-1,j) + v(i,j+1) - v(i,j-1);
+	});
+
+	gdist.deviceToHost<0,1,2,3>();
+
+	// Now we check that ghost is correct
+
+	auto it3 = gdist.getSubDomainIterator({2,2},{(int)sz[0]-2,(int)sz[1]-2});
+
+	bool match = true;
+
+	while (it3.isNext())
+	{
+		auto p = it3.get();
+
+		auto p_xp1 = p.move(0,1);
+		auto p_xm1 = p.move(0,-1);
+		auto p_yp1 = p.move(1,1);
+		auto p_ym1 = p.move(1,-1);
+
+		float sub1 = gdist.template get<2>(p);
+		float sub2 = gdist.template get<3>(p);
+
+		if (sub1 != 2.0 || sub2 != 4.0)
+		{
+			std::cout << sub1 << "  " << sub2 << std::endl;
+			std::cout << gdist.template get<0>(p_xp1) << "   " << gdist.template get<0>(p_xm1) << std::endl;
+			std::cout << gdist.template get<1>(p_xp1) << "   " << gdist.template get<1>(p_xm1) << std::endl;
+			break;
+		}
+
+		++it3;
+	}
+
+
+	BOOST_REQUIRE_EQUAL(match,true);
+}
 
 BOOST_AUTO_TEST_SUITE_END()