diff --git a/example/Performance/memBW/Makefile b/example/Performance/memBW/Makefile
index e78251aa5bc7a11f8d382e013340e0dbdf8c8ffc..bf092b81449e0dfa9710b98f55f7f0511056824c 100644
--- a/example/Performance/memBW/Makefile
+++ b/example/Performance/memBW/Makefile
@@ -14,19 +14,19 @@ ifdef HIP
 	CUDA_CC_LINK=hipcc
 else
 	ifdef CUDA_ON_CPU
-        	CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+        	CUDA_CC=mpic++ -g -x c++ $(INCLUDE_PATH)
         	INCLUDE_PATH_NVCC=
         	CUDA_CC_LINK=mpic++
         	CUDA_OPTIONS=-D__NVCC__ -DCUDART_VERSION=11000
         	LIBS_SELECT=$(LIBS)
 	else
         	ifeq (, $(shell which nvcc))
-                	CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+                	CUDA_CC=mpic++  -x c++ $(INCLUDE_PATH)
                 	INCLUDE_PATH_NVCC=
                 	CUDA_CC_LINK=mpic++
 			LIBS_SELECT=$(LIBS)
         	else
-                	CUDA_CC=nvcc -ccbin=mpic++
+                	CUDA_CC=nvcc -g -ccbin=mpic++
                 	CUDA_CC_LINK=nvcc -ccbin=mpic++
 			LIBS_SELECT=$(LIBS_NVCC)
         	endif
diff --git a/src/Grid/grid_dist_id.hpp b/src/Grid/grid_dist_id.hpp
index fb5383adbc7573a233c4f576cc642e69056ba443..428d38c4c736370a54e6c67eaf368a259dfc6e73 100644
--- a/src/Grid/grid_dist_id.hpp
+++ b/src/Grid/grid_dist_id.hpp
@@ -2834,6 +2834,37 @@ public:
 		}
 	}
 
+		/*! \brief apply a convolution on 2 property on GPU
+	 *
+	 *
+	 */
+	template<unsigned int prop_src1, unsigned int prop_src2, unsigned int prop_src3, 
+	         unsigned int prop_dst1, unsigned int prop_dst2, unsigned int prop_dst3, 
+			 unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
+	void conv3_b(grid_key_dx<dim> start, grid_key_dx<dim> stop , lambda_f func, ArgsT ... args)
+	{
+		for (int i = 0 ; i < loc_grid.size() ; i++)
+		{
+			Box<dim,long int> inte;
+
+			Box<dim,long int> base;
+			for (int j = 0 ; j < dim ; j++)
+			{
+				base.setLow(j,(long int)start.get(j) - (long int)gdb_ext.get(i).origin.get(j));
+				base.setHigh(j,(long int)stop.get(j) - (long int)gdb_ext.get(i).origin.get(j));
+			}
+
+			Box<dim,long int> dom = gdb_ext.get(i).Dbox;
+
+			bool overlap = dom.Intersect(base,inte);
+
+			if (overlap == true)
+			{
+				loc_grid.get(i).template conv3_b<prop_src1,prop_src2,prop_src3,prop_dst1,prop_dst2,prop_dst3,stencil_size>(inte.getKP1(),inte.getKP2(),func,args...);
+			}
+		}
+	}
+	
     template<typename NNtype>
     void findNeighbours()
     {
diff --git a/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu b/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
index 3e516415a91f7d86224d1cd0dc125036b3f1aff0..a0582fd4315f550d4dfc033511cf810af1d61636 100644
--- a/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
+++ b/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
@@ -957,6 +957,109 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_conv2_b_test_3d )
 	BOOST_REQUIRE_EQUAL(match,true);
 }
 
+BOOST_AUTO_TEST_CASE( sgrid_gpu_test_conv3_b_test_3d )
+{
+	#ifdef CUDA_ON_CPU
+	size_t sz[3] = {20,20,20};
+	#else
+	size_t sz[3] = {60,60,60};
+	#endif
+	periodicity<3> bc = {PERIODIC,PERIODIC,PERIODIC};
+
+	Ghost<3,long int> g(1);
+
+	Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
+
+	sgrid_dist_id_gpu<3,float,aggregate<float,float,float,float,float,float>> gdist(sz,domain,g,bc);
+
+	gdist.template setBackgroundValue<0>(666);
+	gdist.template setBackgroundValue<1>(666);
+	gdist.template setBackgroundValue<2>(666);
+	gdist.template setBackgroundValue<3>(666);
+	gdist.template setBackgroundValue<4>(666);
+	gdist.template setBackgroundValue<5>(666);
+
+	/////// GPU insert + flush
+
+	Box<3,size_t> box({1,1,1},{sz[0]-1,sz[1]-1,sz[2]-1});
+
+	/////// GPU Run kernel
+
+	float c = 5.0;
+
+	typedef typename GetAddBlockType<decltype(gdist)>::type InsertBlockT;
+
+	gdist.addPoints(box.getKP1(),box.getKP2(),
+			        [] __device__ (int i, int j, int k)
+			        {
+						return true;
+			        },
+			        [c] __device__ (InsertBlockT & data, int i, int j, int k)
+			        {
+			        	data.template get<0>() = c + i + j + k;
+						data.template get<1>() = c + 1000 + i + j + k;
+						data.template get<2>() = c + 10000 + i + j + k;
+			        }
+			        );
+
+	gdist.template flush<smax_<0>,smax_<1>,smax_<2>>(flush_type::FLUSH_ON_DEVICE);
+
+	gdist.template ghost_get<0,1,2>(RUN_ON_DEVICE);
+
+	for (int i = 0 ; i < 10 ; i++)
+	{
+		gdist.template ghost_get<0,1,2>(RUN_ON_DEVICE);
+	}
+
+	// Now run the convolution
+
+	typedef typename GetCpBlockType<decltype(gdist),0,1>::type CpBlockType;
+
+	gdist.template conv3_b<0,1,2,3,4,5,1>({2,2,2},{(int)sz[0]-2,(int)sz[1]-2,(int)sz[2]-2},[] __device__ (float & u_out, float & v_out, float & m_out, CpBlockType & u, CpBlockType & v , CpBlockType & m, auto & block, int offset,int i, int j, int k){
+		u_out = u(i+1,j,k) - u(i-1,j,k) + u(i,j+1,k) - u(i,j-1,k) + u(i,j,k+1) - u(i,j,k-1) + block.template get<0>()[offset];
+		v_out = v(i+1,j,k) - v(i-1,j,k) + v(i,j+1,k) - v(i,j-1,k) + v(i,j,k+1) - v(i,j,k-1) + block.template get<1>()[offset];
+		m_out = m(i+1,j,k) - m(i-1,j,k) + m(i,j+1,k) - m(i,j-1,k) + m(i,j,k+1) - m(i,j,k-1) + block.template get<2>()[offset];
+	});
+
+	gdist.deviceToHost<0,1,2,3,4,5>();
+
+	// Now we check that ghost is correct
+
+	auto it3 = gdist.getSubDomainIterator({2,2,2},{(int)sz[0]-2,(int)sz[1]-2,(int)sz[2]-2});
+
+	bool match = true;
+
+	while (it3.isNext())
+	{
+		auto p = it3.get();
+
+		auto p_xp1 = p.move(0,1);
+		auto p_xm1 = p.move(0,-1);
+		auto p_yp1 = p.move(1,1);
+		auto p_ym1 = p.move(1,-1);
+		auto p_zp1 = p.move(2,1);
+		auto p_zm1 = p.move(2,-1);
+
+		float sub1 = gdist.template get<3>(p);
+		float sub2 = gdist.template get<4>(p);
+		float sub3 = gdist.template get<5>(p);
+
+		if (sub1 != 6.0 + gdist.template get<0>(p) || sub2 != 6.0 + gdist.template get<1>(p) || sub3 != 6.0 + gdist.template get<2>(p))
+		{
+			std::cout << sub1 << "  " << sub2 << "  " << sub3 << std::endl;
+			std::cout << gdist.template get<0>(p_xp1) << "   " << gdist.template get<0>(p_xm1) << std::endl;
+			std::cout << gdist.template get<1>(p_xp1) << "   " << gdist.template get<1>(p_xm1) << std::endl;
+			std::cout << gdist.template get<2>(p_xp1) << "   " << gdist.template get<2>(p_xm1) << std::endl;
+			match = false;
+			break;
+		}
+
+		++it3;
+	}
+
+	BOOST_REQUIRE_EQUAL(match,true);
+}
+
 BOOST_AUTO_TEST_CASE( sgrid_gpu_test_ghost_point_remove )
 {
 	size_t sz[3] = {60,60,60};