
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Source / Target projects:
  • mosaic/software/parallel-computing/openfpm/openfpm_data
  • argupta/openfpm_data
Commits on Source (427)
Showing with 641 additions and 74 deletions
......@@ -36,7 +36,10 @@ find_package(Vc REQUIRED)
if (CUDA_ON_BACKEND STREQUAL "HIP" AND NOT HIP_FOUND)
find_package(HIP)
endif()
find_package(OpenMP)
if (NOT CUDA_ON_BACKEND STREQUAL "HIP")
find_package(OpenMP)
endif()
###### CONFIG.h FILE ######
......
......@@ -93,6 +93,8 @@ public:
}
}
static bool noPointers() {return true;}
#ifdef __NVCC__
//Constructors from dim3 and uint3 objects
__host__ __device__ grid_smb(const dim3 blockDimensions)
......@@ -340,6 +342,11 @@ public:
return sz;
}
__host__ __device__ inline indexT getBlockEgdeSize() const
{
return blockEdgeSize;
}
__host__ __device__ inline indexT getBlockSize() const
{
return blockSize;
......
......@@ -141,6 +141,11 @@ public:
grid_smb<dim,blockEdgeSize,indexT>::swap(other);
}
__host__ __device__ inline indexT getBlockEgdeSize() const
{
return blockEdgeSize;
}
__host__ __device__ inline indexT size() const
{
return grid_smb<dim,blockEdgeSize,indexT>::size();
......
......@@ -34,7 +34,7 @@ template<unsigned int dim>
struct comb
{
//! Array that store the combination
char c[dim];
signed char c[dim];
/*! \brief check if it is a valid combination
*
......@@ -105,7 +105,7 @@ struct comb
* \return Result combination
*
*/
inline comb<dim> operator&(char c_)
inline comb<dim> operator&(signed char c_)
{
comb<dim> ret;
......@@ -246,7 +246,7 @@ struct comb
*
*/
inline char operator[](int i) const
inline signed char operator[](int i) const
{
return c[i];
}
......@@ -257,7 +257,7 @@ struct comb
*
*/
inline char * getComb()
inline signed char * getComb()
{
return c;
}
......@@ -268,7 +268,7 @@ struct comb
*
*/
inline const char * getComb() const
inline const signed char * getComb() const
{
return c;
}
......@@ -282,7 +282,7 @@ struct comb
* \return value of the i index
*
*/
inline char value(int i) const
inline signed char value(int i) const
{
return c[i];
}
......@@ -314,10 +314,10 @@ struct comb
* \param c list of numbers
*
*/
comb(std::initializer_list<char> c)
comb(std::initializer_list<signed char> c)
{
size_t i = 0;
for(char x : c)
for(signed char x : c)
{this->c[c.size() - i - 1] = x;i++;}
}
......@@ -397,7 +397,7 @@ template<>
struct comb<0>
{
//! FIX
char c[0];
signed char c[0];
/*! \brief check if it is a valid combination
*
......@@ -467,7 +467,7 @@ struct comb<0>
*
*/
inline char operator[](int i)
inline signed char operator[](int i)
{
return 0;
}
......@@ -478,7 +478,7 @@ struct comb<0>
*
*/
inline char * getComb()
inline signed char * getComb()
{
return c;
}
......@@ -492,7 +492,7 @@ struct comb<0>
* \return value of the i index
*
*/
inline char value(int i) const
inline signed char value(int i) const
{
return c[i];
}
......
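The hunks above replace plain char with signed char throughout comb. The signedness of plain char is implementation-defined: it is signed on x86 but unsigned by default on ARM and PowerPC, so a combination component of -1 would silently become 255 there. A minimal standalone illustration (not OpenFPM code):

// Standalone demo of why plain char is unsafe for signed combination values:
// whether `char` can hold -1 depends on the target ABI.
#include <iostream>

int main()
{
    char c = -1;          // may become 255 where char is unsigned (e.g. ARM/PPC)
    signed char s = -1;   // always -1
    std::cout << static_cast<int>(c) << " " << static_cast<int>(s) << std::endl;
    return 0;
}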
......@@ -94,12 +94,12 @@ __global__ void copy_ndim_grid_device(grid_type src, grid_type dst)
#endif
template<bool inte_or_lin,unsigned int dim, typename T>
template<bool inte_or_lin, typename base_grid, unsigned int dim, typename T>
struct grid_toKernelImpl
{
template<typename grid_type> static grid_gpu_ker<dim,T,memory_traits_lin,typename grid_type::linearizer_type> toKernel(grid_type & gc)
template<typename grid_type> static base_grid toKernel(grid_type & gc)
{
grid_gpu_ker<dim,T,memory_traits_lin,typename grid_type::linearizer_type> g(gc.getGrid());
/*grid_gpu_ker<dim,T,memory_traits_lin,typename grid_type::linearizer_type>*/base_grid g(gc.getGrid());
g.get_data_().disable_manage_memory();
g.get_data_().mem = gc.get_internal_data_().mem;
......@@ -112,12 +112,12 @@ struct grid_toKernelImpl
}
};
template<unsigned int dim, typename T>
struct grid_toKernelImpl<true,dim,T>
template<typename base_grid, unsigned int dim, typename T>
struct grid_toKernelImpl<true,base_grid,dim,T>
{
template<typename grid_type> static grid_gpu_ker<dim,T,memory_traits_inte, typename grid_type::linearizer_type> toKernel(grid_type & gc)
template<typename grid_type> static base_grid toKernel(grid_type & gc)
{
grid_gpu_ker<dim,T,memory_traits_inte, typename grid_type::linearizer_type> g(gc.getGrid());
/*grid_gpu_ker<dim,T,memory_traits_inte, typename grid_type::linearizer_type>*/ base_grid g(gc.getGrid());
copy_switch_memory_c_no_cpy<typename std::remove_reference<decltype(gc.get_internal_data_())>::type,
typename std::remove_reference<decltype(g.get_data_())>::type> cp_mc(gc.get_internal_data_(),g.get_data_());
......
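The refactor above adds the kernel-side grid type (base_grid) as an explicit template parameter of grid_toKernelImpl, instead of spelling out grid_gpu_ker<...> inside each specialization. A minimal standalone sketch of the pattern, with illustrative names rather than the real OpenFPM types:

// Sketch: the caller picks the kernel-view type, so the same dispatcher can
// build grid_gpu_ker or any compatible wrapper without duplicating the body.
struct host_grid    { int info = 0; int getGrid() const { return info; } };
struct kernel_view_a { kernel_view_a(int) {} };
struct kernel_view_b { kernel_view_b(int) {} };

template<bool inte_layout, typename base_grid>
struct to_kernel_impl
{
    template<typename grid_type>
    static base_grid toKernel(grid_type & gc) { return base_grid(gc.getGrid()); }
};

// usage:
//   host_grid g;
//   auto ka = to_kernel_impl<false, kernel_view_a>::toKernel(g);
//   auto kb = to_kernel_impl<false, kernel_view_b>::toKernel(g);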
......@@ -160,6 +160,70 @@ BOOST_AUTO_TEST_CASE (gpu_computation)
#endif
}
BOOST_AUTO_TEST_CASE (gpu_computation_lambda)
{
#ifdef CUDA_GPU
{
size_t sz[3] = {64,64,64};
grid_gpu<3, aggregate<float,float[2],float[2][2]> > c3(sz);
c3.setMemory();
// Assign
auto c3_k = c3.toKernel();
auto lamb = [c3_k] __device__ (dim3 & blockIdx, dim3 & threadIdx)
{
grid_key_dx<3,int> p({blockIdx.x * blockDim.x + threadIdx.x,
blockIdx.y * blockDim.y + threadIdx.y,
blockIdx.z * blockDim.z + threadIdx.z});
c3_k.template get<0>(p) = 5.0;
c3_k.template get<1>(p)[0] = 5.0;
c3_k.template get<1>(p)[1] = 5.0;
c3_k.template get<2>(p)[0][0] = 5.0;
c3_k.template get<2>(p)[0][1] = 5.0;
c3_k.template get<2>(p)[1][0] = 5.0;
c3_k.template get<2>(p)[1][1] = 5.0;
};
auto ite = c3.getGPUIterator({0,0,0},{63,63,63});
CUDA_LAUNCH_LAMBDA(ite,lamb);
c3.deviceToHost<0,1,2>();
auto it = c3.getIterator();
bool good = true;
while(it.isNext())
{
auto key = it.get();
good &= c3.template get<0>(key) == 5.0;
good &= c3.template get<1>(key)[0] == 5.0;
good &= c3.template get<1>(key)[1] == 5.0;
good &= c3.template get<2>(key)[0][0] == 5.0;
good &= c3.template get<2>(key)[0][1] == 5.0;
good &= c3.template get<2>(key)[1][0] == 5.0;
good &= c3.template get<2>(key)[1][1] == 5.0;
++it;
}
BOOST_REQUIRE_EQUAL(good,true);
}
#endif
}
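The new gpu_computation_lambda test launches an extended __device__ lambda through CUDA_LAUNCH_LAMBDA. Note that the kernel view c3_k is captured by value and that blockIdx/threadIdx arrive as explicit parameters, presumably so the same lambda can also run on the sequential/OpenMP cudify backends. A plain-CUDA sketch of the underlying capture pattern (no OpenFPM, hypothetical launcher name, compile with nvcc --extended-lambda):

#include <cuda_runtime.h>

// Generic launcher: any callable marked __device__ can be passed by value.
template<typename Lambda>
__global__ void launch_lambda(Lambda f) { f(); }

int main()
{
    float * d_x;
    cudaMalloc(&d_x, 64 * sizeof(float));

    // Capture the raw device pointer by value, just as c3.toKernel() is captured above.
    auto lamb = [d_x] __device__ ()
    { d_x[blockIdx.x * blockDim.x + threadIdx.x] = 5.0f; };

    launch_lambda<<<2, 32>>>(lamb);
    cudaDeviceSynchronize();
    cudaFree(d_x);
    return 0;
}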
BOOST_AUTO_TEST_CASE (gpu_computation_stencil)
{
#ifdef CUDA_GPU
......
......@@ -158,7 +158,7 @@ class grid_gpu_ker
typedef typename layout_base<T_>::type layout;
//! layout data
layout data_;
mutable layout data_;
......@@ -264,14 +264,32 @@ public:
* \return the const reference of the element
*
*/
template <unsigned int p, typename ids_type, typename r_type=decltype(layout_base<T_>::template get_c<p>(data_,g1,grid_key_dx<dim>()))>
__device__ __host__ inline const r_type get(const grid_key_dx<dim,ids_type> & v1) const
template <unsigned int p, typename ids_type, typename r_type=decltype(layout_base<T_>::template get<p>(data_,g1,grid_key_dx<dim>()))>
__device__ __host__ inline r_type get_debug(const grid_key_dx<dim,ids_type> & v1) const
{
#ifdef SE_CLASS1
if (check_bound(v1) == false)
{fill_grid_error_array_overflow<dim,p>(this->template getPointer<p>(),v1);}
#endif
return layout_base<T_>::template get_c<p>(data_,g1,v1);
return layout_base<T_>::template get<p>(data_,g1,v1);
}
/*! \brief Get the const reference of the selected element
*
* \param v1 grid_key that identify the element in the grid
*
* \return the const reference of the element
*
*/
template <unsigned int p, typename ids_type, typename r_type=decltype(layout_base<T_>::template get<p>(data_,g1,grid_key_dx<dim>()))>
__device__ __host__ inline r_type get(const grid_key_dx<dim,ids_type> & v1) const
{
#ifdef SE_CLASS1
if (check_bound(v1) == false)
{fill_grid_error_array_overflow<dim,p>(this->template getPointer<p>(),v1);}
#endif
return layout_base<T_>::template get<p>(data_,g1,v1);
}
/*! \brief Get the reference of the selected element
......
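Two related changes above: data_ becomes mutable, and the const get<p>() now returns through layout_base::get (the old get_c path is preserved as get_debug). The effect is that a const kernel wrapper can still hand out writable references, which device code needs when the wrapper itself is passed around as const. A standalone sketch of the idiom with toy types:

// Toy sketch of the mutable-member idiom (not the OpenFPM layout classes).
struct kernel_view
{
    mutable float data[4];       // analogous to `mutable layout data_;`

    // const member function, yet the returned reference is writable
    float & get(int i) const { return data[i]; }
};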
......@@ -620,7 +620,7 @@ private:
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,dim,T_>::toKernel(*this);
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,grid_gpu_ker<dim,T_,layout_base,linearizer_type>,dim,T_>::toKernel(*this);
#endif
}
......@@ -705,8 +705,8 @@ public:
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,dim,T_>::toKernel(*this);
g.base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,dim,T_>::toKernel(g);
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,grid_gpu_ker<dim,T_,layout_base,linearizer_type>,dim,T_>::toKernel(*this);
g.base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,grid_gpu_ker<dim,T_,layout_base,linearizer_type>,dim,T_>::toKernel(g);
#endif
......@@ -726,7 +726,7 @@ public:
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,dim,T_>::toKernel(*this);
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,grid_gpu_ker<dim,T,layout_base,linearizer_type>,dim,T_>::toKernel(*this);
#endif
......@@ -809,7 +809,7 @@ public:
* \param stop end point
*
*/
struct ite_gpu<dim> getGPUIterator(grid_key_dx<dim,long int> & key1, grid_key_dx<dim,long int> & key2, size_t n_thr = default_kernel_wg_threads_) const
struct ite_gpu<dim> getGPUIterator(const grid_key_dx<dim,long int> & key1, const grid_key_dx<dim,long int> & key2, size_t n_thr = default_kernel_wg_threads_) const
{
return getGPUIterator_impl<dim>(g1,key1,key2,n_thr);
}
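Taking key1/key2 by const reference lets callers pass temporaries, such as the braced keys in the lambda test above (c3.getGPUIterator({0,0,0},{63,63,63})), which would not bind to the previous non-const references. A toy standalone illustration (not the OpenFPM types):

struct key3 { long int x, y, z; };
struct ite  { };

ite getGPUIterator(const key3 & key1, const key3 & key2) { return ite{}; }

int main()
{
    auto it = getGPUIterator({0,0,0}, {63,63,63});   // ok with const&, rejected with key3&
    (void) it;
    return 0;
}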
......@@ -850,12 +850,25 @@ public:
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,dim,T_>::toKernel(*this);
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,grid_gpu_ker<dim,T_,layout_base,linearizer_type>,dim,T_>::toKernel(*this);
#endif
}
/*! \brief Return the memory object
*
* Return the memory object
*
* \tparam p array to retrieve
*
*/
template<unsigned int p>
auto getMemory() -> decltype(boost::fusion::at_c<p>(data_).getMemory())
{
return boost::fusion::at_c<p>(data_).getMemory();
}
/*! \brief Set the object that provide memory from outside
*
* An external allocator is useful with allocator like PreAllocHeapMem
......@@ -880,7 +893,7 @@ public:
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,dim,T_>::toKernel(*this);
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,grid_gpu_ker<dim,T_,layout_base,linearizer_type>,dim,T_>::toKernel(*this);
#endif
}
......@@ -909,7 +922,7 @@ public:
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,dim,T_>::toKernel(*this);
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,grid_gpu_ker<dim,T_,layout_base,linearizer_type>,dim,T_>::toKernel(*this);
#endif
}
......@@ -1506,7 +1519,7 @@ public:
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,dim,T_>::toKernel(*this);
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,grid_gpu_ker<dim,T_,layout_base,linearizer_type>,dim,T_>::toKernel(*this);
#endif
}
......@@ -1536,8 +1549,8 @@ public:
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,dim,T_>::toKernel(*this);
grid.base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,dim,T_>::toKernel(grid);
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,grid_gpu_ker<dim,T_,layout_base,linearizer_type>,dim,T_>::toKernel(*this);
grid.base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,grid_gpu_ker<dim,T_,layout_base,linearizer_type>,dim,T_>::toKernel(grid);
#endif
}
......@@ -1876,7 +1889,7 @@ public:
*/
grid_gpu_ker<dim,T_,layout_base,linearizer_type> toKernel()
{
return grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,dim,T_>::toKernel(*this);
return grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,grid_gpu_ker<dim,T_,layout_base,linearizer_type>,dim,T_>::toKernel(*this);
}
/*! \brief Convert the grid into a data-structure compatible for computing into GPU
......@@ -1886,7 +1899,7 @@ public:
*/
const grid_gpu_ker<dim,T_,layout_base,linearizer_type> toKernel() const
{
return grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,dim,T_>::toKernel(*this);
return grid_toKernelImpl<is_layout_inte<layout_base<T_>>::value,grid_gpu_ker<dim,T_,layout_base,linearizer_type>,dim,T_>::toKernel(*this);
}
#endif
......
......@@ -108,12 +108,12 @@ struct ite_gpu
grid_key_dx<dim,int> start;
grid_key_dx<dim,int> stop;
size_t nblocks()
size_t nblocks() const
{
return wthr.x * wthr.y * wthr.z;
}
size_t nthrs()
size_t nthrs() const
{
return thr.x * thr.y * thr.z;
}
......@@ -365,6 +365,25 @@ public:
}
}
/*! \brief copy constructor
*
* \param g grid info
*
* construct a grid from another grid. As a copy constructor can't be template,
* the one above won't work for const grid_sm<N,S> & g, where S=T
*
*/
__device__ __host__ inline grid_sm(const grid_sm<N,T> & g)
{
size_tot = g.size_tot;
for (size_t i = 0 ; i < N ; i++)
{
sz[i] = g.sz[i];
sz_s[i] = g.sz_s[i];
}
}
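As the comment above says, a constructor template is never selected as the copy constructor, so the pre-existing templated grid_sm(const grid_sm<N,S> &) does not cover the S == T case; an explicit (and __device__ __host__ annotated) copy constructor is added. A standalone sketch of the language rule with a toy type:

// The template below is a converting constructor and is never used as the copy
// constructor; copying info<N,T> from info<N,T> uses the explicitly defined one.
template<unsigned int N, typename T>
struct info
{
    T sz[N];

    info() = default;

    // converting constructor: handles info<N,S> with S != T
    template<typename S>
    info(const info<N,S> & g)
    { for (unsigned int i = 0; i < N; i++) { sz[i] = g.sz[i]; } }

    // the actual copy constructor (in grid_sm also marked __device__ __host__)
    info(const info<N,T> & g)
    { for (unsigned int i = 0; i < N; i++) { sz[i] = g.sz[i]; } }
};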
// Static element to calculate total size
inline size_t totalSize(const size_t sz)
......@@ -432,7 +451,7 @@ public:
*/
template<typename check=NoCheck, typename ids_type>
inline mem_id LinId(const grid_key_dx<N,ids_type> & gk, const char sum_id[N]) const
inline mem_id LinId(const grid_key_dx<N,ids_type> & gk, const signed char sum_id[N]) const
{
mem_id lid;
......@@ -468,7 +487,7 @@ public:
*/
template<typename check=NoCheck,typename ids_type>
inline mem_id LinId(const grid_key_dx<N,ids_type> & gk, const char sum_id[N], const size_t (&bc)[N]) const
inline mem_id LinId(const grid_key_dx<N,ids_type> & gk, const signed char sum_id[N], const size_t (&bc)[N]) const
{
mem_id lid;
......
......@@ -43,7 +43,7 @@
*
*
*/
template <typename T, bool has_noPointers>
template <typename T, int has_noPointers>
struct Pack_selector_unknown_type_impl
{
enum
......@@ -53,7 +53,7 @@ struct Pack_selector_unknown_type_impl
};
template <typename T>
struct Pack_selector_unknown_type_impl<T,false>
struct Pack_selector_unknown_type_impl<T,0>
{
enum
{
......@@ -61,6 +61,28 @@ struct Pack_selector_unknown_type_impl<T,false>
};
};
/*! \brief Pack selector for unknown type
*
*
*/
template <typename T>
struct Pack_selector_unknown_type_impl<T,2>
{
enum
{
value = PACKER_ARRAY_CP_PRIMITIVE
};
};
template <typename T>
struct Pack_selector_unknown_type_impl<T,3>
{
enum
{
value = PACKER_ARRAY_CP_PRIMITIVE
};
};
/*! \brief Pack selector for unknown type
*
*
......@@ -70,7 +92,7 @@ struct Pack_selector_known_type_impl
{
enum
{
value = Pack_selector_unknown_type_impl<T, has_noPointers<T>::value >::value
value = Pack_selector_unknown_type_impl<T, has_noPointers<T>::value + 2*std::is_array<T>::value >::value
};
};
......
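The selector above now dispatches on an int computed as has_noPointers<T>::value + 2*std::is_array<T>::value, so non-array types keep indices 0/1 while C-array types land on 2/3, both of which map to PACKER_ARRAY_CP_PRIMITIVE. A self-contained sketch of the same dispatch; the PACKER_* values and the has_noPointers stand-in are re-declared here only so the snippet compiles on its own, they are not the real OpenFPM definitions:

#include <type_traits>

enum { PACKER_GENERAL = 0, PACKER_PRIMITIVE = 1, PACKER_ARRAY_CP_PRIMITIVE = 2 };

template<typename T, int idx>                       // idx = noPointers + 2 * is_array
struct selector                  { enum { value = PACKER_PRIMITIVE }; };
template<typename T> struct selector<T,0> { enum { value = PACKER_GENERAL }; };
template<typename T> struct selector<T,2> { enum { value = PACKER_ARRAY_CP_PRIMITIVE }; };
template<typename T> struct selector<T,3> { enum { value = PACKER_ARRAY_CP_PRIMITIVE }; };

template<typename T>
struct pick
{ enum { value = selector<T, std::is_trivially_copyable<T>::value
                             + 2 * std::is_array<T>::value>::value }; };

static_assert(pick<float>::value    == PACKER_PRIMITIVE, "scalar stays primitive");
static_assert(pick<float[2]>::value == PACKER_ARRAY_CP_PRIMITIVE, "array goes to array packer");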
......@@ -228,7 +228,7 @@ template<unsigned int dim ,typename T> class Point
* \return the norm of the vector
*
*/
__device__ __host__ T norm()
__device__ __host__ T norm() const
{
T n = 0.0;
......
......@@ -14,6 +14,29 @@ using BlockTypeOf = typename std::remove_reference<typename boost::fusion::resul
template<typename AggregateT, unsigned int p>
using ScalarTypeOf = typename std::remove_reference<typename boost::fusion::result_of::at_c<typename AggregateT::type, p>::type>::type::scalarType;
template<typename T>
struct meta_copy_set_bck
{
template<typename destType>
inline static void set(destType & bP ,T & backgroundValue, int j)
{
bP[j] = backgroundValue;
}
};
template<unsigned int N, typename T>
struct meta_copy_set_bck<T[N]>
{
template<typename destType>
inline static void set(destType & bP ,T * backgroundValue, int j)
{
for (int i = 0 ; i < N ; i++)
{
bP[i][j] = backgroundValue[i];
}
}
};
template<typename AggregateBlockT, unsigned int threadBlockSize=128, typename indexT=long int, template<typename> class layout_base=memory_traits_inte>
class BlockMapGpu
{
......@@ -280,19 +303,21 @@ public:
* \tparam p property p
*
*/
template<unsigned int p>
void setBackgroundValue(ScalarTypeOf<AggregateBlockT, p> backgroundValue)
template<unsigned int p, typename TypeBck>
void setBackgroundValue(TypeBck backgroundValue)
{
// NOTE: Here we assume user only passes Blocks and not scalars in the templated aggregate type
typedef BlockTypeOf<AggregateInternalT, p> BlockT;
typedef typename std::remove_all_extents<BlockTypeOf<AggregateInternalT, p>>::type BlockT_noarr;
typedef BlockTypeOf<AggregateInternalT, pMask> BlockM;
BlockT bP;
BlockM bM;
for (unsigned int i = 0; i < BlockT::size; ++i)
for (unsigned int i = 0; i < BlockT_noarr::size; ++i)
{
bP[i] = backgroundValue;
meta_copy_set_bck<TypeBck>::set(bP,backgroundValue,i);
//meta_copy<TypeBck>::meta_copy_(backgroundValue,bP[][i]);
bM[i] = 0;
}
......
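With meta_copy_set_bck, setBackgroundValue can now accept array-valued properties as well as scalars: block storage is laid out as value[component][element], so an array background fills one row per component. A hedged usage sketch of the helper on its own (hypothetical block size of 64; this only exercises the helper added above, not the OpenFPM aggregate layout):

float scalar_block[64];        // one block of a scalar property
float vector_block[2][64];     // one block of a float[2] property
float bck_scalar    = 1.0f;
float bck_vector[2] = {1.0f, 2.0f};

for (int j = 0; j < 64; j++)
{
    meta_copy_set_bck<float>::set(scalar_block, bck_scalar, j);      // bP[j] = v
    meta_copy_set_bck<float[2]>::set(vector_block, bck_vector, j);   // bP[i][j] = v[i]
}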
......@@ -718,6 +718,8 @@ public:
typedef sparse_grid_gpu_index<self> base_key;
typedef indexT indexT_;
typedef decltype(std::declval<BMG>().toKernel().insertBlock(0)) insert_encap;
/*! \brief return the size of the grid
......@@ -1045,6 +1047,7 @@ private:
}
template <typename stencil, typename... Args>
void applyStencilInPlaceNoShared(const Box<dim,int> & box, StencilMode & mode,Args... args)
{
......@@ -1795,6 +1798,11 @@ public:
void setMemory()
{}
auto insertBlockFlush(size_t block) -> decltype(BMG::insertBlockFlush(block))
{
return BMG::insertBlockFlush(block);
}
/*! \brief Return the grid information object
*
* \return grid information object
......@@ -2628,6 +2636,28 @@ public:
applyStencils< SparseGridGpuKernels::stencil_func_conv2_b<dim,nLoop,prop_src1,prop_src2,prop_dst1,prop_dst2,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
}
/*! \brief Apply a free type convolution using blocks
*
*
*/
template<unsigned int prop_src1, unsigned int prop_src2, unsigned int prop_src3,
unsigned int prop_dst1 , unsigned int prop_dst2, unsigned int prop_dst3,
unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
void conv3_b(grid_key_dx<dim> start, grid_key_dx<dim> stop , lambda_f func, ArgsT ... args)
{
Box<dim,int> box;
for (int i = 0 ; i < dim ; i++)
{
box.setLow(i,start.get(i));
box.setHigh(i,stop.get(i));
}
constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value), (blockSize)>::value;
applyStencils< SparseGridGpuKernels::stencil_func_conv3_b<dim,nLoop,prop_src1,prop_src2,prop_src3,prop_dst1,prop_dst2,prop_dst3,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
}
/*! \brief Apply a free type convolution using blocks
*
*
......@@ -2753,12 +2783,37 @@ public:
typedef BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base> BMG;
auto block_data = this->insertBlockFlush(block_id);
auto block_data = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::insertBlockFlush(block_id);
block_data.template get<BMG::pMask>()[local_id] = 1;
return block_data.template get<p>()[local_id];
}
/*! \brief Insert the point on host side and flush directly
*
* First you have to move everything on host with deviceToHost, insertFlush and than move to GPU again
*
* \param grid point where to insert
*
* \return a reference to the data to fill
*
*
*/
template<typename CoordT>
auto insertBlockFlush(const grid_key_dx<dim,CoordT> &coord, indexT & local_id) -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::insertBlockFlush(0))
{
auto lin = gridGeometry.LinId(coord);
indexT block_id = lin / blockSize;
local_id = lin % blockSize;
typedef BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base> BMG;
auto block_data = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::insertBlockFlush(block_id);
block_data.template get<BMG::pMask>()[local_id] = 1;
return block_data;
}
/*! \brief Insert the point on host side and flush directly
*
* First you have to move everything on host with deviceToHost, insertFlush and than move to GPU again
......@@ -2779,7 +2834,7 @@ public:
typedef BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base> BMG;
auto block_data = this->insertBlockFlush(block_id);
auto block_data = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::insertBlockFlush(block_id);
block_data.template get<BMG::pMask>()[local_id] = 1;
return block_data.template get<p>()[local_id];
......@@ -2817,15 +2872,105 @@ public:
*
*/
template<unsigned int p>
void setBackgroundValue(ScalarTypeOf<AggregateBlockT, p> backgroundValue)
void setBackgroundValue(typename boost::mpl::at<typename AggregateT::type,boost::mpl::int_<p>>::type backgroundValue)
{
bck.template get<p>() = backgroundValue;
meta_copy<typename boost::mpl::at<typename AggregateT::type,boost::mpl::int_<p>>::type>::meta_copy_(backgroundValue,bck.template get<p>());
BMG::template setBackgroundValue<p>(backgroundValue);
BMG::template setBackgroundValue<p,typename boost::mpl::at<typename AggregateT::type,boost::mpl::int_<p>>::type>(backgroundValue);
}
/////////////////////////////////// DISTRIBUTED INTERFACE ///////////////////////
//Functions to check if the packing object is complex
static bool pack()
{
return true;
}
//Functions to check if the packing object is complex
static bool packRequest()
{
return true;
}
/*! \brief Asking to pack a SparseGrid GPU without GPU context pack the grid on CPU and host memory
*
*
*/
template<int ... prp> inline
void packRequest(size_t & req) const
{
// To fill
auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
auto & dataBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
indexBuffer.template packRequest<prp ...>(req);
dataBuffer.template packRequest<prp ...>(req);
Packer<decltype(gridGeometry),HeapMemory>::packRequest(req);
}
/*! \brief Pack the object into the memory
*
* \tparam prp properties to pack
*
* \param mem preallocated memory where to pack the objects
*
* \param sts pack statistic
*
*/
template<int ... prp> void pack(ExtPreAlloc<HeapMemory> & mem,
Pack_stat & sts) const
{
auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
auto & dataBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
// To fill
indexBuffer.template pack<prp ...>(mem,sts);
dataBuffer.template pack<prp ...>(mem,sts);
Packer<decltype(gridGeometry),HeapMemory>::pack(mem,gridGeometry,sts);
}
/*! \brief Unpack the object into the memory
*
* \tparam prp properties to pack
*
* \param mem preallocated memory where to pack the objects
*
* \param sts pack statistic
*
*/
template<int ... prp> void unpack(ExtPreAlloc<HeapMemory> & mem,
Unpack_stat & ps)
{
auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
auto & dataBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
// To fill
indexBuffer.template unpack<prp ...>(mem,ps);
dataBuffer.template unpack<prp ...>(mem,ps);
Unpacker<decltype(gridGeometry),HeapMemory>::unpack(mem,gridGeometry,ps);
}
/*! \brief Unpack the object into the memory
*
* \tparam prp properties to pack
*
* \param mem preallocated memory where to pack the objects
*
* \param sts pack statistic
*
*/
template<int ... prp> void unpack(ExtPreAlloc<CudaMemory> & mem,
Unpack_stat & ps)
{
if (mem.size() != 0)
{std::cout << __FILE__ << ":" << __LINE__ << " not implemented: " << std::endl;}
}
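A hedged host-side sketch of how the pack/unpack path added above might be driven. The property list <0,1>, the grid objects and the exact ExtPreAlloc allocation protocol are assumptions based on how other OpenFPM containers are packed; only the packRequest/pack/unpack calls themselves come from the diff:

size_t req = 0;
sgrid_src.template packRequest<0,1>(req);        // measure the required bytes

HeapMemory pmem;
pmem.allocate(req);
ExtPreAlloc<HeapMemory> mem(req, pmem);          // preallocated packing buffer
mem.incRef();

Pack_stat sts;
sgrid_src.template pack<0,1>(mem, sts);          // index buffer + data buffer + geometry

Unpack_stat ps;
sgrid_dst.template unpack<0,1>(mem, ps);         // rebuild into another sparse grid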
/*! \brief memory requested to pack this object
*
* \param req request
......@@ -3253,6 +3398,8 @@ public:
*/
void swap(self & gr)
{
gridGeometry.swap(gr.gridGeometry);
BMG::swap(gr);
}
......@@ -3422,6 +3569,7 @@ public:
result.allocate(sizeof(int));
if (pointers.size())
CUDA_LAUNCH_DIM3((SparseGridGpuKernels::unpack_headers<decltype(std::declval<self>().toKernel())>),1,pointers.size(),
pointers.toKernel(),
headers.toKernel(),
......
......@@ -158,6 +158,19 @@ namespace SparseGridGpuKernels
{
f(res1,res2,cpb1,cpb2,DataBlockLoad,offset,coord[0],coord[1],coord[2]);
}
template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
__device__ static inline void stencil3_block(ScalarT & res1, ScalarT & res2, ScalarT & res3, coordType & coord ,
CpBlockType & cpb1,
CpBlockType & cpb2,
CpBlockType & cpb3,
DataBlockWrapperT & DataBlockLoad,
int offset,
lambda_func f,
ArgsT ... args)
{
f(res1,res2,res3,cpb1,cpb2,cpb3,DataBlockLoad,offset,coord[0],coord[1],coord[2]);
}
};
template<>
......@@ -204,6 +217,19 @@ namespace SparseGridGpuKernels
{
f(res1,res2,cpb1,cpb2,DataBlockLoad,offset,coord[0],coord[1]);
}
template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT>
__device__ static inline void stencil3_block(ScalarT & res1, ScalarT & res2, ScalarT & res3, coordType & coord ,
CpBlockType & cpb1,
CpBlockType & cpb2,
CpBlockType & cpb3,
DataBlockWrapperT & DataBlockLoad,
int offset,
lambda_func f,
ArgsT ... args)
{
f(res1,res2,res3,cpb1,cpb2,cpb3,DataBlockLoad,offset,coord[0],coord[1]);
}
};
template<unsigned int dim, unsigned int n_loop, unsigned int p_src, unsigned int p_dst, unsigned int stencil_size>
......@@ -424,6 +450,86 @@ namespace SparseGridGpuKernels
}
};
template<unsigned int dim, unsigned int n_loop,
unsigned int p_src1, unsigned int p_src2, unsigned int p_src3,
unsigned int p_dst1, unsigned int p_dst2, unsigned int p_dst3,
unsigned int stencil_size>
struct stencil_func_conv3_b
{
typedef NNStar<dim> stencil_type;
static constexpr unsigned int supportRadius = stencil_size;
template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT>
static inline __device__ void stencil(
SparseGridT & sparseGrid,
const unsigned int dataBlockId,
openfpm::sparse_index<unsigned int> dataBlockIdPos,
unsigned int offset,
grid_key_dx<dim, int> & pointCoord,
DataBlockWrapperT & dataBlockLoad,
DataBlockWrapperT & dataBlockStore,
unsigned char curMask,
lambda_func f,
ArgT ... args)
{
typedef typename SparseGridT::AggregateBlockType AggregateT;
typedef ScalarTypeOf<AggregateT, p_src1> ScalarT1;
typedef ScalarTypeOf<AggregateT, p_src1> ScalarT2;
typedef ScalarTypeOf<AggregateT, p_src1> ScalarT3;
constexpr unsigned int enlargedBlockSize = IntPow<
SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
__shared__ ScalarT1 enlargedBlock1[enlargedBlockSize];
__shared__ ScalarT2 enlargedBlock2[enlargedBlockSize];
__shared__ ScalarT3 enlargedBlock3[enlargedBlockSize];
// fill with background
typedef typename vmpl_create_constant<dim,SparseGridT::blockEdgeSize_>::type block_sizes;
typedef typename vmpl_sum_constant<2*stencil_size,block_sizes>::type vmpl_sizes;
cp_block<ScalarT1,stencil_size,vmpl_sizes,dim> cpb1(enlargedBlock1);
cp_block<ScalarT2,stencil_size,vmpl_sizes,dim> cpb2(enlargedBlock2);
cp_block<ScalarT3,stencil_size,vmpl_sizes,dim> cpb3(enlargedBlock3);
sparseGrid.template loadGhostBlock<p_src1>(dataBlockLoad, dataBlockIdPos, enlargedBlock1);
sparseGrid.template loadGhostBlock<p_src2>(dataBlockLoad, dataBlockIdPos, enlargedBlock2);
sparseGrid.template loadGhostBlock<p_src3>(dataBlockLoad, dataBlockIdPos, enlargedBlock3);
__syncthreads();
ScalarT1 res1 = 0;
ScalarT2 res2 = 0;
ScalarT3 res3 = 0;
if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING))
{
int coord[dim];
unsigned int linIdTmp = offset;
for (unsigned int d = 0; d < dim; ++d)
{
coord[d] = linIdTmp % SparseGridT::blockEdgeSize_;
linIdTmp /= SparseGridT::blockEdgeSize_;
}
stencil_conv_func_impl<dim>::stencil3_block(res1,res2,res3,coord,cpb1,cpb2,cpb3,dataBlockLoad,offset,f,args...);
dataBlockStore.template get<p_dst1>()[offset] = res1;
dataBlockStore.template get<p_dst2>()[offset] = res2;
dataBlockStore.template get<p_dst3>()[offset] = res3;
}
}
template <typename SparseGridT, typename CtxT>
static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx)
{
// No flush
}
};
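Together with the conv3_b entry point added earlier in SparseGridGpu, the stencil above applies a user lambda to three source properties at once; the lambda receives three result references, three ghost-padded block views, the loaded data block and the per-thread offset/coordinates, in the order of the stencil3_block call. A hedged sketch of what a caller might look like (the property indices, float scalar type and the cpb(i,j,k) access are assumptions, not taken from the diff):

// Hypothetical 3D example: copy source properties 0,1,2 into destinations 3,4,5
// with a stencil radius of 1.
auto three_field_lambda = [] __device__ (float & res1, float & res2, float & res3,
                                         auto & cpb1, auto & cpb2, auto & cpb3,
                                         auto & block, int offset, int i, int j, int k)
{
    // Assumption: the cp_block views expose operator()(i,j,k) over the padded block.
    res1 = cpb1(i,j,k);
    res2 = cpb2(i,j,k);
    res3 = cpb3(i,j,k);
};

// grid.template conv3_b<0,1,2,3,4,5,1>(start, stop, three_field_lambda);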
template<unsigned int dim, unsigned int n_loop, unsigned int p_src1, unsigned int p_src2, unsigned int p_dst1, unsigned int p_dst2, unsigned int stencil_size>
struct stencil_func_conv2
{
......@@ -966,7 +1072,7 @@ namespace SparseGridGpuKernels
auto dataBlockLoad = dataBuffer.get(dataBlockPos); // Avoid binary searches as much as possible
// todo: Add management of RED-BLACK stencil application! :)
const unsigned int dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
const auto dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
grid_key_dx<dim, int> pointCoord = sparseGrid.getCoord(dataBlockId * blockSize + offset);
unsigned char curMask;
......
......@@ -172,6 +172,11 @@ namespace openfpm
return v_size;
}
__host__ __device__ size_t size_local() const
{
return size();
}
/*! \brief return the maximum capacity of the vector before reallocation
*
* \return the capacity of the vector
......@@ -204,6 +209,38 @@ namespace openfpm
return base.template get<p>(key);
}
/*! \brief Get an element of the vector
*
* Get an element of the vector
*
* \tparam p Property to get
* \param id Element to get
*
* \return the element value requested
*
*/
template <unsigned int p>
__device__ __host__ inline auto getProp(unsigned int id) const -> decltype(base.template get<p>(grid_key_dx<1>(0)))
{
return this->get<p>(id);
}
/*! \brief Get an element of the vector
*
* Get an element of the vector
*
* \tparam p Property to get
* \param id Element to get
*
* \return the element value requested
*
*/
template <unsigned int p, typename key_type>
__device__ __host__ inline auto getProp(key_type id) const -> decltype(base.template get<p>(grid_key_dx<1>(0)))
{
return this->get<p>(id.getKey());
}
/*! \brief Get an element of the vector
*
......@@ -488,7 +525,22 @@ namespace openfpm
return base.getGPUIterator(start,stop,n_thr);
}
/*! \brief Get a domain iterator for the GPU
*
*
*/
ite_gpu<1> getDomainIteratorGPU(size_t n_thr = default_kernel_wg_threads_) const
{
return getGPUIterator(n_thr);
}
//Stub for some expression
void init() const {}
__host__ __device__ auto value(unsigned int p) -> decltype(base.template get<0>(grid_key_dx<1>(0)))
{
return get<0>(p);
}
/*! \brief Get an iterator for the GPU
*
*
......@@ -501,6 +553,7 @@ namespace openfpm
return base.getGPUIterator(start,stop_,n_thr);
}
/*! \brief operator= this operator absorb the pointers, consider that this object wrap device pointers
*
* \param object to copy
......@@ -514,6 +567,16 @@ namespace openfpm
return *this;
}
__device__ __host__ vector_gpu_ker<T,layout_base> & getVector()
{
return *this;
}
__device__ __host__ const vector_gpu_ker<T,layout_base> & getVector() const
{
return *this;
}
/*! \brief Return the base
*
* \return the base
......@@ -596,6 +659,11 @@ namespace openfpm
return vref.size();
}
__host__ __device__ size_t size_local() const
{
return size();
}
__device__ __host__ unsigned int capacity() const
{
return vref.capacity;
......@@ -692,6 +760,16 @@ namespace openfpm
return vref.getGPUItertatorTo(stop,n_thr);
}
vector_gpu_ker<T,layout_base> & getVector()
{
return *this;
}
const vector_gpu_ker<T,layout_base> & getVector() const
{
return *this;
}
__host__ vector_gpu_ker_ref<T,layout_base> & operator=(const vector_gpu_ker<T,layout_base> & v)
{
vref.operator=(v);
......
......@@ -336,7 +336,7 @@ namespace openfpm
*
* \return the size
*
*/
*/ //remove host device
size_t size_local() const
{
return v_size;
......@@ -1359,7 +1359,21 @@ namespace openfpm
return base.get_o(key);
}
/*! \brief Get an element of the vector
*
* Get an element of the vector
*
* \tparam p Property to get
* \param id Element to get
*
* \return the element value requested
*
*/ //remove device host
template <unsigned int p>
inline auto getProp(const unsigned int & id) -> decltype(base.template get<p>(grid_key_dx<1>(0)))
{ //uncomment this
return this->template get<p>(id);
}
/*! \brief Get an element of the vector
*
* Get an element of the vector
......@@ -1369,12 +1383,12 @@ namespace openfpm
*
* \return the element value requested
*
*/
template <unsigned int p,typename KeyType>
inline auto getProp(const KeyType & id) -> decltype(base.template get<p>(grid_key_dx<1>(0)))
*///remove host device
template <unsigned int p,typename KeyType>
inline auto getProp(const KeyType & id) -> decltype(base.template get<p>(grid_key_dx<1>(0)))
{
return this->template get<p>(id.getKey());
//uncomment this
return this->template get<p>(id.getKey());
}
/*! \brief Get an element of the vector
......@@ -1386,10 +1400,10 @@ namespace openfpm
*
* \return the element value requested
*
*/
*/ //remove device host
template <unsigned int p, typename keyType>
inline auto getProp(const keyType & id) const -> decltype(base.template get<p>(grid_key_dx<1>(0)))
{
inline auto getProp(const keyType & id) const -> decltype(base.template get<p>(grid_key_dx<1>(0)))
{ //uncomment this
return this->template get<p>(id.getKey());
}
......@@ -1930,6 +1944,7 @@ namespace openfpm
return base.getGPUIterator(start,stop_,n_thr);
}
#endif
/*! \brief Get the vector elements iterator
......@@ -1987,6 +2002,14 @@ namespace openfpm
return base.getGPUIterator(start,stop,n_thr);
}
/*! \brief Get a domain iterator for the GPU
*
*
*/
ite_gpu<1> getDomainIteratorGPU(size_t n_thr = default_kernel_wg_threads_) const
{
return getGPUIterator(n_thr);
}
#endif
/*! \brief Return the size of the message needed to pack this object
......@@ -2071,6 +2094,19 @@ namespace openfpm
return 1;
}
/*! \brief Return the memory object
*
* Return the memory object
*
* \tparam p array to retrieve
*
*/
template<unsigned int p>
auto getMemory() -> decltype(base.template getMemory<p>())
{
return base.template getMemory<p>();
}
/*! \brief Set the memory of the base structure using an object
*
* \param mem Memory object to use for allocation
......
......@@ -1016,6 +1016,18 @@ public:
return base_type::get(id);
}
/*! \brief Get an element of the vector
*
* \param id element to get
*
* \return the element reference
*
*/
inline T & operator[](size_t id)
{
return base_type::get(id);
}
/*! \brief Get an element of the vector
*
* \param id element to get
......
......@@ -339,7 +339,7 @@ template<int ... prp> inline void pack(ExtPreAlloc<HeapMemory> & mem, Pack_stat
* \param mem preallocated memory from where to unpack the vector
* \param ps unpack-stat info
*/
template<int ... prp> inline void unpack(ExtPreAlloc<HeapMemory> & mem, Unpack_stat & ps)
template<int ... prp, typename MemType> inline void unpack(ExtPreAlloc<MemType> & mem, Unpack_stat & ps)
{
//if all of the aggregate properties are simple (don't have "pack()" member)
if (has_pack_agg<T,prp...>::result::value == false)
......@@ -353,7 +353,7 @@ template<int ... prp> inline void unpack(ExtPreAlloc<HeapMemory> & mem, Unpack_s
{
//Unpack a size of a source vector
size_t u2 = 0;
Unpacker<size_t, HeapMemory>::unpack(mem,u2,ps);
Unpacker<size_t, MemType>::unpack(mem,u2,ps);
//Resize a destination vector
this->resize(u2);
......@@ -361,7 +361,7 @@ template<int ... prp> inline void unpack(ExtPreAlloc<HeapMemory> & mem, Unpack_s
for (size_t i = 0 ; i < this->size() ; i++)
{
//Call an unpacker in nested way
call_aggregateUnpack<decltype(this->get(i)),HeapMemory,prp ... >::call_unpack(this->get(i),mem,ps);
call_aggregateUnpack<decltype(this->get(i)),MemType,prp ... >::call_unpack(this->get(i),mem,ps);
}
}
}
......
......@@ -24,11 +24,14 @@
#define __CUDACC__
#define __CUDA__
#else
#include "util/cuda/moderngpu/kernel_merge.hxx"
#include <thrust/merge.h>
#include <thrust/execution_policy.h>
#endif
#endif
#else
#include "util/cuda/moderngpu/kernel_merge.hxx"
#include <thrust/merge.h>
#include <thrust/execution_policy.h>
// #include "util/cuda/moderngpu/kernel_merge.hxx"
#endif
#include "util/cuda/ofp_context.hxx"
......@@ -98,7 +101,14 @@
#else
mgpu::merge(a_keys,a_vals,a_count,b_keys,b_vals,b_count,c_keys,c_vals,comp,context);
// It seems broken on some CUDA on some hardware. Anyway is not anymore supported
// on some hardware ... we move to thrust
// mgpu::merge(a_keys,a_vals,a_count,b_keys,b_vals,b_count,c_keys,c_vals,comp,context);
thrust::merge_by_key(thrust::device, a_keys,a_keys + a_count,
b_keys,b_keys + b_count,
a_vals,b_vals,
c_keys,c_vals,comp);
#endif
......
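The moderngpu mgpu::merge call is replaced with thrust::merge_by_key, since moderngpu is no longer supported on newer hardware and reportedly misbehaves on some CUDA versions. A self-contained host-side illustration of the same call (the real code runs with thrust::device on raw device pointers):

#include <thrust/merge.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>

int main()
{
    int a_keys[3] = {1, 3, 5};   int a_vals[3] = {10, 30, 50};
    int b_keys[2] = {2, 4};      int b_vals[2] = {20, 40};
    int c_keys[5];               int c_vals[5];

    thrust::merge_by_key(thrust::host,
                         a_keys, a_keys + 3,
                         b_keys, b_keys + 2,
                         a_vals, b_vals,
                         c_keys, c_vals,
                         thrust::less<int>());
    // c_keys = {1,2,3,4,5}, c_vals = {10,20,30,40,50}
    return 0;
}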
......@@ -27,7 +27,8 @@
#else
// Here we have old CUDA
#include "cub_old/cub.cuh"
#include "util/cuda/moderngpu/kernel_reduce.hxx"
//#include "util/cuda/moderngpu/kernel_reduce.hxx"
#define REDUCE_WITH_CUB
#endif
#include "util/cuda/ofp_context.hxx"
......@@ -98,4 +99,4 @@ namespace openfpm
#endif
#endif /* REDUCE_OFP_HPP_ */
\ No newline at end of file
#endif /* REDUCE_OFP_HPP_ */