diff --git a/src/Grid/grid_base_implementation.hpp b/src/Grid/grid_base_implementation.hpp
index f9ca655e2b156cf9254856847ae0a471a614a94c..2b4e85d69bfeb294d72676b6125c13e8dab9baa8 100644
--- a/src/Grid/grid_base_implementation.hpp
+++ b/src/Grid/grid_base_implementation.hpp
@@ -1091,28 +1091,6 @@ public:
 			     const Box<dim,long int> & box_src,
 				 const Box<dim,long int> & box_dst)
 	{
-
-
-		// sub-grid where to unpack
-/*		grid_key_dx_iterator_sub<dim> src(grid_src.getGrid(),box_src.getKP1(),box_src.getKP2());
-		grid_key_dx_iterator_sub<dim> dst(getGrid(),box_dst.getKP1(),box_dst.getKP2());
-
-		while (src.isNext())
-		{
-			auto key_src = src.get();
-			auto key_dst = dst.get();
-
-			get_o(key_dst) = grid_src.get_o(key_src);
-
-			++src;
-			++dst;
-		}*/
-
-		///////////////////////////////////////
-
-//        typedef typename std::remove_reference<decltype(grid_src)>::type grid_cp;
-//        typedef typename std::remove_reference<decltype(grid_src.getGrid())>::type grid_info_cp;
-
 		// fix box_dst
 
 	     Box<dim,size_t> box_src_;
@@ -1226,21 +1204,6 @@ public:
 			++sub_src;
 			++sub_dst;
 		}
-
-/*		grid_key_dx_iterator_sub<dim> sub_src(grid_src.getGrid(),box_src.getKP1(),box_src.getKP2());
-		grid_key_dx_iterator_sub<dim> sub_dst(this->getGrid(),box_dst.getKP1(),box_dst.getKP2());
-
-//		const auto & gs = loc_grid.get(i);
-//		auto & gd = loc_grid.get(sub_id_dst);
-
-		while (sub_src.isNext())
-		{
-			// write the object in the last element
-			object_s_di_op<op,decltype(grid_src.get_o(sub_src.get())),decltype(this->get_o(sub_dst.get())),OBJ_ENCAP,prp...>(grid_src.get_o(sub_src.get()),this->get_o(sub_dst.get()));
-
-			++sub_src;
-			++sub_dst;
-		}*/
 	}
 
 	/*! \brief Resize the grid
diff --git a/src/Grid/grid_common.hpp b/src/Grid/grid_common.hpp
index 8bfc8a56dd7de59ad08d3b76d982e8638b59b351..d3a390247b434d2d1253a483cefb9925bbe61da6 100644
--- a/src/Grid/grid_common.hpp
+++ b/src/Grid/grid_common.hpp
@@ -11,6 +11,33 @@
 #include <type_traits>
 #include "util/tokernel_transformation.hpp"
 
+/*! \brief this class is a functor for "for_each" algorithm
+ *
+ * This class is a functor for "for_each" algorithm. For each
+ * element of the boost::vector the operator() is called.
+ * It is mainly used to call setBackgroundValue for each property
+ *
+ */
+template<typename aggrT_src, typename local_grids_type>
+struct setBackground_impl
+{
+	aggrT_src & bck;
+
+	local_grids_type loc_grid;
+
+	inline setBackground_impl(aggrT_src & bck, local_grids_type & loc_grid)
+	:bck(bck),loc_grid(loc_grid)
+	{};
+
+	//! It calls setBackgroundValue for the current property on every local grid
+	template<typename T>
+	inline void operator()(T& t)
+	{
+		for (size_t i = 0 ; i < loc_grid.size() ; i++)
+		{loc_grid.get(i).template setBackgroundValue<T::value>(bck.template get<T::value>());}
+	}
+};
+
 /*! \brief this class is a functor for "for_each" algorithm
  *
  * This class is a functor for "for_each" algorithm. For each
diff --git a/src/Grid/grid_key.hpp b/src/Grid/grid_key.hpp
index 789f8c5938ee4545a06b90b4914545d14f639fad..c24376add69c810acff3a948e9f290de18e766e5 100644
--- a/src/Grid/grid_key.hpp
+++ b/src/Grid/grid_key.hpp
@@ -445,9 +445,10 @@ public:
 	 * \return a point unsigned long int
 	 *
 	 */
-	Point<dim,size_t> toPoint() const
+	template<typename typeT = size_t>
+	inline Point<dim,typeT> toPoint() const
 	{
-		Point<dim,size_t> p;
+		Point<dim,typeT> p;
 
 		for (size_t i = 0; i < dim ; i++)
 		{
@@ -478,7 +479,7 @@ public:
 	 * \return the index value
 	 *
 	 */
-	__device__ __host__  mem_id get(size_t i) const
+	__device__ __host__  index_type get(index_type i) const
 	{
 		return k[i];
 	}
@@ -491,7 +492,7 @@ public:
 	 * \param id value to set
 	 *
 	 */
-	__device__ __host__   void set_d(size_t i, mem_id id)
+	__device__ __host__   void set_d(index_type i, index_type id)
 	{
 #if defined(SE_CLASS1) && !defined(__NVCC__)
 
diff --git a/src/Grid/grid_sm.hpp b/src/Grid/grid_sm.hpp
index c888410d128df097749e0c1d37ae4fbe5c3fd5c8..881c6f7ece957a6c8181f7589833fcc0b91d244a 100755
--- a/src/Grid/grid_sm.hpp
+++ b/src/Grid/grid_sm.hpp
@@ -859,10 +859,10 @@ ite_gpu<dim> getGPUIterator_impl(const grid_sm<dim,T2> & g1, const grid_key_dx<d
 	{ig.thr.x = (key2.get(0) - key1.get(0) + 1);}
 
 	if (dim >= 2 && ig.wthr.y == 1)
-	{ig.wthr.y = key2.get(1) - key1.get(1) + 1;}
+	{ig.thr.y = key2.get(1) - key1.get(1) + 1;}
 
 	if (dim == 3 && ig.wthr.z == 1)
-	{ig.wthr.z = key2.get(2) - key1.get(2) + 1;}
+	{ig.thr.z = key2.get(2) - key1.get(2) + 1;}
 
 	for (size_t i = 0 ; i < dim ; i++)
 	{
diff --git a/src/Grid/iterators/grid_key_dx_iterator_sub.hpp b/src/Grid/iterators/grid_key_dx_iterator_sub.hpp
index 4d59803a23da3b5725a8a430ae62976cd44008a5..aeabf55097c7bc8486b78857d377422b820ffaf9 100644
--- a/src/Grid/iterators/grid_key_dx_iterator_sub.hpp
+++ b/src/Grid/iterators/grid_key_dx_iterator_sub.hpp
@@ -484,7 +484,7 @@ public:
 	{
 #ifdef SE_CLASS1
 		if (initialized == false)
-		{std::cerr << "Error: " << __FILE__ << __LINE__ << " using unitialized iterator" << "\n";}
+		{std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " using uninitialized iterator" << "\n";}
 #endif
 
 		for (size_t i = 0 ; i < nsteps ; i++)
@@ -506,7 +506,7 @@ public:
 	{
 #ifdef SE_CLASS1
 		if (initialized == false)
-		{std::cerr << "Error: " << __FILE__ << __LINE__ << " using unitialized iterator" << "\n";}
+		{std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " using uninitialized iterator" << "\n";}
 #endif
 
 		if (this->gk.get(dim-1) <= gk_stop.get(dim-1))
@@ -643,6 +643,17 @@ public:
 	{
 		this->stl_code.private_adjust(tot_add);
 	}
+
+	/*! \brief Set the iterator in a way that isNext() returns false
+	 *
+	 */
+	void invalidate()
+	{
+		this->gk.set_d(dim-1,1);
+		this->gk_stop.set_d(dim-1,0);
+
+		initialized = true;
+	}
 };
 
 
diff --git a/src/Grid/map_grid.hpp b/src/Grid/map_grid.hpp
index a3b0e4e535f6c9e662fb36d3d95e7776a1552ce8..e756767914387a6d456fe281fa55b74aeb650e38 100755
--- a/src/Grid/map_grid.hpp
+++ b/src/Grid/map_grid.hpp
@@ -337,6 +337,18 @@ public:
 		return background;
 	}
 
+	/*! \brief Set the background value
+	 *
+	 * \tparam p property to set
+	 *
+	 */
+	template<unsigned int p>
+	void setBackgroundValue(const typename boost::mpl::at<typename T::type,boost::mpl::int_<p>>::type & val)
+	{
+		background.template get<p>() = val;
+	}
+
+
 	/*! \brief assign operator
 	 *
 	 * \return itself
diff --git a/src/SparseGrid/SparseGrid.hpp b/src/SparseGrid/SparseGrid.hpp
index 8780186473f6ee6b0ecc8d5050f3678d74a017f5..cc3572c52c4322ec39be91cfb7b1c4eed8bba2a4 100644
--- a/src/SparseGrid/SparseGrid.hpp
+++ b/src/SparseGrid/SparseGrid.hpp
@@ -586,7 +586,7 @@ public:
 	template<unsigned int p>
 	void setBackgroundValue(const typename boost::mpl::at<typename T::type,boost::mpl::int_<p>>::type & val)
 	{
-		return background.template get<p>() = val;
+		background.template get<p>() = val;
 	}
 
 	/*! \brief Get the background value
diff --git a/src/SparseGridGpu/BlockMapGpu.hpp b/src/SparseGridGpu/BlockMapGpu.hpp
index 706f378a89a606a5362b742beb45050601e97c3f..e8507ae0c199012c52db6b71e4b5f3acfdf78dbe 100644
--- a/src/SparseGridGpu/BlockMapGpu.hpp
+++ b/src/SparseGridGpu/BlockMapGpu.hpp
@@ -30,6 +30,16 @@ class BlockMapGpu
 private:
     typedef BlockTypeOf<AggregateBlockT, 0> BlockT0;
 
+#ifdef SE_CLASS1
+
+    //! Indicate if the setGPUInsertBuffer has been called
+    bool is_setGPUInsertBuffer = false;
+
+    //! Indicate if the initializeGPUInsertBuffer has been called
+    bool is_initializeGPUInsertBuffer = false;
+
+#endif
+
 protected:
     const static unsigned char EXIST_BIT = 0;
     typedef typename AggregateAppend<DataBlock<unsigned char, BlockT0::size>, AggregateBlockT>::type AggregateInternalT;
@@ -42,7 +52,6 @@ protected:
 public:
     typedef AggregateBlockT AggregateType;
 
-public:
     BlockMapGpu() = default;
 
 	/*! \brief Get the background value
@@ -133,12 +142,49 @@ public:
 
     void hostToDevice();
 
-    void setGPUInsertBuffer(int nBlock, int nSlot);
+    /*! \brief Before inserting any element you have to call this function to initialize the insert buffer
+     *
+     * \param nBlock number of blocks the insert buffer has
+     * \param nSlot maximum number of insertions each thread block does
+     *
+     */
+    void setGPUInsertBuffer(int nBlock, int nSlot)
+    {
+        // Prealloc the insert buffer on the underlying sparse vector
+        blockMap.setGPUInsertBuffer(nBlock, nSlot);
+        initializeGPUInsertBuffer();
 
-    void initializeGPUInsertBuffer();
+#ifdef SE_CLASS1
+        is_setGPUInsertBuffer = true;
+#endif
+    }
+
+    void initializeGPUInsertBuffer()
+    {
+        //todo: Test if it's enough to just initialize masks to 0, without any background value
+        // Initialize the blocks to background
+        auto & insertBuffer = blockMap.getGPUInsertBuffer();
+        typedef BlockTypeOf<AggregateInternalT, pMask> BlockType; // Here assuming that all block types in the aggregate have the same size!
+        constexpr unsigned int chunksPerBlock = threadBlockSize / BlockType::size; // Floor is good here...
+        BlockMapGpuKernels::initializeInsertBuffer<pMask, chunksPerBlock> <<< insertBuffer.size()/chunksPerBlock, chunksPerBlock*BlockType::size >>>(
+                insertBuffer.toKernel());
+
+    #ifdef SE_CLASS1
+            is_initializeGPUInsertBuffer = true;
+    #endif
+    }
 
     template<typename ... v_reduce>
-    void flush(mgpu::ofp_context_t &context, flush_type opt = FLUSH_ON_HOST);
+    void flush(mgpu::ofp_context_t &context, flush_type opt = FLUSH_ON_HOST)
+    {
+#ifdef SE_CLASS1
+
+    	if (is_setGPUInsertBuffer == false || is_initializeGPUInsertBuffer == false)
+    	{std::cout << __FILE__ << ":" << __LINE__ << " error: you must call setGPUInsertBuffer before doing any insertion " << std::endl;}
+#endif
+
+        blockMap.template flush<v_reduce ... >(context, opt);
+    }
 
     template<unsigned int p>
     void setBackgroundValue(ScalarTypeOf<AggregateBlockT, p> backgroundValue);
@@ -234,33 +280,6 @@ void BlockMapGpu<AggregateBlockT, threadBlockSize, indexT, layout_base>::hostToD
     blockMap.template hostToDevice<pMask>();
 }
 
-template<typename AggregateBlockT, unsigned int threadBlockSize, typename indexT, template<typename> class layout_base>
-void BlockMapGpu<AggregateBlockT, threadBlockSize, indexT, layout_base>::setGPUInsertBuffer(int nBlock, int nSlot)
-{
-    // Prealloc the insert buffer on the underlying sparse vector
-    blockMap.setGPUInsertBuffer(nBlock, nSlot);
-    initializeGPUInsertBuffer();
-}
-
-template<typename AggregateBlockT, unsigned int threadBlockSize, typename indexT, template<typename> class layout_base>
-void BlockMapGpu<AggregateBlockT, threadBlockSize, indexT, layout_base>::initializeGPUInsertBuffer()
-{
-    //todo: Test if it's enough to just initialize masks to 0, without any background value
-    // Initialize the blocks to background
-    auto & insertBuffer = blockMap.getGPUInsertBuffer();
-    typedef BlockTypeOf<AggregateInternalT, pMask> BlockType; // Here assuming that all block types in the aggregate have the same size!
-    constexpr unsigned int chunksPerBlock = threadBlockSize / BlockType::size; // Floor is good here...
-    BlockMapGpuKernels::initializeInsertBuffer<pMask, chunksPerBlock> <<< insertBuffer.size()/chunksPerBlock, chunksPerBlock*BlockType::size >>>(
-            insertBuffer.toKernel());
-}
-
-template<typename AggregateBlockT, unsigned int threadBlockSize, typename indexT, template<typename> class layout_base>
-template<typename ... v_reduce>
-void BlockMapGpu<AggregateBlockT, threadBlockSize, indexT, layout_base>::flush(mgpu::ofp_context_t &context, flush_type opt)
-{
-    blockMap.template flush<v_reduce .../*, sBitwiseOr_<pMask>*/>(context, opt);
-}
-
 template<typename AggregateBlockT, unsigned int threadBlockSize, typename indexT, template<typename> class layout_base>
 template<unsigned int p>
 void BlockMapGpu<AggregateBlockT, threadBlockSize, indexT, layout_base>::setBackgroundValue(
diff --git a/src/SparseGridGpu/Iterators/SparseGridGpu_iterator.hpp b/src/SparseGridGpu/Iterators/SparseGridGpu_iterator.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..329ac5830baafffcf213410e2699bc4b775a0445
--- /dev/null
+++ b/src/SparseGridGpu/Iterators/SparseGridGpu_iterator.hpp
@@ -0,0 +1,201 @@
+/*
+ * SparseGridGpu_iterator.hpp
+ *
+ *  Created on: Sep 4, 2019
+ *      Author: i-bird
+ */
+
+#ifndef SPARSEGRIDGPU_ITERATOR_HPP_
+#define SPARSEGRIDGPU_ITERATOR_HPP_
+
+/*! \brief Element index contains a data chunk index and a point index
+ *
+ * \tparam SparseGridGpu type
+ *
+ */
+template<typename SparseGridGpu_type>
+class sparse_grid_gpu_index
+{
+	//! chunk position id
+	int cnk_pos_id;
+
+	//! data id
+	int data_id;
+
+	//! SparseGridGpu used to add functionalities
+	const SparseGridGpu_type & sparseGrid;
+
+public:
+
+	/*! \brief Constructor from SparseGridGpu
+	 *
+	 *
+	 */
+	inline sparse_grid_gpu_index(const SparseGridGpu_type & sparseGrid, int cnk_pos_id, int data_id)
+	:sparseGrid(sparseGrid),cnk_pos_id(cnk_pos_id),data_id(data_id)
+	{}
+
+	/*! \brief Convert to a point this index
+	 *
+	 * \see toPointS
+	 *
+	 * \return a point unsigned long int
+	 *
+	 */
+	inline Point<SparseGridGpu_type::dims,size_t> toPoint() const
+	{
+		auto indexCnk = sparseGrid.private_get_index_array().template get<0>(cnk_pos_id);
+
+		auto coord = sparseGrid.getCoord(indexCnk*sparseGrid.getBlockSize() + data_id);
+
+		Point<SparseGridGpu_type::dims,size_t> p;
+
+		for (size_t i = 0; i < SparseGridGpu_type::dims ; i++)
+		{
+			p.get(i) = coord.get(i);
+		}
+
+		return p;
+	}
+
+	/*! \brief Get chunk position id
+	 *
+	 * Return the position of the chunk in the chunks array \see SparseGridGpu \see private_get_data_array
+	 *
+	 * \return Get chunk position id
+	 *
+	 */
+	int get_cnk_pos_id() const
+	{
+		return cnk_pos_id;
+	}
+
+	/*! \brief Get chunk local index (the returned index < getBlockSize())
+	 *
+	 * \return the data id (local index of the point inside the chunk)
+	 *
+	 */
+	int get_data_id() const
+	{
+		return data_id;
+	}
+};
+
+template<unsigned int dim, typename SparseGridType>
+class SparseGridGpu_iterator
+{
+	//! actual chunk
+	unsigned int chunk;
+
+	//! actual point inside the chunk
+	unsigned int pnt;
+
+	grid_key_dx<dim> a_cnk;
+
+	//! original SparseGrid
+	const SparseGridType & sparseGrid;
+
+	//! array type for the indexes
+	typedef typename std::remove_reference<decltype(sparseGrid.private_get_index_array())>::type index_array_type;
+
+	//! array type for the data
+	typedef typename std::remove_reference<decltype(sparseGrid.private_get_data_array())>::type data_array_type;
+
+	//! vector of the chunk indexes
+	const decltype(sparseGrid.private_get_index_array()) & ids;
+
+	//! vector containing each chunks datas
+	const decltype(sparseGrid.private_get_data_array()) & data;
+
+	//Get the chunk type
+	typedef typename boost::mpl::at<typename data_array_type::value_type::type,boost::mpl::int_<0>>::type chunk_type;
+
+	//Get the chunk type
+	typedef boost::mpl::int_<boost::mpl::size<typename data_array_type::value_type::type>::type::value-1> pMask;
+
+	//! Select the first valid point chunk
+	void SelectValid()
+	{
+		while (pnt < chunk_type::size && data.template get<pMask::value>(chunk)[pnt] == 0)
+		{
+			pnt++;
+		}
+
+		while (pnt == chunk_type::size && chunk < ids.size())
+		{
+			chunk++;
+			pnt = 0;
+			while (pnt < chunk_type::size && data.template get<pMask::value>(chunk)[pnt] == 0)
+			{
+				pnt++;
+			}
+		}
+	}
+
+public:
+
+	/*! \brief Constructor
+	 *
+	 * \param ids vector of ids
+	 * \param data vector of chunk data
+	 *
+	 */
+	inline SparseGridGpu_iterator(const SparseGridType & sparseGrid)
+	:chunk(0),
+	 pnt(0),
+	 sparseGrid(sparseGrid),
+	 ids(sparseGrid.private_get_index_array()),
+	 data(sparseGrid.private_get_data_array())
+	{
+		SelectValid();
+	}
+
+	/*! \brief Check if there is the next element
+	 *
+	 * Check if there is the next element
+	 *
+	 * \return true if there is the next, false otherwise
+	 *
+	 */
+	bool isNext() const
+	{
+		return chunk < ids.size();
+	}
+
+	/*! \brief Get the next element
+	 *
+	 * Get the next element
+	 *
+	 * \return the next grid_key
+	 *
+	 */
+	inline SparseGridGpu_iterator<dim,SparseGridType> & operator++()
+	{
+		++pnt;
+
+		if (pnt >= chunk_type::size)
+		{
+			++chunk;
+			pnt = 0;
+		}
+
+		SelectValid();
+
+		return *this;
+	}
+
+	/*! \brief return the actual point
+	 *
+	 * \return the index of the actual point
+	 *
+	 */
+	inline sparse_grid_gpu_index<SparseGridType> get()
+	{
+		sparse_grid_gpu_index<SparseGridType> spgi(sparseGrid,chunk,pnt);
+
+		return spgi;
+	}
+};
+
+
+#endif /* SPARSEGRIDGPU_ITERATOR_HPP_ */
diff --git a/src/SparseGridGpu/Iterators/SparseGridGpu_iterator_sub.hpp b/src/SparseGridGpu/Iterators/SparseGridGpu_iterator_sub.hpp
index a78491b4760e8f37221bc07d62ba638be67a8df7..6c17a3b06a97ba9e2eae6d42bea0b979cae542af 100644
--- a/src/SparseGridGpu/Iterators/SparseGridGpu_iterator_sub.hpp
+++ b/src/SparseGridGpu/Iterators/SparseGridGpu_iterator_sub.hpp
@@ -8,39 +8,220 @@
 #ifndef SPARSEGRIDGPU_ITERATOR_SUB_HPP_
 #define SPARSEGRIDGPU_ITERATOR_SUB_HPP_
 
-template<unsigned int dim/*, typename blockMap_type*/>
+#include "SparseGridGpu_iterator.hpp"
+
+template<unsigned int dim, typename SparseGridType>
 class SparseGridGpu_iterator_sub
 {
+	//! actual chunk
 	unsigned int chunk;
-	unsigned int pnt;
 
-	Box<dim,size_t> sub;
+	//! original SparseGrid
+	const SparseGridType * sparseGrid;
+
+	//! chunk coordinates
+	grid_key_dx<dim,int> chunk_coord;
+
+	//! array type for the indexes
+	typedef typename std::remove_reference<decltype(sparseGrid->private_get_index_array())>::type index_array_type;
+
+	//! array type for the data
+	typedef typename std::remove_reference<decltype(sparseGrid->private_get_data_array())>::type data_array_type;
+
+	//! vector of the chunk indexes
+	const typename std::remove_reference<decltype(sparseGrid->private_get_index_array())>::type * ids;
+
+	//! vector containing each chunks datas
+	const typename std::remove_reference<decltype(sparseGrid->private_get_data_array())>::type * data;
+
+	//Get the chunk type
+	typedef typename boost::mpl::at<typename data_array_type::value_type::type,boost::mpl::int_<0>>::type chunk_type;
+
+	//Get the chunk type
+	typedef boost::mpl::int_<boost::mpl::size<typename data_array_type::value_type::type>::type::value-1> pMask;
+
+	// subset in grid coordinates
+	Box<dim,int> sub_set;
+
+	// subset in local chunk coordinates
+	Box<dim,int> res;
+
+	// in chunk iterator
+	grid_key_dx_iterator_sub<dim> in_chunk_it;
+
+	// chunk size
+	grid_sm<dim,void> chunk_sz;
+
+	/*! \brief initialize the chunk iterator
+	 *
+	 *
+	 */
+	void initialize_chunk_it()
+	{
+		// compute if the chunk intersect the start - stop box
+		chunk_coord = sparseGrid->getCoord(ids->template get<0>(chunk)*sparseGrid->getBlockSize());
+
+		Box<dim,int> box;
+
+		for (int i = 0 ; i < dim ; i++)
+		{
+			box.setLow(i,chunk_coord.get(i));
+			box.setHigh(i,chunk_coord.get(i) + sparseGrid->getBlockEdgeSize() - 1);
+		}
 
-//	blockMap_type & blockMap;
+
+		if (sub_set.Intersect(box,res) == true)
+		{
+			// remove the offset
+			for (int i = 0 ; i < dim ; i++)
+			{
+				res.setLow(i,res.getLow(i) - chunk_coord.get(i));
+				res.setHigh(i,res.getHigh(i) - chunk_coord.get(i));
+			}
+
+			in_chunk_it.reinitialize(grid_key_dx_iterator_sub<dim>(chunk_sz,res.getKP1(),res.getKP2()));
+
+			while (in_chunk_it.isNext() == true && data->template get<pMask::value>(chunk)[chunk_sz.LinId(in_chunk_it.get())] == 0)
+			{
+				++in_chunk_it;
+			}
+		}
+	}
+
+	//! Select the first valid point chunk
+	void SelectValid()
+	{
+		while (in_chunk_it.isNext() == true && data->template get<pMask::value>(chunk)[chunk_sz.LinId(in_chunk_it.get())] == 0)
+		{
+			++in_chunk_it;
+		}
+
+		while (in_chunk_it.isNext() == false && chunk < ids->size())
+		{
+			chunk++;
+
+			initialize_chunk_it();
+		}
+	}
+
+	/*! \brief Initialize chunk_sz member
+	 *
+	 *
+	 */
+	void initialize_chunk_sz()
+	{
+		size_t sz[dim];
+
+		for (int i = 0 ; i < dim ; i++)
+		{sz[i] = sparseGrid->getBlockEdgeSize();}
+
+		chunk_sz.setDimensions(sz);
+	}
 
 public:
 
+	/*! \brief Default constructor
+	 *
+	 */
+	inline SparseGridGpu_iterator_sub()
+	:chunk(0),
+	 sparseGrid(NULL),
+	 ids(NULL),
+	 data(NULL)
+	{
+		initialize_chunk_sz();
+	}
+
+	/*! \brief Constructor
+	 *
+	 * \param sparseGrid original sparse grid
+	 * \param start starting point
+	 * \param stop stop point
+	 *
+	 */
+	inline SparseGridGpu_iterator_sub(const SparseGridType & sparseGrid,const grid_key_dx<dim> & start,const  grid_key_dx<dim> & stop)
+	:chunk(0),
+	 sparseGrid(&sparseGrid),
+	 ids(&sparseGrid.private_get_index_array()),
+	 data(&sparseGrid.private_get_data_array())
+	{
+		for (int i = 0; i < dim ; i++)
+		{
+			sub_set.setLow(i,start.get(i));
+			sub_set.setHigh(i,stop.get(i));
+		}
+
+		initialize_chunk_sz();
+		in_chunk_it.invalidate();
+		initialize_chunk_it();
+
+		SelectValid();
+	}
+
 	/*! \brief Reinitialize the iterator
 	 *
-	 * it re-initialize the iterator with the passed grid_key_dx_iterator_sub
-	 * the actual position of the grid_key_dx_iterator_sub is ignored
+	 * \param it_sub subiterator
+	 *
+	 */
+	inline void reinitialize(const SparseGridGpu_iterator_sub<dim,SparseGridType> & it_sub)
+	{
+		this->operator=(it_sub);
+	}
+
+	/*! \brief Check if there is the next element
+	 *
+	 * Check if there is the next element
 	 *
-	 * \param g_s_it grid_key_dx_iterator_sub
+	 * \return true if there is the next, false otherwise
 	 *
 	 */
-	inline void reinitialize(const SparseGridGpu_iterator_sub & g_s_it)
+	bool isNext() const
 	{
-		// Reinitialize the iterator
-		chunk = g_s_it.chunk;
-		pnt = g_s_it.pnt;
+		return chunk < ids->size();
 	}
 
+	/*! \brief Get the next element
+	 *
+	 * Get the next element
+	 *
+	 * \return the next grid_key
+	 *
+	 */
+	inline SparseGridGpu_iterator_sub<dim,SparseGridType> & operator++()
+	{
+		++in_chunk_it;
 
-};
+		SelectValid();
 
-class SparseGridGpu_iterator
-{
+		return *this;
+	}
+
+	/*! \brief return the actual point
+	 *
+	 * \return the index of the actual point
+	 *
+	 */
+	inline sparse_grid_gpu_index<SparseGridType> get() const
+	{
+		sparse_grid_gpu_index<SparseGridType> spgi(*sparseGrid,chunk,chunk_sz.LinId(in_chunk_it.get()));
 
+		return spgi;
+	}
+
+	SparseGridGpu_iterator_sub<dim,SparseGridType> & operator=(const SparseGridGpu_iterator_sub<dim,SparseGridType> & it_sub)
+	{
+		chunk = it_sub.chunk;
+		sparseGrid = it_sub.sparseGrid;
+		chunk_coord = it_sub.chunk_coord;
+		ids = it_sub.ids;
+		data = it_sub.data;
+		sub_set = it_sub.sub_set;
+		res = it_sub.res;
+		in_chunk_it.reinitialize(it_sub.in_chunk_it);
+		chunk_sz = it_sub.chunk_sz;
+
+		return *this;
+	}
 };
 
 
diff --git a/src/SparseGridGpu/SparseGridGpu.hpp b/src/SparseGridGpu/SparseGridGpu.hpp
index c5da1021909a8ec8270fa86b711e5122cfffe363..20775c1263c5e158c98bb930e81cb24e4349648d 100644
--- a/src/SparseGridGpu/SparseGridGpu.hpp
+++ b/src/SparseGridGpu/SparseGridGpu.hpp
@@ -14,6 +14,7 @@
 #include "Iterators/SparseGridGpu_iterator_sub.hpp"
 #include "Geometry/grid_zmb.hpp"
 #include "util/stat/common_statistics.hpp"
+#include "Iterators/SparseGridGpu_iterator.hpp"
 #if defined(OPENFPM_DATA_ENABLE_IO_MODULE) || defined(PERFORMANCE_TEST)
 #include "VTKWriter/VTKWriter.hpp"
 #endif
@@ -364,11 +365,14 @@ private:
 
     openfpm::vector_gpu<aggregate<indexT>> nn_blocks;
 
+    typedef SparseGridGpu<dim,AggregateT,blockEdgeSize,threadBlockSize,indexT,layout_base,linearizer> self;
+
 protected:
     static constexpr unsigned int blockSize = BlockTypeOf<AggregateBlockT, 0>::size;
     typedef AggregateBlockT AggregateInternalT;
 
 public:
+
     static constexpr unsigned int dims = dim;
     static constexpr unsigned int blockEdgeSize_ = blockEdgeSize;
 
@@ -376,7 +380,7 @@ public:
 
     template<typename Tfunc> using layout_mfunc = memory_traits_inte<Tfunc>;
 
-    typedef grid_key_dx<dim, indexT> base_key;
+    typedef sparse_grid_gpu_index<self> base_key;
 
     /*! \brief return the size of the grid
      *
@@ -385,7 +389,7 @@ public:
      */
     inline size_t size() const
     {
-        return gridGeometry.size();
+        return this->countExistingElements();
     }
 
     /*! \brief This is a meta-function return which type of sub iterator a grid produce
@@ -394,9 +398,9 @@ public:
      *
      */
     template <typename stencil = no_stencil>
-    static SparseGridGpu_iterator_sub<dim> type_of_subiterator()
+    static SparseGridGpu_iterator_sub<dim,self> type_of_subiterator()
     {
-        return SparseGridGpu_iterator_sub<dim>();
+        return SparseGridGpu_iterator_sub<dim,self>();
     }
 
     /*! \brief This is a meta-function return which type of iterator a grid produce
@@ -404,9 +408,9 @@ public:
      * \return the type of the sub-grid iterator
      *
      */
-    static SparseGridGpu_iterator type_of_iterator()
+    static SparseGridGpu_iterator<dim,self> type_of_iterator()
     {
-        return SparseGridGpu_iterator();
+        return SparseGridGpu_iterator<dim,self>(std::declval<self>());
     }
 
     template<typename dim3T>
@@ -647,6 +651,8 @@ private:
 
 public:
 
+    typedef AggregateT value_type;
+
     SparseGridGpu()
 	:stencilSupportRadius(1)
     {};
@@ -788,7 +794,7 @@ public:
         return gridGeometry.LinId(coord);
     }
 
-    inline grid_key_dx<dim, int> getCoord(size_t linId)
+    inline grid_key_dx<dim, int> getCoord(size_t linId) const
     {
         return gridGeometry.InvLinId(linId);
     }
@@ -798,20 +804,64 @@ public:
     	return gridSize.getGPUIterator(start,stop,n_thr);
     }
 
-    // Data management methods
 
+    /*! \brief Get an element using the point coordinates
+     *
+     * \tparam p property index
+     *
+     * \param coord point coordinates
+     *
+     * \return the element
+     *
+     */
     template<unsigned int p, typename CoordT>
-    auto get(const CoordT &coord) const -> const ScalarTypeOf<AggregateBlockT, p> &
+    auto get(const grid_key_dx<dim,CoordT> & coord) const -> const ScalarTypeOf<AggregateBlockT, p> &
     {
         return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::template get<p>(gridGeometry.LinId(coord));
     }
 
+    /*! \brief Get an element using sparse_grid_gpu_index (using this index it guarantee that the point exist)
+     *
+     * \tparam p property index
+     *
+     * \param element
+     *
+     * \return the element
+     *
+     */
+    template<unsigned int p>
+    auto get(const sparse_grid_gpu_index<self> & coord) const -> const ScalarTypeOf<AggregateBlockT, p> &
+    {
+        return private_get_data_array().template get<p>(coord.get_cnk_pos_id())[coord.get_data_id()];
+    }
+
+    /*! \brief Get an element using sparse_grid_gpu_index (using this index it guarantee that the point exist)
+     *
+     * \tparam p property index
+     *
+     * \param element
+     *
+     * \return the element
+     *
+     */
+    template<unsigned int p>
+    auto get(const sparse_grid_gpu_index<self> & coord) -> ScalarTypeOf<AggregateBlockT, p> &
+    {
+        return private_get_data_array().template get<p>(coord.get_cnk_pos_id())[coord.get_data_id()];
+    }
+
     template<unsigned int p, typename CoordT>
     auto insert(const CoordT &coord) -> ScalarTypeOf<AggregateBlockT, p> &
     {
         return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::template insert<p>(gridGeometry.LinId(coord));
     }
 
+    /*! \brief Before inserting any element you have to call this function to initialize the insert buffer
+     *
+     * \param nBlock number of blocks the insert buffer has
+     * \param nSlot maximum number of insertions each thread block does
+     *
+     */
     template<typename dim3T>
     void setGPUInsertBuffer(dim3T nBlock, dim3T nSlot)
     {
@@ -904,7 +954,7 @@ public:
                 threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), this->toKernel(),nn_blocks.toKernel());
     }
 
-    size_t countExistingElements()
+    size_t countExistingElements() const
     {
         // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
         auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
@@ -1143,6 +1193,8 @@ public:
         ::unsetBit(bitMask, PADDING_BIT);
     }
 
+
+
     /*! \brief Insert the point on host side and flush directly
      *
      * First you have to move everything on host with deviceToHost, insertFlush and than move to GPU again
@@ -1195,6 +1247,46 @@ public:
     	}
     }
 
+    /*! \brief Return a SparseGrid iterator
+     *
+     * \return a SparseGrid iterator
+     *
+     */
+    decltype(self::type_of_iterator()) getIterator() const
+    {
+    	return decltype(self::type_of_iterator())(*this);
+    }
+
+    /*! \brief Return a SparseGrid iterator only on a sub-set of elements
+     *
+     * \return a SparseGrid iterator on a subset of elements
+     *
+     */
+    decltype(self::type_of_subiterator()) getIterator(const grid_key_dx<dim> & start, const grid_key_dx<dim> & stop) const
+    {
+    	return decltype(self::type_of_subiterator())(*this,start,stop);
+    }
+
+    /*! \brief Return the index array of the blocks
+     *
+     * \return the index arrays of the blocks
+     *
+     */
+    auto private_get_index_array() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer()) &
+    {
+    	return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
+    }
+
+    /*! \brief Return the index array of the blocks
+     *
+     * \return the index arrays of the blocks
+     *
+     */
+    auto private_get_data_array() -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer()) &
+    {
+    	return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
+    }
+
     /*! \brief Return the index array of the blocks
      *
      * \return the index arrays of the blocks
@@ -1210,7 +1302,7 @@ public:
      * \return the index arrays of the blocks
      *
      */
-    auto private_get_data_array() -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer())
+    auto private_get_data_array() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer()) &
     {
     	return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
     }
diff --git a/src/SparseGridGpu/SparseGridGpu_ker.cuh b/src/SparseGridGpu/SparseGridGpu_ker.cuh
index a9d168f069315674c04e76988a6cb3b42dfcb225..929fbee1bf0439bd0429842a21420d17b5f0b11c 100644
--- a/src/SparseGridGpu/SparseGridGpu_ker.cuh
+++ b/src/SparseGridGpu/SparseGridGpu_ker.cuh
@@ -82,6 +82,34 @@ public:
 
     }
 
+    /*! \brief Given a point to insert, return the block-id and offset of that point
+     *
+     * \param p Point the thread is processing
+     * \param blk Block id the point is falling into
+     * \param offset linearized index within the block
+     *
+     * \return true if the point is active (falls inside the iterator start/stop bounds), false otherwise
+     *
+     */
+    template<typename ite_type>
+    inline __device__ bool getInsertBlockOffset(const ite_type & itd, const grid_key_dx<dim, int> & p, grid_key_dx<dim, int> & blk, int & offset)
+    {
+    	int accu = 1;
+    	offset = 0;
+
+    	bool active = true;
+
+    	for (int i = 0 ; i < dim ; i++)
+    	{
+    		blk.set_d(i,p.get(i) / getBlockEdgeSize());
+    		offset += (p.get(i) % getBlockEdgeSize()) * accu;
+    		accu *= getBlockEdgeSize();
+    		active = active && (p.get(i) >= (itd.start.get(i) + itd.start_base.get(i))) && (p.get(i) <= itd.stop.get(i));
+    	}
+
+    	return active;
+    }
+
     template<typename CoordT>
     inline __device__ size_t getBlockLinId(CoordT blockCoord) const
     {
diff --git a/src/SparseGridGpu/performance/Stencil_performance_tests.cu b/src/SparseGridGpu/performance/Stencil_performance_tests.cu
index f037187abecaff81cb35ba89134141630bc325c6..595fd4dcc233d12c99ed49b3716b7fd08c32b8a5 100644
--- a/src/SparseGridGpu/performance/Stencil_performance_tests.cu
+++ b/src/SparseGridGpu/performance/Stencil_performance_tests.cu
@@ -27,18 +27,6 @@ report_sparse_grid_tests report_sparsegrid_funcs;
 std::string suiteURI = "performance.SparseGridGpu";
 std::set<std::string> testSet;
 
-//// 2D params
-//const unsigned int minSpatialEdgeSize2D = 512;
-//const unsigned int maxSpatialEdgeSize2D = 8192;
-//const unsigned int minBlockEdgeSize2D = 2;
-//const unsigned int maxBlockEdgeSize2D = 32;
-//// 3D params
-//const unsigned int minSpatialEdgeSize3D = 64;
-//const unsigned int maxSpatialEdgeSize3D = 512;
-//const unsigned int minBlockEdgeSize3D = 2;
-//const unsigned int maxBlockEdgeSize3D = 8;
-////
-
 struct Fixture
 {
     Fixture()
@@ -1146,6 +1134,7 @@ BOOST_AUTO_TEST_SUITE(SparseGridGpu_test)
 BOOST_AUTO_TEST_CASE(testConv3x3x3_gridScaling)
 {
     std::string testURI = suiteURI + ".device.conv3x3x3.sparse.N.3D.gridScaling";
+    unsigned int counter = 0;
     launch_testConv3x3x3_perf(testURI, counter++);
 
     testSet.insert(testURI);
diff --git a/src/SparseGridGpu/tests/SparseGridGpu_tests.cu b/src/SparseGridGpu/tests/SparseGridGpu_tests.cu
index cd0f57bd9e7ed39ef21dbd4f9e842faca53db901..ec58f5d8cdcfe3223bba2139521c3bf277c5caad 100644
--- a/src/SparseGridGpu/tests/SparseGridGpu_tests.cu
+++ b/src/SparseGridGpu/tests/SparseGridGpu_tests.cu
@@ -160,15 +160,6 @@ __global__ void copyBlocksToOutput(SparseGridType sparseGrid, VectorOutType outp
 
     auto value = sparseGrid.template get<p>(coord);
 
-//    printf("copyBlocksToOutput: bDim=(%d,%d), bId=(%d,%d), tId=(%d,%d) : "
-//           "pos=%ld, coord={%d,%d}, value=%d\n",
-//           bDimX, bDimY,
-//           bIdX, bIdY,
-//           tIdX, tIdY,
-//           pos,
-//           x, y,
-//           static_cast<int>(value)); //debug
-
     output.template get<p>(pos) = value;
 
     // Compiler avoid warning
@@ -331,12 +322,9 @@ struct HeatStencil
                 SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value;
 
         __shared__ ScalarT enlargedBlock[enlargedBlockSize];
-//        sparseGrid.loadBlock<p>(dataBlockLoad, enlargedBlock);
-//        sparseGrid.loadGhost<p>(dataBlockId, enlargedBlock);
 
         sparseGrid.loadGhostBlock<p_src>(dataBlockLoad,dataBlockIdPos,enlargedBlock);
 
-//        sparseGrid.loadGhost<p>(dataBlockId, nullptr, enlargedBlock);
         __syncthreads();
 
         if (applyStencilHere)
@@ -440,18 +428,6 @@ struct HeatStencil2
             ScalarT laplacian = -2.0 * dim * cur; // The central part of the stencil
             for (int d = 0; d < dim; ++d)
             {
-//                const auto boundary = boundaryDirection[d];
-//                ScalarT neighbourPlus = enlargedBlock[nPlusId[d]];
-//                ScalarT neighbourMinus = enlargedBlock[nMinusId[d]];
-//                if (boundary == 1)
-//                {
-//                    neighbourPlus = *(nPlus[d]);
-//                }
-//                else if (boundary == -1)
-//                {
-//                    neighbourMinus = *(nMinus[d]);
-//                }
-//                laplacian += neighbourMinus + neighbourPlus;
                 laplacian += *(nMinus[d]) + *(nPlus[d]);
             }
             enlargedBlock[linId] = cur + dt * laplacian;
@@ -518,7 +494,6 @@ BOOST_AUTO_TEST_CASE(testInsert)
 	bool match = true;
 	for (size_t i = 0; i < output.size(); i++)
 	{
-//            auto expectedValue = (i < gridSize * blockSizeInsert) ? i : 666;
 		auto coord = sparseGrid.getCoord(i);
 		auto expectedValue = coord.get(0);
 
@@ -572,17 +547,9 @@ BOOST_AUTO_TEST_CASE(testInsert3D)
 	bool match = true;
 	for (size_t i = 0; i < output.size(); i++)
 	{
-//            auto expectedValue = (i < gridSize * blockSizeInsert) ? i : 666;
 		auto coord = sparseGrid.getCoord(i);
 		auto expectedValue = coord.get(0);
 
-//            std::cout << "sparseGrid(" << coord.get(0) << "," << coord.get(1) << "," << coord.get(2) << ") = "
-//                      << sparseGrid.template get<0>(coord)
-//                      << " == "
-//                      << expectedValue
-//                      << " == "
-//                      << output.template get<0>(i) << " = output(" << i << ")"
-//                      << std::endl;
 		match &= output.template get<0>(i) == sparseGrid.template get<0>(coord);
 		match &= output.template get<0>(i) == expectedValue;
 	}
@@ -751,9 +718,7 @@ BOOST_AUTO_TEST_CASE(testTagBoundaries2)
 	}
 	///////
 
-//        sparseGrid.hostToDevice(); //just sync masks
 	sparseGrid.deviceToHost(); //just sync masks
-//        sparseGrid.deviceToHost<0>();
 
 	sparseGrid.findNeighbours(); // Pre-compute the neighbours pos for each block!
 	// Now tag the boundaries
@@ -815,10 +780,6 @@ BOOST_AUTO_TEST_CASE(testStencilHeat)
 	insertConstantValue<0> << < gridSize, blockSizeInsert >> > (sparseGrid.toKernel(), 0);
 	sparseGrid.flush < smax_< 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
 
-//	sparseGrid.setGPUInsertBuffer(gridSize, blockSizeInsert);
-//	insertBoundaryValuesHeat<0> << < gridSize, blockSizeInsert >> > (sparseGrid.toKernel());
-//	sparseGrid.flush < smax_< 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
-
 	sparseGrid.findNeighbours(); // Pre-compute the neighbours pos for each block!
 	sparseGrid.tagBoundaries();
 
@@ -837,10 +798,6 @@ BOOST_AUTO_TEST_CASE(testStencilHeat)
         cudaDeviceSynchronize();
         sparseGrid.applyStencils<HeatStencil<dim, 1, 0>>(STENCIL_MODE_INPLACE, 0.1);
         cudaDeviceSynchronize();
-//        // DEBUG //
-//        sparseGrid.deviceToHost<0,1>();
-//        sparseGrid.write("test_heat_stencil"+std::to_string(iter)+".vtk");
-//        ////
 	}
 
 	sparseGrid.deviceToHost<0,1>();
@@ -901,10 +858,6 @@ BOOST_AUTO_TEST_CASE(testStencilHeatInsert)
 	CUDA_LAUNCH_DIM3((insertBoundaryValuesHeat<0>),gridSize, blockSizeInsert,sparseGrid.toKernel());
 	sparseGrid.flush < smax_< 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
 
-//	sparseGrid.setGPUInsertBuffer(gridSize, blockSizeInsert);
-//	CUDA_LAUNCH_DIM3((insertBoundaryValuesHeat<1>),gridSize, blockSizeInsert,sparseGrid.toKernel());
-//	sparseGrid.flush < smax_< 1 >> (ctx, flush_type::FLUSH_ON_DEVICE);
-
 	sparseGrid.findNeighbours(); // Pre-compute the neighbours pos for each block!
 
 //        // Now tag the boundaries
@@ -918,7 +871,6 @@ BOOST_AUTO_TEST_CASE(testStencilHeatInsert)
 	for (unsigned int iter=0; iter<maxIter; ++iter)
 	{
 		sparseGrid.applyStencils<HeatStencil<dim, 0,0>>(STENCIL_MODE_INSERT, 0.1);
-//		sparseGrid.applyStencils<HeatStencil<dim, 1,0>>(STENCIL_MODE_INSERT, 0.1);
 	}
 
 
@@ -988,6 +940,18 @@ BOOST_AUTO_TEST_CASE(testFlushInsert)
 	sparseGrid.insertFlush<0>(grid_key_dx<3>({36,86,27})) = 9.0;
 	sparseGrid.insertFlush<0>(grid_key_dx<3>({34,36,7})) = 10.0;
 
+	////// Add points in the same blocks
+
+	sparseGrid.insertFlush<0>(grid_key_dx<3>({4,6,7})) = 2.0;
+	sparseGrid.insertFlush<0>(grid_key_dx<3>({12,16,17})) = 3.0;
+	sparseGrid.insertFlush<0>(grid_key_dx<3>({12,46,27})) = 4.0;
+	sparseGrid.insertFlush<0>(grid_key_dx<3>({35,63,11})) = 5.0;
+	sparseGrid.insertFlush<0>(grid_key_dx<3>({38,96,47})) = 6.0;
+	sparseGrid.insertFlush<0>(grid_key_dx<3>({131,56,37})) = 7.0;
+	sparseGrid.insertFlush<0>(grid_key_dx<3>({132,76,17})) = 8.0;
+	sparseGrid.insertFlush<0>(grid_key_dx<3>({37,86,27})) = 9.0;
+	sparseGrid.insertFlush<0>(grid_key_dx<3>({35,36,7})) = 10.0;
+
 	BOOST_REQUIRE_EQUAL(sparseGrid.get<0>(grid_key_dx<3>({3,6,7})),2.0);
 	BOOST_REQUIRE_EQUAL(sparseGrid.get<0>(grid_key_dx<3>({13,16,17})),3.0);
 	BOOST_REQUIRE_EQUAL(sparseGrid.get<0>(grid_key_dx<3>({13,46,27})),4.0);
@@ -998,6 +962,16 @@ BOOST_AUTO_TEST_CASE(testFlushInsert)
 	BOOST_REQUIRE_EQUAL(sparseGrid.get<0>(grid_key_dx<3>({36,86,27})),9.0);
 	BOOST_REQUIRE_EQUAL(sparseGrid.get<0>(grid_key_dx<3>({34,36,7})),10.0);
 
+	BOOST_REQUIRE_EQUAL(sparseGrid.get<0>(grid_key_dx<3>({4,6,7})),2.0);
+	BOOST_REQUIRE_EQUAL(sparseGrid.get<0>(grid_key_dx<3>({12,16,17})),3.0);
+	BOOST_REQUIRE_EQUAL(sparseGrid.get<0>(grid_key_dx<3>({12,46,27})),4.0);
+	BOOST_REQUIRE_EQUAL(sparseGrid.get<0>(grid_key_dx<3>({35,63,11})),5.0);
+	BOOST_REQUIRE_EQUAL(sparseGrid.get<0>(grid_key_dx<3>({38,96,47})),6.0);
+	BOOST_REQUIRE_EQUAL(sparseGrid.get<0>(grid_key_dx<3>({131,56,37})),7.0);
+	BOOST_REQUIRE_EQUAL(sparseGrid.get<0>(grid_key_dx<3>({132,76,17})),8.0);
+	BOOST_REQUIRE_EQUAL(sparseGrid.get<0>(grid_key_dx<3>({37,86,27})),9.0);
+	BOOST_REQUIRE_EQUAL(sparseGrid.get<0>(grid_key_dx<3>({35,36,7})),10.0);
+
 	sparseGrid.template hostToDevice<0>();
 
 	// Check on device I can get information
@@ -1198,122 +1172,221 @@ BOOST_AUTO_TEST_CASE(testSparseGridGpuOutput)
 	sparseGrid.write("SparseGridGPU_output.vtk");
 }
 
-    BOOST_AUTO_TEST_CASE(testSparseGridGpuOutput3D)
-    {
-        constexpr unsigned int dim = 3;
-        constexpr unsigned int blockEdgeSize = 4;
-        typedef aggregate<float> AggregateT;
+BOOST_AUTO_TEST_CASE(testSparseGridGpuOutput3D)
+{
+	constexpr unsigned int dim = 3;
+	constexpr unsigned int blockEdgeSize = 4;
+	typedef aggregate<float> AggregateT;
 
-        size_t sz[3] = {512,512,512};
+	size_t sz[3] = {512,512,512};
 //        dim3 gridSize(128,128,128);
-        dim3 gridSize(32,32,32);
+	dim3 gridSize(32,32,32);
 
-        grid_smb<dim, blockEdgeSize> blockGeometry(sz);
-        SparseGridGpu<dim, AggregateT, blockEdgeSize, 64, long int> sparseGrid(blockGeometry);
-        mgpu::ofp_context_t ctx;
-        sparseGrid.template setBackgroundValue<0>(0);
+	grid_smb<dim, blockEdgeSize> blockGeometry(sz);
+	SparseGridGpu<dim, AggregateT, blockEdgeSize, 64, long int> sparseGrid(blockGeometry);
+	mgpu::ofp_context_t ctx;
+	sparseGrid.template setBackgroundValue<0>(0);
 
-        grid_key_dx<3,int> start({256,256,256});
+	grid_key_dx<3,int> start({256,256,256});
 
-        // Insert values on the grid
-        sparseGrid.setGPUInsertBuffer(gridSize,dim3(1));
-        CUDA_LAUNCH_DIM3((insertSphere3D<0>),
-                gridSize, dim3(blockEdgeSize*blockEdgeSize*blockEdgeSize,1,1),
-                sparseGrid.toKernel(), start, 64, 56, 1);
-        sparseGrid.flush < smax_< 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
+	// Insert values on the grid
+	sparseGrid.setGPUInsertBuffer(gridSize,dim3(1));
+	CUDA_LAUNCH_DIM3((insertSphere3D<0>),
+			gridSize, dim3(blockEdgeSize*blockEdgeSize*blockEdgeSize,1,1),
+			sparseGrid.toKernel(), start, 64, 56, 1);
+	sparseGrid.flush < smax_< 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
 
-        cudaDeviceSynchronize();
+	cudaDeviceSynchronize();
 
-        sparseGrid.findNeighbours(); // Pre-compute the neighbours pos for each block!
-        sparseGrid.tagBoundaries();
+	sparseGrid.findNeighbours(); // Pre-compute the neighbours pos for each block!
+	sparseGrid.tagBoundaries();
 
-        sparseGrid.template applyBoundaryStencils<BoundaryStencilSetX<dim,0,0>>();
+	sparseGrid.template applyBoundaryStencils<BoundaryStencilSetX<dim,0,0>>();
 
-        cudaDeviceSynchronize();
+	cudaDeviceSynchronize();
 
-        sparseGrid.template deviceToHost<0>();
+	sparseGrid.template deviceToHost<0>();
 
-        sparseGrid.write("SparseGridGPU_output3D.vtk");
-    }
+	sparseGrid.write("SparseGridGPU_output3D.vtk");
 
-    BOOST_AUTO_TEST_CASE(testSparseGridGpuOutput3DHeatStencil)
-    {
-        constexpr unsigned int dim = 3;
-        constexpr unsigned int blockEdgeSize = 4;
-        typedef aggregate<float, float> AggregateT;
+	bool test = compare("SparseGridGPU_output3D.vtk","test_data/SparseGridGPU_output3D_test.vtk");
+	BOOST_REQUIRE_EQUAL(true,test);
+}
+
+BOOST_AUTO_TEST_CASE(testSparseGridGpuOutput3DHeatStencil)
+{
+	constexpr unsigned int dim = 3;
+	constexpr unsigned int blockEdgeSize = 4;
+	typedef aggregate<float, float> AggregateT;
 
-        size_t sz[3] = {512,512,512};
+	size_t sz[3] = {512,512,512};
 //        dim3 gridSize(128,128,128);
-        dim3 gridSize(32,32,32);
-
-        grid_smb<dim, blockEdgeSize> blockGeometry(sz);
-        SparseGridGpu<dim, AggregateT, blockEdgeSize, 64, long int> sparseGrid(blockGeometry);
-        mgpu::ofp_context_t ctx;
-        sparseGrid.template setBackgroundValue<0>(0);
-
-        ///// Insert sparse content, a set of 3 hollow spheres /////
-        // Sphere 1
-        grid_key_dx<3,int> start1({256,256,256});
-        sparseGrid.setGPUInsertBuffer(gridSize,dim3(1));
-        CUDA_LAUNCH_DIM3((insertSphere3D<0>),
-                         gridSize, dim3(blockEdgeSize*blockEdgeSize*blockEdgeSize,1,1),
-                         sparseGrid.toKernel(), start1, 64, 32, 1);
-        cudaDeviceSynchronize();
-        sparseGrid.flush < smax_< 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
-        cudaDeviceSynchronize();
+	dim3 gridSize(32,32,32);
 
-        // Sphere 2
-        grid_key_dx<3,int> start2({192,192,192});
-        sparseGrid.setGPUInsertBuffer(gridSize,dim3(1));
-        CUDA_LAUNCH_DIM3((insertSphere3D<0>),
-                         gridSize, dim3(blockEdgeSize*blockEdgeSize*blockEdgeSize,1,1),
-                         sparseGrid.toKernel(), start2, 64, 44, 1);
-        cudaDeviceSynchronize();
-        sparseGrid.flush < smax_< 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
-        cudaDeviceSynchronize();
+	grid_smb<dim, blockEdgeSize> blockGeometry(sz);
+	SparseGridGpu<dim, AggregateT, blockEdgeSize, 64, long int> sparseGrid(blockGeometry);
+	mgpu::ofp_context_t ctx;
+	sparseGrid.template setBackgroundValue<0>(0);
 
-        // Sphere 3
-        grid_key_dx<3,int> start3({340,192,192});
-        sparseGrid.setGPUInsertBuffer(gridSize,dim3(1));
-        CUDA_LAUNCH_DIM3((insertSphere3D<0>),
-                         gridSize, dim3(blockEdgeSize*blockEdgeSize*blockEdgeSize,1,1),
-                         sparseGrid.toKernel(), start3, 20, 15, 1);
-        cudaDeviceSynchronize();
-        sparseGrid.flush < smax_< 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
-        cudaDeviceSynchronize();
-        ///// /////
+	///// Insert sparse content, a set of 3 hollow spheres /////
+	// Sphere 1
+	grid_key_dx<3,int> start1({256,256,256});
+	sparseGrid.setGPUInsertBuffer(gridSize,dim3(1));
+	CUDA_LAUNCH_DIM3((insertSphere3D<0>),
+					 gridSize, dim3(blockEdgeSize*blockEdgeSize*blockEdgeSize,1,1),
+					 sparseGrid.toKernel(), start1, 64, 32, 1);
+	cudaDeviceSynchronize();
+	sparseGrid.flush < smax_< 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
+	cudaDeviceSynchronize();
 
-        sparseGrid.findNeighbours(); // Pre-compute the neighbours pos for each block!
-        sparseGrid.tagBoundaries();
+	// Sphere 2
+	grid_key_dx<3,int> start2({192,192,192});
+	sparseGrid.setGPUInsertBuffer(gridSize,dim3(1));
+	CUDA_LAUNCH_DIM3((insertSphere3D<0>),
+					 gridSize, dim3(blockEdgeSize*blockEdgeSize*blockEdgeSize,1,1),
+					 sparseGrid.toKernel(), start2, 64, 44, 1);
+	cudaDeviceSynchronize();
+	sparseGrid.flush < smax_< 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
+	cudaDeviceSynchronize();
 
-        // Now apply some boundary conditions
-        sparseGrid.template applyBoundaryStencils<BoundaryStencilSetXRescaled<dim,0,0>>(
-                192, 384,
-                0.0, 10.0);
-        cudaDeviceSynchronize();
+	// Sphere 3
+	grid_key_dx<3,int> start3({340,192,192});
+	sparseGrid.setGPUInsertBuffer(gridSize,dim3(1));
+	CUDA_LAUNCH_DIM3((insertSphere3D<0>),
+					 gridSize, dim3(blockEdgeSize*blockEdgeSize*blockEdgeSize,1,1),
+					 sparseGrid.toKernel(), start3, 20, 15, 1);
+	cudaDeviceSynchronize();
+	sparseGrid.flush < smax_< 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
+	cudaDeviceSynchronize();
+	///// /////
 
-        // Now apply the laplacian operator
+	sparseGrid.findNeighbours(); // Pre-compute the neighbours pos for each block!
+	sparseGrid.tagBoundaries();
+
+	// Now apply some boundary conditions
+	sparseGrid.template applyBoundaryStencils<BoundaryStencilSetXRescaled<dim,0,0>>(
+			192, 384,
+			0.0, 10.0);
+	cudaDeviceSynchronize();
+
+	// Now apply the laplacian operator
 //        const unsigned int maxIter = 1000;
-        const unsigned int maxIter = 100;
-        for (unsigned int iter=0; iter<maxIter; ++iter)
-        {
-            for (int innerIter=0; innerIter<10; ++innerIter)
-            {
-                sparseGrid.applyStencils<HeatStencil<dim, 0, 1>>(STENCIL_MODE_INPLACE, 0.1);
-                cudaDeviceSynchronize();
-                sparseGrid.applyStencils<HeatStencil<dim, 1, 0>>(STENCIL_MODE_INPLACE, 0.1);
-                cudaDeviceSynchronize();
-            }
-            // DEBUG //
-            sparseGrid.deviceToHost<0,1>();
-            sparseGrid.write("SparseGridGPU_output3DHeatStencil_"+std::to_string(iter)+".vtk");
-            ////
-        }
+	const unsigned int maxIter = 100;
+	for (unsigned int iter=0; iter<maxIter; ++iter)
+	{
+		for (int innerIter=0; innerIter<10; ++innerIter)
+		{
+			sparseGrid.applyStencils<HeatStencil<dim, 0, 1>>(STENCIL_MODE_INPLACE, 0.1);
+			cudaDeviceSynchronize();
+			sparseGrid.applyStencils<HeatStencil<dim, 1, 0>>(STENCIL_MODE_INPLACE, 0.1);
+			cudaDeviceSynchronize();
+		}
+	}
 
-        sparseGrid.template deviceToHost<0,1>();
+	sparseGrid.deviceToHost<0,1>();
+	sparseGrid.write("SparseGridGPU_output3DHeatStencil.vtk");
+}
 
-        sparseGrid.write("SparseGridGPU_output3DHeatStencil.vtk");
-    }
+BOOST_AUTO_TEST_CASE(test_sparse_grid_iterator_host)
+{
+	constexpr unsigned int dim = 3;
+	constexpr unsigned int blockEdgeSize = 4;
+	typedef aggregate<float, float> AggregateT;
+
+	size_t sz[3] = {512,512,512};
+//        dim3 gridSize(128,128,128);
+	dim3 gridSize(32,32,32);
+
+	grid_smb<dim, blockEdgeSize> blockGeometry(sz);
+	SparseGridGpu<dim, AggregateT, blockEdgeSize, 64, long int> sparseGrid(blockGeometry);
+	mgpu::ofp_context_t ctx;
+	sparseGrid.template setBackgroundValue<0>(0);
+
+	///// Insert sparse content, a set of 3 hollow spheres /////
+	// Sphere 1
+	grid_key_dx<3,int> start1({256,256,256});
+	sparseGrid.setGPUInsertBuffer(gridSize,dim3(1));
+	CUDA_LAUNCH_DIM3((insertSphere3D<0>),
+					 gridSize, dim3(blockEdgeSize*blockEdgeSize*blockEdgeSize,1,1),
+					 sparseGrid.toKernel(), start1, 64, 32, 1);
+
+	sparseGrid.flush < smax_< 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
+
+	sparseGrid.template deviceToHost<0>();
+
+	bool match = true;
+
+	int count = 0;
+
+	auto it = sparseGrid.getIterator();
+
+	while (it.isNext())
+	{
+		auto key = it.get();
+
+		match &= sparseGrid.template get<0>(key) == 1.0;
+		//unsigned char bl = sparseGrid.template get<2>(key);
+
+		count++;
+
+		++it;
+	}
+
+	BOOST_REQUIRE_EQUAL(sparseGrid.countExistingElements(),count);
+	BOOST_REQUIRE_EQUAL(match,true);
+}
+
+BOOST_AUTO_TEST_CASE(test_sparse_grid_iterator_sub_host)
+{
+	constexpr unsigned int dim = 3;
+	constexpr unsigned int blockEdgeSize = 4;
+	typedef aggregate<float, float> AggregateT;
+
+	size_t sz[3] = {768,768,768};
+	dim3 gridSize(32,32,32);
+
+	grid_smb<dim, blockEdgeSize> blockGeometry(sz);
+	SparseGridGpu<dim, AggregateT, blockEdgeSize, 64, long int> sparseGrid(blockGeometry);
+	mgpu::ofp_context_t ctx;
+	sparseGrid.template setBackgroundValue<0>(0);
+
+	///// Insert sparse content, a set of 3 hollow spheres /////
+	// Sphere 1
+	grid_key_dx<3,int> start1({256,256,256});
+	sparseGrid.setGPUInsertBuffer(gridSize,dim3(1));
+	CUDA_LAUNCH_DIM3((insertSphere3D<0>),
+					 gridSize, dim3(blockEdgeSize*blockEdgeSize*blockEdgeSize,1,1),
+					 sparseGrid.toKernel(), start1, 32, 0, 1);
+
+	sparseGrid.flush < smax_< 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
+
+	sparseGrid.template deviceToHost<0>();
+
+	bool match = true;
+
+	int count = 0;
+
+	grid_key_dx<3> start({303,303,303});
+	grid_key_dx<3> stop({337,337,337});
+
+	auto it = sparseGrid.getIterator(start,stop);
+
+	while (it.isNext())
+	{
+		auto key = it.get();
+
+		match &= sparseGrid.template get<0>(key) == 1.0;
+
+		sparseGrid.template get<0>(key) = 5.0;
+
+		count++;
+
+		++it;
+	}
+
+	BOOST_REQUIRE_EQUAL(match,true);
+	BOOST_REQUIRE_EQUAL(count,42875);
+}
 
 #endif
 
diff --git a/src/SparseGridGpu/tests/utils/SparseGridGpu_testKernels.cuh b/src/SparseGridGpu/tests/utils/SparseGridGpu_testKernels.cuh
index 90446c5b687d368aa7d15f0e0d3039a3d914b7be..43fee90b244a68333853f823678bcb551687ece3 100644
--- a/src/SparseGridGpu/tests/utils/SparseGridGpu_testKernels.cuh
+++ b/src/SparseGridGpu/tests/utils/SparseGridGpu_testKernels.cuh
@@ -157,13 +157,13 @@ __global__ void insertSphere3D(SparseGridType sparseGrid, grid_key_dx<3,int> sta
     grid_key_dx<3,int> keyg;
     keyg = sparseGrid.getGlobalCoord(blk,offset);
 
-    const long long x = keyg.get(0) - (start.get(0) + gridDim.x / 2 * SparseGridType::blockEdgeSize_);
-    const long long y = keyg.get(1) - (start.get(1) + gridDim.y / 2 * SparseGridType::blockEdgeSize_);
-    const long long z = keyg.get(2) - (start.get(2) + gridDim.z / 2 * SparseGridType::blockEdgeSize_);
+    const long int x = (long int)keyg.get(0) - (start.get(0) + gridDim.x / 2 * SparseGridType::blockEdgeSize_);
+    const long int y = (long int)keyg.get(1) - (start.get(1) + gridDim.y / 2 * SparseGridType::blockEdgeSize_);
+    const long int z = (long int)keyg.get(2) - (start.get(2) + gridDim.z / 2 * SparseGridType::blockEdgeSize_);
 
     float radius = sqrt((float) (x*x + y*y + z*z));
 
-    bool is_active = radius < r1 && radius > r2;
+    bool is_active = radius < r1 && radius >= r2;
 
     if (is_active == true)
     {
@@ -179,7 +179,6 @@ __global__ void insertSphere3D(SparseGridType sparseGrid, grid_key_dx<3,int> sta
         if ( is_active == true)
         {
             ec.template get<p>()[offset] = value;
-//            ec.template get<p>()[offset] = x;
             BlockMapGpu_ker<>::setExist(ec.template get<pMask>()[offset]);
         }
     }
@@ -215,9 +214,9 @@ __global__ void insertSphere3D_radius(SparseGridType sparseGrid, grid_key_dx<3,i
     grid_key_dx<3,int> keyg;
     keyg = sparseGrid.getGlobalCoord(blk,offset);
 
-    const long long x = keyg.get(0) - (start.get(0) + gridDim.x / 2 * SparseGridType::blockEdgeSize_);
-    const long long y = keyg.get(1) - (start.get(1) + gridDim.y / 2 * SparseGridType::blockEdgeSize_);
-    const long long z = keyg.get(2) - (start.get(2) + gridDim.z / 2 * SparseGridType::blockEdgeSize_);
+    const long int x = (long int)keyg.get(0) - (start.get(0) + gridDim.x / 2 * SparseGridType::blockEdgeSize_);
+    const long int y = (long int)keyg.get(1) - (start.get(1) + gridDim.y / 2 * SparseGridType::blockEdgeSize_);
+    const long int z = (long int)keyg.get(2) - (start.get(2) + gridDim.z / 2 * SparseGridType::blockEdgeSize_);
 
     float radius = sqrt((float) (x*x + y*y + z*z));
 
diff --git a/src/Vector/map_vector_sparse.hpp b/src/Vector/map_vector_sparse.hpp
index 36a88c89d5fcb749324008370651b45f6f4054db..b610e816ef70e33f06677fe5d48342c424e41914 100644
--- a/src/Vector/map_vector_sparse.hpp
+++ b/src/Vector/map_vector_sparse.hpp
@@ -1447,6 +1447,13 @@ namespace openfpm
 		}
 
 	public:
+
+		vector_sparse()
+		:max_ele(0)
+		{
+			vct_data.resize(1);
+		}
+
         /*! \brief Get the indices buffer
         *
         * \return the reference to the indices buffer
@@ -1464,13 +1471,24 @@ namespace openfpm
         {
             return vct_data;
         }
-	public:
 
-		vector_sparse()
-		:max_ele(0)
-		{
-			vct_data.resize(1);
-		}
+        /*! \brief Get the indices buffer
+        *
+        * \return the reference to the indices buffer
+        */
+        auto getIndexBuffer() const -> const decltype(vct_index)&
+        {
+            return vct_index;
+        }
+
+        /*! \brief Get the data buffer
+         *
+         * \return the reference to the data buffer
+         */
+        auto getDataBuffer() const -> const decltype(vct_data)&
+        {
+            return vct_data;
+        }
 
 		/*! \brief Get an element of the vector
 		 *
@@ -1598,7 +1616,7 @@ namespace openfpm
 			if (v == ele)
 			{
 				// block exist
-				return vct_data.template get<p>(ele);
+				return vct_data.template get<p>(di);
 			}
 
 			// It does not exist, we create it di contain the index where we have to create the new block
@@ -1623,7 +1641,7 @@ namespace openfpm
 			if (v == ele)
 			{
 				// block exist
-				return vct_data.get(ele);
+				return vct_data.get(di);
 			}
 
 			// It does not exist, we create it di contain the index where we have to create the new block
diff --git a/test_data/SparseGridGPU_output3D_test.vtk b/test_data/SparseGridGPU_output3D_test.vtk
new file mode 100644
index 0000000000000000000000000000000000000000..e9211f38b7e3669cf668f9f996890a758c092f2e
Binary files /dev/null and b/test_data/SparseGridGPU_output3D_test.vtk differ