From 1c2c55e3f837948f7534899af5ac76aac282c104 Mon Sep 17 00:00:00 2001
From: Incardona Pietro <incardon@mpi-cbg.de>
Date: Mon, 3 Apr 2023 00:58:46 +0200
Subject: [PATCH] Fix map/pack for non-dense (sparse) grids

---
 openfpm_devices                               |   2 +-
 src/Grid/grid_dist_id.hpp                     |  94 ++---
 src/Grid/grid_dist_id_comm.hpp                | 343 +++++++++++++++---
 src/Grid/tests/grid_dist_id_dlb_unit_test.cpp |   6 +
 .../tests/sgrid_dist_id_gpu_unit_tests.cu     | 164 ++++-----
 5 files changed, 426 insertions(+), 183 deletions(-)

diff --git a/openfpm_devices b/openfpm_devices
index fc5ba1035..42f6b016d 160000
--- a/openfpm_devices
+++ b/openfpm_devices
@@ -1 +1 @@
-Subproject commit fc5ba1035ac72a3c083ab1bf8a3cb60639e0c241
+Subproject commit 42f6b016d3d2045fcabf906cf8ea07c36b484fcc
diff --git a/src/Grid/grid_dist_id.hpp b/src/Grid/grid_dist_id.hpp
index 0d24f5184..e788fe9cc 100644
--- a/src/Grid/grid_dist_id.hpp
+++ b/src/Grid/grid_dist_id.hpp
@@ -103,6 +103,28 @@ struct copy_all_prop_sparse
 	}
 };
 
+template<typename T>
+struct variadic_caller
+{};
+
+template<int ... prp>
+struct variadic_caller<index_tuple_sq<prp ...>>
+{
+	template<typename this_type, typename dec_type, typename cd_sm_type, typename loc_grid_type, typename gdb_ext_type>
+	static void call(this_type * this_,
+					 dec_type & dec, 
+					 cd_sm_type & cd_sm,
+					 loc_grid_type & loc_grid, 
+					 loc_grid_type & loc_grid_old, 
+					 gdb_ext_type & gdb_ext, 
+					 gdb_ext_type & gdb_ext_old, 
+					 gdb_ext_type & gdb_ext_global, 
+					 size_t opt)
+	{
+		this_->template map_<prp ...>(dec,cd_sm,loc_grid,loc_grid_old,gdb_ext,gdb_ext_old,gdb_ext_global,opt);
+	}
+};
+
 struct ids_pl
 {
 	size_t id;
@@ -133,61 +155,10 @@ struct device_grid_copy<true>
 		boost::mpl::for_each_ref< boost::mpl::range_c<int,0,decltype(block_data_dst)::max_prop> >(cp);
 	}
 
-/*
- *
- *
-		auto & v_cl = create_vcluster();
-		auto & dg = loc_grid.get(0);
-
-		typedef typename std::remove_reference<decltype(dg)>::type sparse_grid_type;
 
-		grid_key_dx<sparse_grid_type::dims> kp1 = gdb_ext.get(0).Dbox.getKP1();
-
-		for (int j = 0 ; j < std::ceil(gdb_ext_old.size() / 32) + 1 ; j++)
-		{
-			int jbase = j*32;
-                        int sz = gdb_ext_old.size() - jbase;
-			if (sz < 0)	{break;}
-			else if (sz > 32) {sz = 32;}
-			dg.copyRemoveReset();
-
-			std::cout << "JBASE: " << jbase << " SZ: " << sz << std::endl;
-
-                	for (int i = 0 ; i < sz ; i++)
-                	{
-				Box<sparse_grid_type::dims,long int> bx_dst;
-				Box<sparse_grid_type::dims,long int> bx_src;
-                        	for (int k = 0 ; k < sparse_grid_type::dims ; k++)
-                        	{
-					bx_dst.setLow(k,kp1.get(k) + gdb_ext_old.get(i+jbase).origin.get(k) + gdb_ext_old.get(i+jbase).Dbox.getKP1().get(k));
-					bx_dst.setHigh(k,kp1.get(k) + gdb_ext_old.get(i+jbase).origin.get(k) + gdb_ext_old.get(i+jbase).Dbox.getKP2().get(k));
-
-                                        bx_src.setLow(k,gdb_ext_old.get(i+jbase).Dbox.getKP1().get(k));
-                                        bx_src.setHigh(k,gdb_ext_old.get(i+jbase).Dbox.getKP2().get(k));
-                        	}
-
-				std::cout << jbase <<  "  " << i << "  SRC:" << gdb_ext_old.get(i+jbase).Dbox.toString() << "  DST: " << bx_dst.toString() << "  INDEX BUFFER: " << loc_grid_old.get(jbase + i).private_get_index_array().size() << std::endl;
-				loc_grid_old.get(jbase + i).template hostToDevice<0>();
-				dg.copy_to(loc_grid_old.get(jbase + i),gdb_ext_old.get(jbase +i).Dbox,bx_dst);
-			}
 
-			for (int i = 0 ; i < sz ; i++)
-                        {
-				std::cout << "FINALIZE1" << std::endl;
-				loc_grid_old.get(jbase + i).template removeCopyToFinalize<0>(v_cl.getmgpuContext(), rem_copy_opt::PHASE1);
-				std::cout << "FINALIZE2" << std::endl;
-				loc_grid_old.get(jbase + i).template removeCopyToFinalize<0>(v_cl.getmgpuContext(), rem_copy_opt::PHASE2);
-				std::cout << "FINALIZE3" << std::endl;
-				loc_grid_old.get(jbase + i).template removeCopyToFinalize<0>(v_cl.getmgpuContext(), rem_copy_opt::PHASE3);
-			}
-		}
-
- *
- *
- */
-
-        template<typename gdb_ext_type, typename loc_grid_type>
-        static void pre_load(gdb_ext_type & gdb_ext_old, loc_grid_type & loc_grid_old, gdb_ext_type & gdb_ext, loc_grid_type & loc_grid)
+    template<typename gdb_ext_type, typename loc_grid_type>
+    static void pre_load(gdb_ext_type & gdb_ext_old, loc_grid_type & loc_grid_old, gdb_ext_type & gdb_ext, loc_grid_type & loc_grid)
 	{
 		auto & dg = loc_grid.get(0);
 		auto dlin = dg.getGrid();
@@ -3475,7 +3446,12 @@ public:
 
 		getGlobalGridsInfo(gdb_ext_global);
 
-		this->map_(dec,cd_sm,loc_grid,loc_grid_old,gdb_ext,gdb_ext_old,gdb_ext_global);
+
+		typedef typename to_int_sequence<0,T::max_prop-1>::type result;
+
+		//variadic_caller<result>::call(this,dec,cd_sm,loc_grid,loc_grid_old,gdb_ext,gdb_ext_old,gdb_ext_global,opt);
+
+		this->template map_<0>(dec,cd_sm,loc_grid,loc_grid_old,gdb_ext,gdb_ext_old,gdb_ext_global,opt);
 
 		loc_grid_old.clear();
 		loc_grid_old.shrink_to_fit();
@@ -3513,8 +3489,14 @@ public:
 
 		if (v_cl.size() != 1)
 		{
+			// move information from host to device
+			for (int i = 0 ; i < loc_grid_old.size() ; i++)
+			{loc_grid_old.get(i).template hostToDevice<0>();}
 			// Map the distributed grid
-                	map(NO_GDB_EXT_SWITCH);
+			size_t opt_ = NO_GDB_EXT_SWITCH;
+			if (std::is_same<Memory,CudaMemory>::value == true)
+			{opt_ |= RUN_ON_DEVICE;}
+            map(opt_);
 		}
 		else
 		{
@@ -3544,6 +3526,8 @@ public:
 
 						++it_src;
 					}
+
+					dg.template hostToDevice<0>();
 			}
 		}
 	}
diff --git a/src/Grid/grid_dist_id_comm.hpp b/src/Grid/grid_dist_id_comm.hpp
index d9312ef5e..9be3e3b8d 100644
--- a/src/Grid/grid_dist_id_comm.hpp
+++ b/src/Grid/grid_dist_id_comm.hpp
@@ -13,6 +13,7 @@
 #include "grid_dist_util.hpp"
 #include "util/common_pdata.hpp"
 #include "lib/pdata.hpp"
+#include "Grid/grid_common.hpp"
 
 
 /*! \brief Unpack selector
@@ -166,6 +167,7 @@ class grid_dist_id_comm
 	struct rp_id
 	{
 		int p_id;
+		int size;
 		int i;
 
 		bool operator<(const rp_id & tmp) const
@@ -182,6 +184,7 @@ class grid_dist_id_comm
 	//! first id is grid
 	//! second id is the processor id
 	openfpm::vector<openfpm::vector<aggregate<device_grid,SpaceBox<dim,long int>>>> m_oGrid;
+	openfpm::vector<int> m_oGrid_c;
 
 	//! Memory for the ghost sending buffer
 	Memory g_send_prp_mem;
@@ -393,10 +396,13 @@ class grid_dist_id_comm
 		gd->recv_buffers.last().resize(msg_i);
 		gd->recv_proc.add();
 		gd->recv_proc.last().p_id = i;
+		gd->recv_proc.last().size = msg_i;
 		gd->recv_proc.last().i = gd->recv_proc.size()-1;
 
 		if (gd->opt & RUN_ON_DEVICE)
-		{return gd->recv_buffers.last().getDevicePointer();}
+		{
+			return gd->recv_buffers.last().getDevicePointer();
+		}
 
 		return gd->recv_buffers.last().getPointer();
 	}
@@ -935,6 +941,26 @@ class grid_dist_id_comm
 		}
 	}
 
+	int find_local_sub(Box<dim, long int> & box_dst, openfpm::vector<GBoxes<device_grid::dims>> & gdb_ext)
+	{
+		Point<dim,long int> point;
+		for (size_t n = 0; n < dim; n++)
+		{point.get(n) = (box_dst.getHigh(n) + box_dst.getLow(n))/2;}
+
+		for (size_t j = 0; j < gdb_ext.size(); j++)
+		{
+			// Local sub-domain
+			SpaceBox<dim,long int> sub = gdb_ext.get(j).Dbox;
+			sub += gdb_ext.get(j).origin;
+
+			if (sub.isInside(point) == true)
+			{
+				return j;
+			}
+		}
+		return -1;
+	}
+
 public:
 
 	/*! \brief Reconstruct the local grids
@@ -999,6 +1025,30 @@ public:
 				}
 			}
 		}
+
+		std::cout << "UNPACKING " << std::endl;
+
+		for (size_t i = 0 ; i < m_oGrid_recv.size() ; i++)
+		{
+			for (size_t j = 0 ; j < m_oGrid_recv.get(i).size() ; j++)
+			{
+				m_oGrid_recv.get(i).template get<0>(j).template deviceToHost<0>();
+				std::cout << "UNPACKING POINTS: " << m_oGrid_recv.get(i).template get<0>(j).size() << std::endl;
+				m_oGrid_recv.get(i).template get<0>(j).template removeCopyToFinalize<0>(v_cl.getmgpuContext(), rem_copy_opt::PHASE1);
+			}
+		}
+
+		for (size_t i = 0 ; i < m_oGrid_recv.size() ; i++)
+		{
+			for (size_t j = 0 ; j < m_oGrid_recv.get(i).size() ; j++)
+			{m_oGrid_recv.get(i).template get<0>(j).template removeCopyToFinalize<0>(v_cl.getmgpuContext(), rem_copy_opt::PHASE2);}
+		}
+
+		for (size_t i = 0 ; i < m_oGrid_recv.size() ; i++)
+		{
+			for (size_t j = 0 ; j < m_oGrid_recv.get(i).size() ; j++)
+			{m_oGrid_recv.get(i).template get<0>(j).template removeCopyToFinalize<0>(v_cl.getmgpuContext(), rem_copy_opt::PHASE3);}
+		}
 	}
 
 	/*! \brief Label intersection grids for mappings
@@ -1013,19 +1063,66 @@ public:
 	 * \param prc_sz For each processor the number of grids to send to
 	 *
 	 */
-	inline void labelIntersectionGridsProcessor(Decomposition & dec,
+	template<typename lambda_t>
+	inline void labelIntersectionGridsProcessor_and_pack(Decomposition & dec,
 												CellDecomposer_sm<dim,St,shift<dim,St>> & cd_sm,
 												openfpm::vector<device_grid> & loc_grid_old,
 												openfpm::vector<GBoxes<device_grid::dims>> & gdb_ext,
 												openfpm::vector<GBoxes<device_grid::dims>> & gdb_ext_old,
 												openfpm::vector<GBoxes<device_grid::dims>> & gdb_ext_global,
-												openfpm::vector<openfpm::vector<aggregate<device_grid,SpaceBox<dim,long int>>>> & lbl_b,
-												openfpm::vector<size_t> & prc_sz)
+												size_t p_id_cur,
+												lambda_t f)
 	{
-		lbl_b.clear();
+		// lbl_bc.clear();
+		// lbl_bc.resize(v_cl.getProcessingUnits());
+
+		// for (int i = 0 ; i < lbl_bc.size() ; i++) 
+		// {lbl_bc.get(i) = 0;}
+
+		// // count
+
+		// for (size_t i = 0; i < gdb_ext_old.size(); i++)
+		// {
+		// 	// Local old sub-domain in global coordinates
+		// 	SpaceBox<dim,long int> sub_dom = gdb_ext_old.get(i).Dbox;
+		// 	sub_dom += gdb_ext_old.get(i).origin;
+
+		// 	for (size_t j = 0; j < gdb_ext_global.size(); j++)
+		// 	{
+		// 		size_t p_id = 0;
+
+		// 		// Intersection box
+		// 		SpaceBox<dim,long int> inte_box;
+
+		// 		// Global new sub-domain in global coordinates
+		// 		SpaceBox<dim,long int> sub_dom_new = gdb_ext_global.get(j).Dbox;
+		// 		sub_dom_new += gdb_ext_global.get(j).origin;
+
+		// 		bool intersect = false;
+
+		// 		if (sub_dom.isValid() == true && sub_dom_new.isValid() == true)
+		// 			intersect = sub_dom.Intersect(sub_dom_new, inte_box);
+
+		// 		if (intersect == true)
+		// 		{
+		// 			auto inte_box_cont = cd_sm.convertCellUnitsIntoDomainSpace(inte_box);
+
+		// 			// Get processor ID that store intersection box
+		// 			Point<dim,St> p;
+		// 			for (size_t n = 0; n < dim; n++)
+		// 				p.get(n) = (inte_box_cont.getHigh(n) + inte_box_cont.getLow(n))/2;
+
+		// 			p_id = dec.processorID(p);
+
+		// 			lbl_bc.get(p_id) += 1;
+		// 		}
+		// 	}
+		// }
+
+		// // reserve
+		// for (int i = 0 ; i < lbl_b.size() ; i++) 
+		// {lbl_b.get(i).reserve(lbl_bc.get(i));}
 
-		// resize the label buffer
-		lbl_b.resize(v_cl.getProcessingUnits());
 
 		// Label all the intersection grids with the processor id where they should go
 
@@ -1061,7 +1158,9 @@ public:
 						p.get(n) = (inte_box_cont.getHigh(n) + inte_box_cont.getLow(n))/2;
 
 					p_id = dec.processorID(p);
-					prc_sz.get(p_id)++;
+					if (p_id != p_id_cur)
+					{continue;}
+//					prc_sz.get(p_id)++;
 
 					// Transform coordinates to local
 					auto inte_box_local = inte_box;
@@ -1080,8 +1179,12 @@ public:
 					}
 
 					// Grid to send
-					device_grid gr_send(sz);
-					gr_send.setMemory();
+					//device_grid gr_send(sz);
+					//gr_send.setMemory();
+					// lbl_b.get(p_id).add();
+					// device_grid & gr_send = lbl_b.get(p_id).last().template get<0>();
+					// SpaceBox<dim,long int> & box_send = lbl_b.get(p_id).last().template get<1>();
+					// gr_send.setMemory();
 
 					// Sub iterator across intersection box inside local grid
 					grid_key_dx<dim> start = inte_box_local.getKP1();
@@ -1094,22 +1197,76 @@ public:
 					{
 						box_src.setLow(i,start.get(i));
 						box_src.setHigh(i,stop.get(i));
-						box_dst.setLow(i,0);
-						box_dst.setHigh(i,stop.get(i)-start.get(i));
+						box_dst.setLow(i,inte_box.getLow(i));
+						box_dst.setHigh(i,inte_box.getHigh(i));
 					}
 
-					gr_send.copy_to(gr,box_src,box_dst);
+					f(box_src,box_dst,gr,p_id);
+				}
+			}
+		}
 
-					aggregate<device_grid,SpaceBox<dim,long int>> aggr;
+/*		for (size_t i = 0 ; i < loc_grid_old.size() ; i++)
+		{
+			loc_grid_old.get(i).template removeCopyToFinalize<0>(v_cl.getmgpuContext(), rem_copy_opt::PHASE1);
+		}
 
-					aggr.template get<0>() = gr_send;
-					aggr.template get<1>() = inte_box;
+		for (size_t i = 0 ; i < loc_grid_old.size() ; i++)
+		{
+			loc_grid_old.get(i).template removeCopyToFinalize<0>(v_cl.getmgpuContext(), rem_copy_opt::PHASE2);
+		}
 
-					// Add to the labeling vector
-					lbl_b.get(p_id).add(aggr);
-				}
+		for (size_t i = 0 ; i < loc_grid_old.size() ; i++)
+		{
+			loc_grid_old.get(i).template removeCopyToFinalize<0>(v_cl.getmgpuContext(), rem_copy_opt::PHASE3);
+		}*/
+	}
+
+	/*! \brief Unpack 
+	 *
+	 *
+	 * 
+	 */
+	template<int ... prp>
+	void unpack_buffer_to_local_grid(openfpm::vector<device_grid> & loc_grid,
+									 openfpm::vector<GBoxes<device_grid::dims>> & gdb_ext,
+									 ExtPreAlloc<Memory> & send_buffer,
+									 size_t sz)
+	{
+		// unpack local
+		Unpack_stat ps;
+
+		while (ps.getOffset() < sz)
+		{
+			send_buffer.reset();
+
+			Box<dim,long int> box_dst;
+			send_buffer.deviceToHost(ps.getOffset(),ps.getOffset()+sizeof(Box<dim,long int>));
+			Unpacker<Box<dim,long int>,Memory>::unpack(send_buffer,box_dst,ps);
+
+			std::cout << "BOX DEST: " << box_dst.toString() << " ||  " << ps.getOffset() << "  " << sz << std::endl;
+			int s = find_local_sub(box_dst,gdb_ext);
+			if (s == -1)
+			{std::cout << __FILE__ << ":" << __LINE__ << " map, error non-local subdomain " << std::endl;}
+
+			// convert box_dst to local
+			for (int d = 0 ; d < dim ; d++ )
+			{
+				box_dst.setLow(d, box_dst.getLow(d) - gdb_ext.get(s).origin.get(d));
+				box_dst.setHigh(d, box_dst.getHigh(d) - gdb_ext.get(s).origin.get(d));
 			}
+			
+			loc_grid.get(s).remove(box_dst);
+			auto sub2 = loc_grid.get(s).getIterator(box_dst.getKP1(),box_dst.getKP2(),0);
+			Unpacker<device_grid,Memory>::template unpack<decltype(sub2),decltype(v_cl.getmgpuContext()),prp ...>(send_buffer,sub2,loc_grid.get(s),ps,v_cl.getmgpuContext(),NONE_OPT);
+
+			std::cout << "END OFFSET" << ps.getOffset() << " " << sz << std::endl;
 		}
+
+		std::cout << "FINISHING UNPACK" << std::endl;
+
+		for (int s = 0 ; s < loc_grid.size() ; s++)
+		{loc_grid.get(s).template removeAddUnpackFinalize<prp ...>(v_cl.getmgpuContext(),0);}
 	}
 
 	/*! \brief Moves all the grids that does not belong to the local processor to the respective processor
@@ -1125,54 +1282,148 @@ public:
 	 * \param gdb_ext_global it contain the decomposition at global level
 	 *
 	 */
+	template<int ... prp>
 	void map_(Decomposition & dec,
 			  CellDecomposer_sm<dim,St,shift<dim,St>> & cd_sm,
 			  openfpm::vector<device_grid> & loc_grid,
 			  openfpm::vector<device_grid> & loc_grid_old,
 			  openfpm::vector<GBoxes<device_grid::dims>> & gdb_ext,
 			  openfpm::vector<GBoxes<device_grid::dims>> & gdb_ext_old,
-			  openfpm::vector<GBoxes<device_grid::dims>> & gdb_ext_global)
+			  openfpm::vector<GBoxes<device_grid::dims>> & gdb_ext_global,
+			  size_t opt)
 	{
-		// Processor communication size
-		openfpm::vector<size_t> prc_sz(v_cl.getProcessingUnits());
+		this->opt = opt;
+
+		openfpm::vector<size_t> send_buffer_sizes(v_cl.getProcessingUnits());
+		openfpm::vector<Memory> send_buffers_;
+		openfpm::vector<ExtPreAlloc<Memory>> send_buffers;
+		send_buffers_.resize(v_cl.getProcessingUnits());
+		send_buffers.resize(v_cl.getProcessingUnits());
+
+		send_prc_queue.clear();
+		send_pointer.clear();
+		send_size.clear();
+
+		for (int p_id = 0 ; p_id < v_cl.getProcessingUnits() ; p_id++)
+		{
+			for (int i = 0 ; i < loc_grid_old.size() ; i++)
+			{loc_grid_old.get(i).packReset();}
+
+			auto l = [&](Box<dim,long int> & box_src,
+						Box<dim,long int> & box_dst,
+						device_grid & gr,
+						size_t p_id){
+						//gr_send.copy_to(gr,box_src,box_dst);
+
+						Packer<SpaceBox<dim,long int>,BMemory<Memory>>::packRequest(box_dst,send_buffer_sizes.get(p_id));
+
+						auto sub_it = gr.getIterator(box_src.getKP1(),box_src.getKP2(),0);
+						gr.template packRequest<prp ...>(sub_it,send_buffer_sizes.get(p_id));
+
+						//box_send = inte_box;
+			};
+
+			// Contains the processor id of each box (basically where they have to go)
+			labelIntersectionGridsProcessor_and_pack(dec,cd_sm,loc_grid_old,gdb_ext,gdb_ext_old,gdb_ext_global,p_id,l);
+
+			for (int i = 0 ; i < loc_grid_old.size(); i++)
+			{
+				loc_grid_old.get(i).template packCalculate<prp ...>(send_buffer_sizes.get(p_id),v_cl.getmgpuContext());
+			}
+
+			std::cout << "Buffer " << p_id << "  " << send_buffer_sizes.get(p_id) << std::endl;
+
+			send_buffers_.get(p_id).resize(send_buffer_sizes.get(p_id));
+			send_buffers.get(p_id).setMemory(send_buffer_sizes.get(p_id),send_buffers_.get(p_id));
+			send_buffers.get(p_id).incRef();
+
+			// we now pack
+			Pack_stat sts;
+
+			auto lp = [&](Box<dim,long int> & box_src,
+						Box<dim,long int> & box_dst,
+						device_grid & gr,
+						size_t p_id){
+
+						// Print box_dst
+						std::cout << "Box dst: " << box_dst.toString() << std::endl;
+
+						size_t offset = send_buffers.get(p_id).getOffsetEnd();
+						Packer<Box<dim,long int>,Memory>::pack(send_buffers.get(p_id),box_dst,sts);
+						size_t offset2 = send_buffers.get(p_id).getOffsetEnd();
 
-		// Contains the processor id of each box (basically where they have to go)
-		labelIntersectionGridsProcessor(dec,cd_sm,loc_grid_old,gdb_ext,gdb_ext_old,gdb_ext_global,m_oGrid,prc_sz);
+						send_buffers.get(p_id).hostToDevice(offset,offset2);
 
-		// Calculate the sending buffer size for each processor, put this information in
-		// a contiguous buffer
-		p_map_req.resize(v_cl.getProcessingUnits());
+						auto sub_it = gr.getIterator(box_src.getKP1(),box_src.getKP2(),0);
 
-		// Vector of number of sending grids for each involved processor
-		openfpm::vector<size_t> prc_sz_r;
-		// Vector of ranks of involved processors
-		openfpm::vector<size_t> prc_r;
+						Packer<device_grid,Memory>::template pack<decltype(sub_it),prp ...>(send_buffers.get(p_id),gr,sub_it,sts);
+
+						std::cout << "SB: " << send_buffers.get(p_id).getOffsetEnd() << std::endl;
+			};
+
+			// Contains the processor id of each box (basically where they have to go)
+			labelIntersectionGridsProcessor_and_pack(dec,cd_sm,loc_grid_old,gdb_ext,gdb_ext_old,gdb_ext_global,p_id,lp);
+
+			for (int i = 0 ; i < loc_grid_old.size() ; i++)
+			{
+				loc_grid_old.get(i).template packFinalize<prp ...>(send_buffers.get(p_id),sts,0,false);
+			}
+		}
 
-		for (size_t i = 0; i < v_cl.getProcessingUnits(); i++)
+		// std::cout << "Local buffer: " << send_buffers.get(v_cl.rank()).size() << std::endl;
+		// int sz = send_buffers.get(v_cl.rank()).size();
+		//send_buffers.get(v_cl.rank()).reset();
+
+		// // Print all the byte in send_buffers_
+		// for (int j = 0 ; j < 16 && j < sz ; j++) {
+		// 	std::cout << "Local buffer " << v_cl.rank() << " " << ((long int *)send_buffers.get(v_cl.rank()).getPointer())[j] << " " << &((long int *)send_buffers.get(v_cl.rank()).getPointer())[j] << std::endl;
+		// }
+
+		unpack_buffer_to_local_grid(loc_grid,gdb_ext,send_buffers.get(v_cl.rank()),send_buffers.get(v_cl.rank()).size());
+
+		//openfpm::vector<void *> send_pointer;
+		//openfpm::vector<int> send_size;
+		for (int i = 0 ; i < send_buffers.size() ; i++)
 		{
-			if (m_oGrid.get(i).size() != 0)
+			if (i != v_cl.rank())
 			{
-				p_map_req.get(i) = prc_r.size();
-				prc_r.add(i);
-				prc_sz_r.add(m_oGrid.get(i).size());
+				send_pointer.add(send_buffers_.get(i).getDevicePointer());
+				send_size.add(send_buffers_.get(i).size());
+				send_prc_queue.add(i);
+
+				// Print sending buffer size
+				std::cout << "Sending to " << i << " " << send_buffers_.get(i).size() << " bytes" << std::endl;
 			}
 		}
 
-		decltype(m_oGrid) m_oGrid_new;
-		for (size_t i = 0; i < v_cl.getProcessingUnits(); i++)
+		size_t * send_size_ptr = NULL;
+		size_t * send_prc_queue_ptr = NULL;
+		void ** send_pointer_ptr = NULL;
+
+		if (send_size.size() != 0)
 		{
-			if (m_oGrid.get(i).size() != 0)
-			{m_oGrid_new.add(m_oGrid.get(i));}
+			send_size_ptr = &send_size.get(0);
+			send_pointer_ptr = &send_pointer.get(0);
+			send_prc_queue_ptr = &send_prc_queue.get(0);
 		}
 
-		// Vector for receiving of intersection grids
-		openfpm::vector<openfpm::vector<aggregate<device_grid,SpaceBox<dim,long int>>>> m_oGrid_recv;
+		recv_buffers.clear();
+		recv_proc.clear();
+
+		v_cl.sendrecvMultipleMessagesNBX(send_pointer.size(),send_size_ptr,
+											 send_prc_queue_ptr,send_pointer_ptr,
+											 receive_dynamic,this);
+
 
-		// Send and recieve intersection grids
-		v_cl.SSendRecv(m_oGrid_new,m_oGrid_recv,prc_r,prc_recv_map,recv_sz_map);
+		for (int i = 0 ; i < recv_buffers.size() ; i++)
+		{
+			ExtPreAlloc<Memory> prAlloc_;
+			prAlloc_.setMemory(recv_buffers.get(i).size(),recv_buffers.get(i));
+			unpack_buffer_to_local_grid<prp ...>(loc_grid,gdb_ext,prAlloc_,recv_proc.get(i).size);
+		}
 
-		// Reconstruct the new local grids
-		grids_reconstruct(m_oGrid_recv,loc_grid,gdb_ext,cd_sm);
+		for (int i = 0 ; i < send_buffers.size() ; i++)
+		{send_buffers.get(i).decRef();}
 	}
 
 	/*! \brief It fill the ghost part of the grids
diff --git a/src/Grid/tests/grid_dist_id_dlb_unit_test.cpp b/src/Grid/tests/grid_dist_id_dlb_unit_test.cpp
index bad7d6e1b..20ab1d307 100644
--- a/src/Grid/tests/grid_dist_id_dlb_unit_test.cpp
+++ b/src/Grid/tests/grid_dist_id_dlb_unit_test.cpp
@@ -102,6 +102,12 @@ void test_vector_grid_dlb()
 			check &= gdist.template get<1>(p) == gkey.get(1);
 			check &= gdist.template get<2>(p) == gkey.get(2);
 
+			if (check == false)
+			{
+				std::cout << "Error " << gdist.template get<0>(p) << " != " << gkey.get(0) << " " << gdist.template get<1>(p) << " != " << gkey.get(1) << " " << gdist.template get<2>(p) << " != " << gkey.get(2) << std::endl;
+				break;
+			}
+
 			++it;
 		}
 
diff --git a/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu b/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
index 830ca9bdb..e80a49d3f 100644
--- a/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
+++ b/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
@@ -190,6 +190,7 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_output )
 template<typename grid, typename box_type>
 void check_sgrid(grid & gdist2, box_type & box, float c)
 {
+	size_t n_point = 0;
 	bool match = true;
 	auto it2 = gdist2.getDomainIterator();
 
@@ -216,7 +217,7 @@ void check_sgrid(grid & gdist2, box_type & box, float c)
 			if (match == false)
 			{
 				std::cout << gdist2.template get<0>(p_xp1) << "   " << c + key_xp1.get(0) + key_xp1.get(1) << std::endl;
-				std::cout << "1 " << p_xp1.getKey().toPoint().to_string() << " " << &gdist2.template get<0>(p_xp1) << std::endl;
+				std::cout << "1 " << key_xp1.to_string() << "   " << p_xp1.getKey().toPoint().to_string() << " " << &gdist2.template get<0>(p_xp1) << std::endl;
 				break;
 			}
 		}
@@ -257,10 +258,87 @@ void check_sgrid(grid & gdist2, box_type & box, float c)
 			}
 		}
 
+		n_point++;
+
+		++it2;
+	}
+
+	auto & v_cl = create_vcluster();
+
+	v_cl.sum(n_point);
+	v_cl.execute();
+
+	BOOST_REQUIRE_EQUAL(match,true);
+	BOOST_REQUIRE_EQUAL(n_point,350*350);
+}
+
+template<typename grid, typename box_type>
+void check_sgrid_no_ghost(grid & gdist2, box_type & box, float c)
+{
+	size_t n_point = 0;
+	bool match = true;
+	auto it2 = gdist2.getDomainIterator();
+
+	while (it2.isNext())
+	{
+		auto p = it2.get();
+
+		auto key = it2.getGKey(p);
+
+		match &= gdist2.template get<0>(p) == c + key.get(0) + key.get(1);
+
+		if (match == false)
+		{
+			std::cout << gdist2.template get<0>(p) << "   " << c + key.get(0) + key.get(1) << std::endl;
+			std::cout << "1 " << key.to_string() << "   " << p.getKey().toPoint().to_string() << " " << &gdist2.template get<0>(p) << std::endl;
+			break;
+		}
+
+		n_point++;
+
 		++it2;
 	}
 
+	auto & v_cl = create_vcluster();
+
+	v_cl.sum(n_point);
+	v_cl.execute();
+
 	BOOST_REQUIRE_EQUAL(match,true);
+	BOOST_REQUIRE_EQUAL(n_point,350*350);
+}
+
+BOOST_AUTO_TEST_CASE( sgrid_gpu_test_load_from_file )
+{
+	auto & v_cl = create_vcluster();
+
+	float c = 5.0;
+
+	if (v_cl.size() > 8){return;}
+
+	size_t sz[2] = {370,370};
+	periodicity<2> bc = {PERIODIC,PERIODIC};
+
+	Ghost<2,long int> g(1);
+
+	Box<2,float> domain({0.0,0.0},{1.0,1.0});
+
+	/////// GPU insert + flush
+
+	Box<2,size_t> box({1,1},{350,350});
+
+	// Now load
+
+	sgrid_dist_id_gpu<2,float,aggregate<float,float>> gdist2(sz,domain,g,bc);
+
+	gdist2.load("test_data/sgrid_gpu_output_hdf5");
+	gdist2.deviceToHost<0,1>();
+	check_sgrid_no_ghost(gdist2,box,c);
+	gdist2.template hostToDevice<0>();
+	gdist2.template ghost_get<0,1>(RUN_ON_DEVICE);
+
+	gdist2.deviceToHost<0,1>();
+	check_sgrid(gdist2,box,c);
 }
 
 BOOST_AUTO_TEST_CASE( sgrid_gpu_test_save_and_load )
@@ -291,7 +369,7 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_save_and_load )
 
 	float c = 5.0;
 
-	gdist.addPoints([] __device__ (int i, int j)
+	gdist.addPoints(box.getKP1(),box.getKP2(),[] __device__ (int i, int j)
 			        {
 						return true;
 			        },
@@ -305,98 +383,22 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_save_and_load )
 	gdist.template flush<smax_<0>,smax_<1>>(flush_type::FLUSH_ON_DEVICE);
 
 	gdist.template deviceToHost<0>();
-
 	gdist.save("sgrid_gpu_output_hdf5");
-	check_sgrid(gdist,box,c);
+	gdist.write("sgrid_conf");
 
 	// Now load
 
 	sgrid_dist_id_gpu<2,float,aggregate<float,float>> gdist2(sz,domain,g,bc);
 
 	gdist2.load("sgrid_gpu_output_hdf5");
-	check_sgrid(gdist2,box,c);
+	gdist2.deviceToHost<0,1>();
+	check_sgrid_no_ghost(gdist2,box,c);
 	gdist2.template hostToDevice<0>();
-
 	gdist2.template ghost_get<0,1>(RUN_ON_DEVICE);
 
 	gdist2.deviceToHost<0,1>();
 	gdist.deviceToHost<0,1>();
 	check_sgrid(gdist2,box,c);
-
-/*	bool match = true;
-
-
-	auto it2 = gdist2.getDomainIterator();
-
-	while (it2.isNext())
-	{
-		auto p = it2.get();
-
-		auto key = it2.getGKey(p);
-
-		auto p_xp1 = p.move(0,1);
-		auto p_xm1 = p.move(0,-1);
-		auto p_yp1 = p.move(1,1);
-		auto p_ym1 = p.move(1,-1);
-
-		auto key_xp1 = key.move(0,1);
-		auto key_xm1 = key.move(0,-1);
-		auto key_yp1 = key.move(1,1);
-		auto key_ym1 = key.move(1,-1);
-
-		if (box.isInside(key_xp1.toPoint()))
-		{
-			match &= gdist.template get<0>(p_xp1) == c + key_xp1.get(0) + key_xp1.get(1);
-
-			if (match == false)
-			{
-				std::cout << gdist.template get<0>(p_xp1) << "   " << c + key_xp1.get(0) + key_xp1.get(1) << std::endl;
-				std::cout << key_xp1.to_string() << std::endl;
-				break;
-			}
-		}
-
-		if (box.isInside(key_xm1.toPoint()))
-		{
-			match &= gdist.template get<0>(p_xm1) == c + key_xm1.get(0) + key_xm1.get(1);
-
-			if (match == false)
-			{
-				std::cout << gdist.template get<0>(p_xm1) << "   " << c + key_xm1.get(0) + key_xm1.get(1) << std::endl;
-				std::cout << key_xm1.to_string() << std::endl;
-				break;
-			}
-		}
-
-		if (box.isInside(key_yp1.toPoint()))
-		{
-			match &= gdist.template get<0>(p_yp1) == c + key_yp1.get(0) + key_yp1.get(1);
-
-			if (match == false)
-			{
-				std::cout << gdist.template get<0>(p_yp1) << "   " << c + key_yp1.get(0) + key_yp1.get(1) << std::endl;
-				std::cout << key_yp1.to_string() << std::endl;
-				break;
-			}
-		}
-
-		if (box.isInside(key_ym1.toPoint()))
-		{
-			match &= gdist.template get<0>(p_ym1) == c + key_ym1.get(0) + key_ym1.get(1);
-
-			if (match == false)
-			{
-				std::cout << gdist.template get<0>(p_ym1) << "   " << c + key_ym1.get(0) + key_ym1.get(1) << std::endl;
-				std::cout << key_ym1.to_string() << std::endl;
-				break;
-			}
-		}
-
-		++it2;
-	}
-
-
-	BOOST_REQUIRE_EQUAL(match,true);*/
 }
 
 void sgrid_ghost_get(size_t (& sz)[2],size_t (& sz2)[2])
-- 
GitLab