From 97cce61f3ffcd1909f2c2ff0ef9dca6009e5fae0 Mon Sep 17 00:00:00 2001
From: Pietro Incardona <i-bird@localhost.localdomain>
Date: Thu, 14 Jul 2016 01:58:27 +0200
Subject: [PATCH] Fixing ghost unbound with dirty

---
 CHANGELOG.md                                  |   1 +
 openfpm_data                                  |   2 +-
 openfpm_devices                               |   2 +-
 script/install_LIBHILBERT.sh                  |  19 ++
 src/Decomposition/CartDecomposition.hpp       |  34 +--
 src/Decomposition/common.hpp                  |  20 +-
 src/Decomposition/ie_ghost.hpp                |  16 ++
 src/Graph/dist_map_graph.hpp                  |  16 +-
 src/Graph/dist_map_graph_unit_test.hpp        |   2 +-
 src/Grid/grid_dist_id.hpp                     | 177 +++++++++++-
 src/Grid/grid_dist_id_unit_test.hpp           |  23 ++
 src/Grid/grid_dist_id_unit_test_unb_ghost.hpp | 255 ++++++++++++++++++
 src/dec_optimizer.hpp                         |  78 ++++--
 src/dec_optimizer_unit_test.hpp               |  11 +-
 14 files changed, 577 insertions(+), 79 deletions(-)
 create mode 100755 script/install_LIBHILBERT.sh
 create mode 100644 src/Grid/grid_dist_id_unit_test_unb_ghost.hpp

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7432543a..562756fe 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@ All notable changes to this project will be documented in this file.
 - Hilbert curve data and computation reordering for cache friendliness
 
 ### Fixed
+- Fixed a crash for small grids with a large number of processors
 
 ### Changed
 
diff --git a/openfpm_data b/openfpm_data
index 9d44c008..dfe00860 160000
--- a/openfpm_data
+++ b/openfpm_data
@@ -1 +1 @@
-Subproject commit 9d44c008f04e855666b1ea15d02e681f2bfcf2e9
+Subproject commit dfe008605b9167afd8339078cba5c779a38c376b
diff --git a/openfpm_devices b/openfpm_devices
index efd6339f..5868264d 160000
--- a/openfpm_devices
+++ b/openfpm_devices
@@ -1 +1 @@
-Subproject commit efd6339ff5d5d73528b07771ffde668787dee826
+Subproject commit 5868264de4c5d9a684b48f492efbfd26798f1d18
diff --git a/script/install_LIBHILBERT.sh b/script/install_LIBHILBERT.sh
new file mode 100755
index 00000000..d1562c15
--- /dev/null
+++ b/script/install_LIBHILBERT.sh
@@ -0,0 +1,19 @@
+#! /bin/bash
+
+# check if the directory $1/LIBHILBERT exists
+
+if [ -d "$1/LIBHILBERT" ]; then
+  echo "LIBHILBERT already installed"
+  exit 0
+fi
+
+wget http://ppmcore.mpi-cbg.de/upload/libhilbert-master.zip
+rm -rf libhilbert-master
+unzip libhilbert-master.zip
+cd libhilbert-master
+mkdir build
+cd build
+cmake -DCMAKE_INSTALL_PREFIX:PATH=$1/LIBHILBERT ..
+make all
+make install
+
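
For reference, the new script takes the installation prefix as its only argument and skips the build entirely when $1/LIBHILBERT already exists; a typical invocation would be something like script/install_LIBHILBERT.sh $HOME/openfpm_dependencies (the prefix path here is only an example).
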
diff --git a/src/Decomposition/CartDecomposition.hpp b/src/Decomposition/CartDecomposition.hpp
index ded5a478..212408c8 100755
--- a/src/Decomposition/CartDecomposition.hpp
+++ b/src/Decomposition/CartDecomposition.hpp
@@ -243,8 +243,18 @@ public:
 		// set of Boxes produced by the decomposition optimizer
 		openfpm::vector<::Box<dim, size_t>> loc_box;
 
+		// Ghost
+		Ghost<dim,long int> ghe;
+
+		// Set the ghost
+		for (size_t i = 0 ; i < dim ; i++)
+		{
+			ghe.setLow(i,static_cast<long int>(ghost.getLow(i)/spacing[i]) - 1);
+			ghe.setHigh(i,static_cast<long int>(ghost.getHigh(i)/spacing[i]) + 1);
+		}
+
 		// optimize the decomposition
-		d_o.template optimize<nm_v::sub_id, nm_v::proc_id>(dist.getGraph(), p_id, loc_box, box_nn_processor,bc);
+		d_o.template optimize<nm_v::sub_id, nm_v::proc_id>(dist.getGraph(), p_id, loc_box, box_nn_processor,ghe,bc);
 
 		// reset ss_box
 		ss_box = domain;
@@ -486,19 +496,6 @@ public:
 	 */
 	void calculateGhostBoxes()
 	{
-#ifdef DEBUG
-		// the ghost margins are assumed to be smaller
-		// than one sub-domain
-
-		for (size_t i = 0; i < dim; i++)
-		{
-			if (fabs(ghost.template getLow(i)) >= ss_box.getHigh(i) || ghost.template getHigh(i) >= ss_box.getHigh(i))
-			{
-				std::cerr << "Error " << __FILE__ << ":" << __LINE__  << " : Ghost are bigger than one sub-domain" << "\n";
-			}
-		}
-#endif
-
 		// Intersect all the local sub-domains with the sub-domains of the contiguous processors
 
 		// create the internal structures that store ghost information
@@ -506,15 +503,6 @@ public:
 		ie_ghost<dim, T>::create_box_nn_processor_int(v_cl, ghost, sub_domains, box_nn_processor, *this);
 
 		ie_loc_ghost<dim,T>::create(sub_domains,domain,ghost,bc);
-
-		// get the smallest sub-domain dimension on each direction
-		for (size_t i = 0; i < dim; i++)
-		{
-			if (fabs(ghost.template getLow(i)) >= ss_box.getHigh(i) || ghost.template getHigh(i) >= ss_box.getHigh(i))
-			{
-				std::cerr << "Error " << __FILE__ << ":" << __LINE__  << " : Ghost are bigger than one sub-domain" << "\n";
-			}
-		}
 	}
 
 public:
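
A note on the ghost conversion added to the decomposition step above: the physical ghost margin is divided by the sub-sub-domain spacing and padded by one extra cell on each side, so the optimizer always sees a ghost extension at least as large as the real one. A minimal standalone sketch of that arithmetic, using invented margin and spacing values in place of the Ghost/Box types:

    #include <cstdio>

    int main()
    {
        const int dim = 3;
        // hypothetical physical ghost margins and sub-sub-domain spacing
        double ghost_low[dim]  = {-0.1, -0.1, -0.1};
        double ghost_high[dim] = { 0.1,  0.1,  0.1};
        double spacing[dim]    = {0.0625, 0.0625, 0.0625};

        for (int i = 0; i < dim; i++)
        {
            // truncate to sub-sub-domain units, then pad by one cell
            long ghe_low  = static_cast<long>(ghost_low[i]  / spacing[i]) - 1;
            long ghe_high = static_cast<long>(ghost_high[i] / spacing[i]) + 1;
            printf("direction %d: ghe = [%ld, %ld]\n", i, ghe_low, ghe_high);
        }
        return 0;
    }
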
diff --git a/src/Decomposition/common.hpp b/src/Decomposition/common.hpp
index 61984553..4278a232 100755
--- a/src/Decomposition/common.hpp
+++ b/src/Decomposition/common.hpp
@@ -52,28 +52,13 @@ struct Box_loc_sub
 
 };
 
-/*! It contain a box definition and from witch sub-domain it come from (in the local processor)
+/*! It contains a box definition and the sub-domain it comes from (in the local processor list)
  * and a unique id across adjacent processors (for communication)
  *
  * If the box comes from the intersection of an expanded sub-domain and a sub-domain
  *
  * Assuming we are considering the near processors i (0 to getNNProcessors())
  *
- * ### external ghost box
- *
- * id = id_exp * N_non_exp + id_non_exp
- *
- * id_exp = the id in the vector proc_adj_box.get(i) of the expanded sub-domain (sent local sub-domains)
- *
- * id_non_exp = the id in the vector nn_processor_subdomains[i] of the sub-domain (received sub-domains from near processors)
- *
- * ### internal ghost box
- *
- * id = id_exp * N_non_exp + id_non_exp
- *
- * id_exp = the id in the vector nn_processor_subdomains[i] of the expanded sub-domain
- *
- * id_non_exp = the id in the vector proc_adj_box.get(i) of the sub-domain
  *
  */
 template<unsigned int dim, typename T>
@@ -88,6 +73,9 @@ struct Box_sub
 	//! see ebx_ibx_form in ie_ghost for the meaning
 	size_t id;
 
+	//! see getNearSubdomainsRealId in nn_prcs
+	size_t r_sub;
+
 	//! see ie_ghost follow sector explanation
 	comb<dim> cmb;
 };
diff --git a/src/Decomposition/ie_ghost.hpp b/src/Decomposition/ie_ghost.hpp
index 16eed5be..8b1923b6 100755
--- a/src/Decomposition/ie_ghost.hpp
+++ b/src/Decomposition/ie_ghost.hpp
@@ -378,6 +378,7 @@ protected:
 						Box_sub<dim,T> sb;
 						sb.bx = b_int.box;
 						sb.sub = i;
+						sb.r_sub = r_sub.get(k);
 						sb.cmb = nn_p_box_pos.get(k);
 
 						size_t p_idp = nn_p.ProctoID(p_id);
@@ -615,6 +616,21 @@ public:
 		return proc_int_box.get(id).ebx.get(j).id;
 	}
 
+	/*! \brief Get the send-id of the sub-domain to which the internal ghost box belongs
+	 *
+	 * The internal ghost box is created from the intersection of a local sub-domain
+	 * and an extended sub-domain communicated from another processor. This function
+	 * returns the id of the sub-domain in the receiving list
+	 *
+	 * \param id adjacent processor list id (the id goes from 0 to getNNProcessors())
+	 * \param j box (each near processor can produce more than one internal ghost box)
+	 * \return the sub-domain to which the internal ghost box belongs
+	 *
+	 */
+	inline size_t getProcessorIGhostSSub(size_t id, size_t j) const
+	{
+		return proc_int_box.get(id).ibx.get(j).r_sub;
+	}
 
 	/*! \brief Get the local sub-domain to which the internal ghost box belongs
 	 *
diff --git a/src/Graph/dist_map_graph.hpp b/src/Graph/dist_map_graph.hpp
index b8aeaca7..208ea069 100644
--- a/src/Graph/dist_map_graph.hpp
+++ b/src/Graph/dist_map_graph.hpp
@@ -495,6 +495,8 @@ class DistGraph_CSR
 			}
 		}
 
+		recv_g.fvtxdist = fvtxdist;
+
 		// Swap temporary graph with the main one
 		swap(recv_g);
 
@@ -648,7 +650,7 @@ class DistGraph_CSR
 		}
 
 		// Exchange informations through processors
-		vcl.sendrecvMultipleMessagesNBX(prc.size(), &size.get(0), &prc.get(0), &ptr.get(0), gr_receive, &packs, NONE);
+		vcl.sendrecvMultipleMessagesNBX(prc.size(), (size_t *)size.getPointer(), (size_t *)prc.getPointer(), (void **)ptr.getPointer(), gr_receive, &packs, NONE);
 
 		for (size_t i = 0; i < vcl.getProcessingUnits(); i++)
 		{
@@ -802,7 +804,7 @@ class DistGraph_CSR
 		fillSendRecvStructs<size_t>(on_info, prc, size, ptr);
 
 		// Send on_info
-		vcl.sendrecvMultipleMessagesNBX(prc.size(), &size.get(0), &prc.get(0), &ptr.get(0), on_receive, &on_vs, NONE);
+		vcl.sendrecvMultipleMessagesNBX(prc.size(), (size_t *)size.getPointer(), (size_t *)prc.getPointer(), (void **)ptr.getPointer(), on_receive, &on_vs, NONE);
 
 		// Insert in the on_toup map the received couples
 		for (size_t i = 0; i < vcl.getProcessingUnits(); i++)
@@ -870,7 +872,7 @@ class DistGraph_CSR
 		fillSendRecvStructs<size_t>(vni, prc, size, ptr);
 
 		// Send and receive requests
-		vcl.sendrecvMultipleMessagesNBX(prc.size(), &size.get(0), &prc.get(0), &ptr.get(0), on_receive, &req_rmi, NONE);
+		vcl.sendrecvMultipleMessagesNBX(prc.size(), (size_t *)size.getPointer(), (size_t *)prc.getPointer(), (void **)ptr.getPointer(), on_receive, &req_rmi, NONE);
 
 		// Re-mapping info map
 		openfpm::vector<openfpm::vector<size_t>> rmi(vcl.getProcessingUnits());
@@ -891,7 +893,7 @@ class DistGraph_CSR
 		fillSendRecvStructs<size_t>(resp_rmi, prc, size, ptr);
 
 		// Send responses
-		vcl.sendrecvMultipleMessagesNBX(prc.size(), &size.get(0), &prc.get(0), &ptr.get(0), on_receive, &rmi, NONE);
+		vcl.sendrecvMultipleMessagesNBX(prc.size(), (size_t *)size.getPointer(), (size_t *)prc.getPointer(), (void **)ptr.getPointer(), on_receive, &rmi, NONE);
 
 		// Add received info into re-mapping info map
 		for (size_t i = 0; i < rmi.size(); ++i)
@@ -2197,7 +2199,7 @@ public:
 		fillSendRecvStructs<size_t>(vr_queue, prc, size, ptr);
 
 		// Send/receive requests for info about needed vertices
-		vcl.sendrecvMultipleMessagesNBX(prc.size(), &size.get(0), &prc.get(0), &ptr.get(0), on_receive, &resp, NONE);
+		vcl.sendrecvMultipleMessagesNBX(prc.size(), (size_t *)size.getPointer(), (size_t *)prc.getPointer(), (void **)ptr.getPointer(), on_receive, &resp, NONE);
 
 		// Prepare responses with the containing processors of requested vertices
 		for (size_t i = 0; i < resp.size(); ++i)
@@ -2224,7 +2226,7 @@ public:
 		resp.resize(vcl.getProcessingUnits());
 
 		// Send/receive responses with the containing processors of requested vertices
-		vcl.sendrecvMultipleMessagesNBX(prc.size(), &size.get(0), &prc.get(0), &ptr.get(0), on_receive, &resp, NONE);
+		vcl.sendrecvMultipleMessagesNBX(prc.size(), (size_t *)size.getPointer(), (size_t *)prc.getPointer(), (void **)ptr.getPointer(), on_receive, &resp, NONE);
 
 		// Clear requests array
 		reqs.clear();
@@ -2259,7 +2261,7 @@ public:
 		resp.resize(vcl.getProcessingUnits());
 
 		// Send/receive vertices requests
-		vcl.sendrecvMultipleMessagesNBX(prc.size(), &size.get(0), &prc.get(0), &ptr.get(0), on_receive, &resp, NONE);
+		vcl.sendrecvMultipleMessagesNBX(prc.size(), (size_t *)size.getPointer(), (size_t *)prc.getPointer(), (void **)ptr.getPointer(), on_receive, &resp, NONE);
 
 		for (size_t i = 0; i < resp.size(); ++i)
 		{
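
A plausible reading of the repeated sendrecvMultipleMessagesNBX change above: &size.get(0) takes the address of element 0, which is invalid when a processor has nothing to exchange and the vector is empty (exactly the small-grid/many-processors case mentioned in the changelog), while getPointer(), like std::vector::data(), remains valid for an empty vector. A sketch of the distinction with std::vector; the scenario is illustrative:

    #include <cstdio>
    #include <vector>

    int main()
    {
        std::vector<size_t> size_v;   // empty: this rank sends no messages

        // &size_v[0] (or a bounds-checked get(0)) refers to an element that
        // does not exist: undefined behavior, or an abort in checked builds
        // size_t *bad = &size_v[0];

        size_t *ok = size_v.data();   // well-defined even when empty
        printf("data() on an empty vector: %p\n", (void *)ok);
        return 0;
    }
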
diff --git a/src/Graph/dist_map_graph_unit_test.hpp b/src/Graph/dist_map_graph_unit_test.hpp
index 6a4b606c..184cb4f8 100644
--- a/src/Graph/dist_map_graph_unit_test.hpp
+++ b/src/Graph/dist_map_graph_unit_test.hpp
@@ -415,7 +415,7 @@ BOOST_AUTO_TEST_CASE( dist_map_graph_use_free_add)
 	gd.sync();
 
 	if(vcl.getProcessUnitID() == 0)
-		BOOST_REQUIRE_EQUAL(gd.getVertexId(5), 5ul);
+		BOOST_REQUIRE_EQUAL(gd.getVertexId(4), 5ul);
 
 	gd.deleteGhosts();
 
diff --git a/src/Grid/grid_dist_id.hpp b/src/Grid/grid_dist_id.hpp
index 358a5e59..f904ad9d 100644
--- a/src/Grid/grid_dist_id.hpp
+++ b/src/Grid/grid_dist_id.hpp
@@ -25,7 +25,9 @@ template<unsigned int dim>
 struct Box_fix
 {
 	Box<dim,size_t> bx;
+	comb<dim> cmb;
 	size_t g_id;
+	size_t r_sub;
 };
 
 #define GRID_SUB_UNIT_FACTOR 64
@@ -87,6 +89,8 @@ class grid_dist_id
 	//! It is unique across all the near processor
 	std::unordered_map<size_t,size_t> g_id_to_external_ghost_box;
 
+	//! Recomputed map between the global ghost id and the external ghost box,
+	//! rebuilt from the internal ghost boxes received from the near processors
+	std::unordered_map<size_t,size_t> g_id_to_external_ghost_box_fix;
+
 	//! It map a global ghost id (g_id) to the internal ghost box information
 	//! (is unique for processor), it is not unique across all the near processor
 	openfpm::vector<std::unordered_map<size_t,size_t>> g_id_to_internal_ghost_box;
@@ -131,6 +135,36 @@ class grid_dist_id
 		return g->recv_mem_gg.get(lc_id).getPointer();
 	}
 
+	/*! \brief flip_box converts an internal ghost box into an external ghost box
+	 *
+	 * \param box internal ghost box in global grid coordinates
+	 * \param cmb sector of the box; each non-zero direction is shifted by one full domain size
+	 *
+	 */
+	Box<dim,long int> flip_box(const Box<dim,long int> & box, const comb<dim> & cmb)
+	{
+		Box<dim,long int> flp;
+
+		for (size_t i = 0 ; i < dim; i++)
+		{
+			if (cmb[i] == 0)
+			{
+				flp.setLow(i,box.getLow(i));
+				flp.setHigh(i,box.getHigh(i));
+			}
+			else if (cmb[i] == 1)
+			{
+				flp.setLow(i,box.getLow(i) + ginfo.size(i));
+				flp.setHigh(i,box.getHigh(i) + ginfo.size(i));
+			}
+			else if (cmb[i] == -1)
+			{
+				flp.setLow(i,box.getLow(i) - ginfo.size(i));
+				flp.setHigh(i,box.getHigh(i) - ginfo.size(i));
+			}
+		}
+
+		return flp;
+	}
+
 	/*! \brief Create per-processor internal ghost boxes list in grid units and g_id_to_external_ghost_box
 	 *
 	 */
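
flip_box, added above, turns an internal ghost box into the matching external ghost box by translating it one full domain length along every direction in which the sector combination cmb is non-zero. A self-contained sketch of the same translation on plain arrays; the grid size and box values are illustrative:

    #include <cstdio>

    // cmb[i] in {-1,0,1} is the sector of the box; a non-zero entry means the
    // box must be shifted by one whole domain length along that direction
    void flip_box(const long lo[], const long hi[], const int cmb[],
                  const long N[], long flo[], long fhi[], int dim)
    {
        for (int i = 0; i < dim; i++)
        {
            flo[i] = lo[i] + cmb[i] * N[i];
            fhi[i] = hi[i] + cmb[i] * N[i];
        }
    }

    int main()
    {
        // internal ghost box touching the high border of a 64x64 grid,
        // sector (-1,0): its periodic image lies below the low border in x
        long lo[2] = {62, 10}, hi[2] = {63, 20};
        int  cmb[2] = {-1, 0};
        long N[2] = {64, 64};
        long flo[2], fhi[2];

        flip_box(lo, hi, cmb, N, flo, fhi, 2);
        printf("flipped: x [%ld,%ld], y [%ld,%ld]\n",
               flo[0], fhi[0], flo[1], fhi[1]);   // x [-2,-1], y [10,20]
        return 0;
    }
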
@@ -169,6 +203,7 @@ class grid_dist_id
 				bid_t.g_id = dec.getProcessorIGhostId(i,j);
 				bid_t.sub = dec.getProcessorIGhostSub(i,j);
 				bid_t.cmb = dec.getProcessorIGhostPos(i,j);
+				bid_t.r_sub = dec.getProcessorIGhostSSub(i,j);
 				pib.bid.add(bid_t);
 
 				g_id_to_internal_ghost_box.get(i)[bid_t.g_id] = pib.bid.size()-1;
@@ -233,6 +268,97 @@ class grid_dist_id
 		}
 
 		init_e_g_box = true;
+
+		// Communicate the calculated ig_box to the other processors
+
+		comb<dim> zero;
+		zero.zero();
+
+		// Here we collect all the calculated internal ghost boxes in the sectors different from 0 that this processor has
+
+		openfpm::vector<size_t> prc;
+		openfpm::vector<size_t> prc_recv;
+		openfpm::vector<size_t> sz_recv;
+		openfpm::vector<openfpm::vector<Box_fix<dim>>> box_int_send(dec.getNNProcessors());
+		openfpm::vector<openfpm::vector<Box_fix<dim>>> box_int_recv;
+
+		for(size_t i = 0 ; i < dec.getNNProcessors() ; i++)
+		{
+			for (size_t j = 0 ; j < ig_box.get(i).bid.size() ; j++)
+			{
+				box_int_send.get(i).add();
+				box_int_send.get(i).last().bx = ig_box.get(i).bid.get(j).box;
+				box_int_send.get(i).last().g_id = ig_box.get(i).bid.get(j).g_id;
+				box_int_send.get(i).last().r_sub = ig_box.get(i).bid.get(j).r_sub;
+				box_int_send.get(i).last().cmb = ig_box.get(i).bid.get(j).cmb;
+			}
+			prc.add(dec.IDtoProc(i));
+		}
+
+		v_cl.SSendRecv(box_int_send,box_int_recv,prc,prc_recv,sz_recv);
+
+		eg_box_tmp.resize(dec.getNNProcessors());
+
+		for (size_t i = 0 ; i < eg_box_tmp.size() ; i++)
+			eg_box_tmp.get(i).prc = dec.IDtoProc(i);
+
+		for (size_t i = 0 ; i < box_int_recv.size() ; i++)
+		{
+			size_t p_id = dec.ProctoID(prc_recv.get(i));
+			auto&& pib = eg_box_tmp.get(p_id);
+			pib.prc = prc_recv.get(i);
+
+			// For each received internal ghost box
+			for (size_t j = 0 ; j < box_int_recv.get(i).size() ; j++)
+			{
+				size_t send_list_id = box_int_recv.get(i).get(j).r_sub;
+
+				// Get the list of the sent sub-domains
+				// and recover the id of the sub-domain from
+				// the sent list
+				const openfpm::vector<size_t> & s_sub = dec.getSentSubdomains(p_id);
+				size_t sub_id = s_sub.get(send_list_id);
+
+				e_box_id bid_t;
+				bid_t.sub = sub_id;
+				bid_t.cmb = box_int_recv.get(i).get(j).cmb;
+				bid_t.cmb.sign_flip();
+				::Box<dim,long int> ib = flip_box(box_int_recv.get(i).get(j).bx,box_int_recv.get(i).get(j).cmb);
+				bid_t.g_e_box = ib;
+				bid_t.g_id = box_int_recv.get(i).get(j).g_id;
+				// Translate into local coordinates
+				Box<dim,long int> tb = ib;
+				tb -= gdb_ext.get(sub_id).origin;
+				bid_t.l_e_box = tb;
+
+				pib.bid.add(bid_t);
+
+				g_id_to_external_ghost_box_fix[bid_t.g_id] = pib.bid.size()-1;
+
+				size_t l_id = 0;
+				// convert the global id into local id
+				auto key = g_id_to_external_ghost_box.find(bid_t.g_id);
+				if (key != g_id_to_external_ghost_box.end()) // FOUND
+					l_id = key->second;
+
+				Box<dim,long int> box_le = eg_box.get(p_id).bid.get(l_id).l_e_box;
+				Box<dim,long int> box_ge = eg_box.get(p_id).bid.get(l_id).g_e_box;
+
+				if (box_le != bid_t.l_e_box || box_ge != bid_t.g_e_box ||
+						eg_box.get(p_id).bid.get(l_id).cmb != bid_t.cmb ||
+						eg_box.get(p_id).bid.get(l_id).sub != bid_t.sub)
+				{
+					// the recomputed external ghost box disagrees with the one
+					// calculated locally: anchor point for a debugger
+					int debug = 0;
+					debug++;
+				}
+			}
+		}
+
+		// switch to the recomputed external ghost boxes
+
+		g_id_to_external_ghost_box = g_id_to_external_ghost_box_fix;
+		eg_box.clear();
+		eg_box = eg_box_tmp;
 	}
 
 	bool init_local_i_g_box = false;
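
The block added above makes the external ghost boxes consistent with what the neighbors actually computed: each processor ships out its internal ghost boxes, and the receiver flips every received box into its own frame, recovers the real sub-domain through the sent-subdomains list, and rebuilds the g_id lookup table. A schematic, self-contained sketch of the receiver side in one dimension, with invented g_id values and std containers standing in for the OpenFPM types:

    #include <cstdio>
    #include <vector>
    #include <unordered_map>

    struct BoxFix { long lo, hi; long g_id; int cmb; };

    int main()
    {
        const long N = 64;   // grid points in the (single) direction

        // internal ghost boxes received from one neighbor (illustrative)
        std::vector<BoxFix> box_int_recv = { {62, 63, 10, -1}, {0, 1, 11, 1} };

        std::vector<BoxFix> eg_box;                  // rebuilt external boxes
        std::unordered_map<long, size_t> g_id_to_eg; // g_id -> index in eg_box

        for (const BoxFix & b : box_int_recv)
        {
            BoxFix e = b;
            e.lo += b.cmb * N;   // flip into the external ghost region
            e.hi += b.cmb * N;
            e.cmb = -b.cmb;      // sign-flip the sector, as in the patch

            eg_box.push_back(e);
            g_id_to_eg[e.g_id] = eg_box.size() - 1;
        }

        for (const BoxFix & e : eg_box)
            printf("g_id %ld -> external box [%ld, %ld]\n", e.g_id, e.lo, e.hi);
        return 0;
    }
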
@@ -268,6 +394,7 @@ class grid_dist_id
 				pib.bid.last().box = ib;
 				pib.bid.last().sub = dec.getLocalIGhostSub(i,j);
 				pib.bid.last().k = dec.getLocalIGhostE(i,j);
+				pib.bid.last().cmb = dec.getLocalIGhostPos(i,j);
 			}
 		}
 
@@ -286,7 +413,7 @@ class grid_dist_id
 
 		if (init_local_e_g_box == true)	return;
 
-		// Get the number of near processors
+		// For each sub-domain
 		for (size_t i = 0 ; i < dec.getNSubDomain() ; i++)
 		{
 			loc_eg_box.add();
@@ -307,11 +434,41 @@ class grid_dist_id
 				pib.bid.last().box = ib;
 				pib.bid.last().sub = dec.getLocalEGhostSub(i,j);
 				pib.bid.last().cmb = dec.getLocalEGhostPos(i,j);
-				pib.bid.last().cmb.sign_flip();
 			}
 		}
 
 		init_local_e_g_box = true;
+
+		loc_eg_box_tmp.resize(dec.getNSubDomain());
+
+		// For each sub-domain
+		for (size_t i = 0 ; i < dec.getNSubDomain() ; i++)
+		{
+			for (size_t j = 0 ; j < loc_ig_box.get(i).bid.size() ; j++)
+			{
+				size_t k = loc_ig_box.get(i).bid.get(j).sub;
+				auto & pib = loc_eg_box_tmp.get(k);
+
+				size_t s = loc_ig_box.get(i).bid.get(j).k;
+				pib.bid.resize(dec.getLocalNEGhost(k));
+
+				pib.bid.get(s).box = flip_box(loc_ig_box.get(i).bid.get(j).box,loc_ig_box.get(i).bid.get(j).cmb);
+				pib.bid.get(s).sub = dec.getLocalEGhostSub(k,s);
+				pib.bid.get(s).cmb = loc_ig_box.get(i).bid.get(j).cmb;
+				pib.bid.get(s).cmb.sign_flip();
+
+				if (pib.bid.get(s).box != loc_eg_box.get(k).bid.get(s).box ||
+					pib.bid.get(s).cmb != loc_eg_box.get(k).bid.get(s).cmb ||
+					pib.bid.get(s).sub != loc_eg_box.get(k).bid.get(s).sub)
+				{
+					// the reconstructed local external ghost box disagrees with
+					// the one calculated directly: anchor point for a debugger
+					std::cerr << "Error: local external ghost box mismatch" << std::endl;
+					int debug = 0;
+					debug++;
+				}
+			}
+		}
+
+		loc_eg_box = loc_eg_box_tmp;
 	}
 
 	/*! \brief Sync the local ghost part
@@ -334,7 +491,7 @@ class grid_dist_id
 				// sub domain connected with external box
 				size_t sub_id_dst = loc_ig_box.get(i).bid.get(j).sub;
 
-				// local external ghost box connected
+				// external ghost box connected to this internal ghost box
 				size_t k = loc_ig_box.get(i).bid.get(j).k;
 
 				Box<dim,size_t> bx_dst = loc_eg_box.get(sub_id_dst).bid.get(k).box;
@@ -1114,9 +1271,14 @@ public:
 		//! id
 		size_t g_id;
 
+		//! r_sub id of the sub-domain in the sent list
+		size_t r_sub;
+
 		//! Sector where it live the linked external ghost box
 		comb<dim> cmb;
 
 		//! sub
 		size_t sub;
 	};
@@ -1133,8 +1295,11 @@ public:
 		//! sub-domain id
 		size_t sub;
 
-		//! external box
+		//! external ghost box linked to this internal ghost box
 		size_t k;
+
+		//! combination
+		comb<dim> cmb;
 	};
 
 	/*! \brief It store the information about the external ghost box
@@ -1238,12 +1403,16 @@ public:
 	//! External ghost boxes in grid units
 	openfpm::vector<ep_box_grid> eg_box;
 
+	//! Temporary external ghost boxes, used to rebuild eg_box after the exchange
+	openfpm::vector<ep_box_grid> eg_box_tmp;
+
 	//! Local internal ghost boxes in grid units
 	openfpm::vector<i_lbox_grid> loc_ig_box;
 
 	//! Local external ghost boxes in grid units
 	openfpm::vector<e_lbox_grid> loc_eg_box;
 
+	//! Temporary local external ghost boxes, used to rebuild loc_eg_box
+	openfpm::vector<e_lbox_grid> loc_eg_box_tmp;
+
 	/*! \brief It synchronize the ghost parts
 	 *
 	 * \tparam prp... Properties to synchronize
diff --git a/src/Grid/grid_dist_id_unit_test.hpp b/src/Grid/grid_dist_id_unit_test.hpp
index 728b786b..4c650b91 100644
--- a/src/Grid/grid_dist_id_unit_test.hpp
+++ b/src/Grid/grid_dist_id_unit_test.hpp
@@ -1482,6 +1482,7 @@ void Test3D_periodic(const Box<3,float> & domain, long int k)
 }
 
 #include "grid_dist_id_unit_test_ext_dom.hpp"
+#include "grid_dist_id_unit_test_unb_ghost.hpp"
 
 BOOST_AUTO_TEST_CASE( grid_dist_id_iterator_test_use)
 {
@@ -1614,6 +1615,28 @@ BOOST_AUTO_TEST_CASE( grid_dist_id_periodic )
 	Test3D_periodic(domain3,k);
 }
 
+BOOST_AUTO_TEST_CASE( grid_dist_id_unbound_ghost )
+{
+	// Domain
+	Box<3,float> domain3({0.0,0.0,0.0},{1.0,1.0,1.0});
+
+	long int k = 32*32*32*create_vcluster().getProcessingUnits();
+	k = std::pow(k, 1/3.);
+
+	Test3D_unb_ghost(domain3,k);
+}
+
+BOOST_AUTO_TEST_CASE( grid_dist_id_unbound_ghost_periodic )
+{
+	// Domain
+	Box<3,float> domain3({0.0,0.0,0.0},{1.0,1.0,1.0});
+
+	long int k = 32*32*32*create_vcluster().getProcessingUnits();
+	k = std::pow(k, 1/3.);
+
+	Test3D_unb_ghost_periodic(domain3,k);
+}
+
 BOOST_AUTO_TEST_SUITE_END()
 
 #endif
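
Both new test cases size the grid so that each processor owns on the order of 32^3 points: k is the cube root of 32^3 times the processor count (k = 64 on 8 processors, for example). A quick check of that sizing rule with an illustrative processor count; note the tests assign std::pow(k, 1/3.) straight to a long int, which truncates and can land one below the exact cube root:

    #include <cstdio>
    #include <cmath>

    int main()
    {
        long nproc = 8;   // illustrative processor count

        // lround avoids the truncation of a plain long int assignment,
        // which can yield 63 when pow returns 63.999...
        long k = std::lround(std::pow(32.0 * 32.0 * 32.0 * nproc, 1 / 3.));

        printf("k = %ld, points per processor = %ld\n", k, k * k * k / nproc);
        return 0;
    }
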
diff --git a/src/Grid/grid_dist_id_unit_test_unb_ghost.hpp b/src/Grid/grid_dist_id_unit_test_unb_ghost.hpp
new file mode 100644
index 00000000..3299c95a
--- /dev/null
+++ b/src/Grid/grid_dist_id_unit_test_unb_ghost.hpp
@@ -0,0 +1,255 @@
+/*
+ * grid_dist_id_unit_test_unb_ghost.hpp
+ *
+ *  Created on: Jul 11, 2016
+ *      Author: i-bird
+ */
+
+#ifndef SRC_GRID_GRID_DIST_ID_UNIT_TEST_UNB_GHOST_HPP_
+#define SRC_GRID_GRID_DIST_ID_UNIT_TEST_UNB_GHOST_HPP_
+
+void Test3D_unb_ghost(const Box<3,float> & domain, long int k)
+{
+	long int big_step = k / 30;
+	big_step = (big_step == 0)?1:big_step;
+	long int small_step = 21;
+
+	if (create_vcluster().getProcessingUnits() > 48)
+		return;
+
+	print_test( "Testing 3D grid unbound ghost k<=",k);
+
+	// 3D test
+	for ( ; k >= 2 ; k-= (k > 2*big_step)?big_step:small_step )
+	{
+		BOOST_TEST_CHECKPOINT( "Testing 3D grid k=" << k );
+
+		// grid size
+		size_t sz[3];
+		sz[0] = k;
+		sz[1] = k;
+		sz[2] = k;
+
+		// Ghost
+		Ghost<3,float> g(0.49);
+
+		// Distributed grid with id decomposition
+		grid_dist_id<3, float, scalar<float>, CartDecomposition<3,float>> g_dist(sz,domain,g);
+
+		g_dist.getDecomposition().write("no_bound_decomposition");
+
+		// check the consistency of the decomposition
+		bool val = g_dist.getDecomposition().check_consistency();
+		BOOST_REQUIRE_EQUAL(val,true);
+
+		// Grid sm
+		grid_sm<3,void> info(sz);
+
+		// get the domain iterator
+		size_t count = 0;
+
+		auto dom = g_dist.getDomainIterator();
+
+		while (dom.isNext())
+		{
+			auto key = dom.get();
+			auto key_g = g_dist.getGKey(key);
+
+			g_dist.template get<0>(key) = info.LinId(key_g);
+
+			// Count the points
+			count++;
+
+			++dom;
+		}
+
+		// Get the virtual cluster machine
+		Vcluster & vcl = g_dist.getVC();
+
+		// reduce
+		vcl.sum(count);
+		vcl.execute();
+
+		// Check
+		BOOST_REQUIRE_EQUAL(count,(size_t)k*k*k);
+
+		bool match = true;
+
+		auto dom2 = g_dist.getDomainIterator();
+
+		// check that the grid stores the correct information
+		while (dom2.isNext())
+		{
+			auto key = dom2.get();
+			auto key_g = g_dist.getGKey(key);
+
+			match &= (g_dist.template get<0>(key) == info.LinId(key_g))?true:false;
+
+			++dom2;
+		}
+
+		BOOST_REQUIRE_EQUAL(match,true);
+
+		//! [Synchronize the ghost and check the information]
+
+		g_dist.template ghost_get<0>();
+
+		// check that the communication is correctly completed
+
+		auto domg = g_dist.getDomainGhostIterator();
+
+		// check that the grid with the ghost part stores the correct information
+		while (domg.isNext())
+		{
+			auto key = domg.get();
+			auto key_g = g_dist.getGKey(key);
+
+			// In this case the boundary conditions are non-periodic
+			if (g_dist.isInside(key_g))
+			{
+				match &= (g_dist.template get<0>(key) == info.LinId(key_g))?true:false;
+			}
+
+			++domg;
+		}
+
+		BOOST_REQUIRE_EQUAL(match,true);
+
+		//! [Synchronize the ghost and check the information]
+	}
+}
+
+
+// Test grid periodic
+
+void Test3D_unb_ghost_periodic(const Box<3,float> & domain, long int k)
+{
+	typedef Point_test<float> p;
+
+	Vcluster & v_cl = create_vcluster();
+
+	if ( v_cl.getProcessingUnits() > 48 )
+		return;
+
+	long int big_step = k / 30;
+	big_step = (big_step == 0)?1:big_step;
+	long int small_step = 21;
+
+	print_test( "Testing grid periodic unbound ghost k<=",k);
+
+	// 3D test
+	for ( ; k >= 2 ; k-= (k > 2*big_step)?big_step:small_step )
+	{
+		BOOST_TEST_CHECKPOINT( "Testing grid unbound ghost periodic k<=" << k );
+
+		// grid size
+		size_t sz[3];
+		sz[0] = k;
+		sz[1] = k;
+		sz[2] = k;
+
+		// Ghost
+		Ghost<3,float> g(0.1);
+
+		// periodicity
+		periodicity<3> pr = {{PERIODIC,PERIODIC,PERIODIC}};
+
+		// Distributed grid with id decomposition
+		grid_dist_id<3, float, aggregate<long int>, CartDecomposition<3,float>> g_dist(sz,domain,g,pr);
+
+		// check the consistency of the decomposition
+		bool val = g_dist.getDecomposition().check_consistency();
+		BOOST_REQUIRE_EQUAL(val,true);
+
+		// Grid sm
+		grid_sm<3,void> info(sz);
+
+		size_t count = 0;
+
+		// Set the full grid (domain + ghost) to -1
+
+		auto dom1 = g_dist.getDomainGhostIterator();
+
+		while (dom1.isNext())
+		{
+			auto key = dom1.get();
+
+			g_dist.template get<0>(key) = -1;
+
+			++dom1;
+		}
+
+		auto dom = g_dist.getDomainIterator();
+
+		while (dom.isNext())
+		{
+			auto key = dom.get();
+			auto key_g = g_dist.getGKey(key);
+
+			g_dist.template get<0>(key) = info.LinId(key_g);
+
+			// Count the points
+			count++;
+
+			++dom;
+		}
+
+		// Get the virtual cluster machine
+		Vcluster & vcl = g_dist.getVC();
+
+		// reduce
+		vcl.sum(count);
+		vcl.execute();
+
+		// Check
+		BOOST_REQUIRE_EQUAL(count,(size_t)k*k*k);
+
+		// sync the ghosts
+//		g_dist.write("no_bound_before");
+		g_dist.ghost_get<0>();
+//		g_dist.write("no_bound_after");
+
+		bool match = true;
+
+		// Domain + Ghost iterator
+		auto dom_gi = g_dist.getDomainGhostIterator();
+
+		size_t out_cnt = 0;
+
+		while (dom_gi.isNext())
+		{
+			bool out_p = false;
+
+			auto key = dom_gi.get();
+			auto key_g = g_dist.getGKey(key);
+
+			// transform the key to be periodic
+			for (size_t i = 0 ; i < 3 ; i++)
+			{
+				if (key_g.get(i) < 0)
+				{key_g.set_d(i,key_g.get(i) + k);out_p = true;}
+				else if (key_g.get(i) >= k)
+				{key_g.set_d(i,key_g.get(i) - k);out_p = true;}
+			}
+
+			if (g_dist.template get<0>(key) != -1 && out_p == true)
+				out_cnt++;
+
+			if ( /*g_dist.template get<0>(key) != -1 &&*/ info.LinId(key_g) != g_dist.template get<0>(key) )
+				match &= false;
+
+			++dom_gi;
+		}
+
+		BOOST_REQUIRE_EQUAL(match, true);
+		if (k > 83)
+		{
+			vcl.sum(out_cnt);
+			vcl.execute();
+			BOOST_REQUIRE(out_cnt != 0ul);
+		}
+	}
+}
+
+
+#endif /* SRC_GRID_GRID_DIST_ID_UNIT_TEST_UNB_GHOST_HPP_ */
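
The periodic check in Test3D_unb_ghost_periodic maps every ghost key back into the domain by adding or subtracting k once per direction, which is enough because the ghost layer is thinner than the domain, so a coordinate is never more than one period out of range. A standalone sketch of that wrap with plain integers in place of grid keys:

    #include <cstdio>

    int main()
    {
        const long k = 8;   // illustrative grid size

        // ghost coordinates lie at most one period outside [0, k)
        long coords[4] = {-1, 0, 7, 8};

        for (long c : coords)
        {
            long w = c;
            bool out_p = false;
            if (w < 0)       { w += k; out_p = true; }
            else if (w >= k) { w -= k; out_p = true; }
            printf("%2ld -> %ld (ghost point: %s)\n", c, w, out_p ? "yes" : "no");
        }
        return 0;
    }
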
diff --git a/src/dec_optimizer.hpp b/src/dec_optimizer.hpp
index 308468d7..737377a0 100644
--- a/src/dec_optimizer.hpp
+++ b/src/dec_optimizer.hpp
@@ -2,6 +2,7 @@
 #define DEC_OPTIMIZER_HPP
 
 #include "Grid/iterators/grid_key_dx_iterator_sub.hpp"
+#include "Grid/iterators/grid_skin_iterator.hpp"
 
 /*! \brief this class represent a wavefront of dimension dim
  *
@@ -237,7 +238,7 @@ private:
 	 * \param bc Boundary conditions
 	 *
 	 */
-	template<unsigned int p_sub, unsigned int p_id> void add_to_queue(openfpm::vector<size_t> & domains, openfpm::vector<wavefront<dim>> & v_w, Graph & graph,  std::vector<comb<dim>> & w_comb, long int pr_id, openfpm::vector< openfpm::vector<size_t> > & box_nn_processor, const size_t(& bc)[dim])
+	template<unsigned int p_sub, unsigned int p_id> void add_to_queue(openfpm::vector<size_t> & domains, openfpm::vector<wavefront<dim>> & v_w, Graph & graph,  std::vector<comb<dim>> & w_comb, long int pr_id, const size_t(& bc)[dim])
 	{
 		// create a new queue
 		openfpm::vector<size_t> domains_new;
@@ -284,13 +285,6 @@ private:
 				// Get the processor id of the sub-sub-domain
 				long int pp_id = graph.vertex(gh.LinId(gk)).template get<p_id>();
 
-				if (pp_id != pr_id)
-				{
-					// this box is contiguous to other processors
-
-					box_nn_processor.get(box_nn_processor.size()-1).add(pp_id);
-				}
-
 				// if the sub-sub-domain is not assigned
 				if (pid < 0)
 				{
@@ -310,11 +304,6 @@ private:
 			}
 		}
 
-		// sort and make it unique the numbers in processor list
-	    std::sort(box_nn_processor.get(box_nn_processor.size()-1).begin(), box_nn_processor.get(box_nn_processor.size()-1).end());
-	    auto last = std::unique(box_nn_processor.get(box_nn_processor.size()-1).begin(), box_nn_processor.get(box_nn_processor.size()-1).end());
-	    box_nn_processor.get(box_nn_processor.size()-1).erase(last, box_nn_processor.get(box_nn_processor.size()-1).end());
-
 		// copy the new queue to the old one (it not copied, C++11 move semantic)
 		domains.swap(domains_new);
 	}
@@ -550,6 +539,7 @@ private:
 	 * \param p_id Processor id (if p_id == -1 the optimization is done for all the processors)
 	 * \param list of sub-domain boxes produced by the algorithm
 	 * \param box_nn_processor for each box it list all the neighborhood processor
+	 * \param ghe Ghost extension in sub-sub-domain units in each direction
 	 * \param bc Boundary condition
 	 * \param init_sub_id when true p_sub property is initial set to -1 [default true]
 	 * \param sub_id starting sub_id enumeration [default 0]
@@ -557,11 +547,14 @@ private:
 	 * \return last assigned sub-id
 	 *
 	 */
-	template <unsigned int p_sub, unsigned int p_id> size_t optimize(grid_key_dx<dim> & start_p, Graph & graph, long int pr_id, openfpm::vector<Box<dim,size_t>> & lb, openfpm::vector< openfpm::vector<size_t> > & box_nn_processor, const size_t (& bc)[dim], bool init_sub_id = true, size_t sub_id = 0)
+	template <unsigned int p_sub, unsigned int p_id> size_t optimize(grid_key_dx<dim> & start_p, Graph & graph, long int pr_id, openfpm::vector<Box<dim,size_t>> & lb, openfpm::vector< openfpm::vector<size_t> > & box_nn_processor , const Ghost<dim,long int> & ghe ,const size_t (& bc)[dim], bool init_sub_id = true, size_t sub_id = 0)
 	{
 		// queue
 		openfpm::vector<size_t> v_q;
 
+		// box list 2
+		openfpm::vector< openfpm::vector<size_t> > box_nn_processor2;
+
 		// Create an hyper-cube
 		HyperCube<dim> hyp;
 
@@ -590,9 +583,6 @@ private:
 			// Initialize the wavefronts from the domain start_p
 			InitializeWavefront(start_p,v_w);
 
-			// Create a list for the box
-			box_nn_processor.add();
-
 			// Create the biggest box containing the domain
 			expand_from_point<p_sub,p_id>(v_q.get(0),graph,box,v_w,w_comb);
 
@@ -603,15 +593,57 @@ private:
 			fill_domain<p_sub>(graph,box,sub_id);
 
 			// add the surrounding sub-domain to the queue
-			add_to_queue<p_sub,p_id>(v_q,v_w,graph,w_comb,pr_id,box_nn_processor,bc);
+			add_to_queue<p_sub,p_id>(v_q,v_w,graph,w_comb,pr_id,bc);
 
 			// increment the sub_id
 			sub_id++;
 		}
 
+		// Construct box_nn_processor from the constructed sub-domains
+		construct_box_nn_processor<p_id>(graph,box_nn_processor,lb,ghe,bc,pr_id);
+
 		return sub_id;
 	}
 
+	/*! \brief Construct the sub-domain neighbor-processor list
+	 *
+	 * Each entry is a sub-domain; the list of numbers indicates the neighborhood processors
+	 *
+	 * \param graph graph of the sub-sub-domains with their processor assignments
+	 * \param box_nn_processor output: for each sub-domain, the list of neighbor processors
+	 * \param subs list of sub-domain boxes produced by the optimizer
+	 * \param ghe ghost extension in sub-sub-domain units
+	 * \param bc boundary conditions
+	 * \param pr_id processor id for which the list is constructed
+	 *
+	 */
+	template<unsigned int p_id> void construct_box_nn_processor(Graph & graph, openfpm::vector< openfpm::vector<size_t> > & box_nn_processor, const openfpm::vector<Box<dim,size_t>> & subs, const Ghost<dim,long int> & ghe, const size_t (& bc)[dim], long int pr_id)
+	{
+		std::unordered_map<size_t,size_t> map;
+
+		for (size_t i = 0 ; i < subs.size() ; i++)
+		{
+			map.clear();
+			Box<dim,size_t> sub = subs.get(i);
+			sub.enlarge(ghe);
+			grid_skin_iterator_bc<dim> gsi(gh,subs.get(i),sub,bc);
+
+			while (gsi.isNext())
+			{
+				auto key = gsi.get();
+
+				size_t pp_id = graph.vertex(gh.LinId(key)).template get<p_id>();
+				if (pr_id != (long int)pp_id)
+					map[pp_id] = pp_id;
+
+				++gsi;
+			}
+
+			// Add the collected processor ids to box_nn_processor
+
+			box_nn_processor.add();
+			for ( auto it = map.begin(); it != map.end(); ++it )
+			{
+				box_nn_processor.last().add(it->first);
+			}
+		}
+	}
+
 public:
 
 	/*! \brief Constructor
@@ -641,7 +673,7 @@ public:
 	 * \param graph we are processing
 	 *
 	 */
-	template <unsigned int p_sub, unsigned int p_id> void optimize(grid_key_dx<dim> & start_p, Graph & graph, const size_t (& bc)[dim])
+	template <unsigned int p_sub, unsigned int p_id> void optimize(grid_key_dx<dim> & start_p, Graph & graph, const Ghost<dim,long int> & ghe , const size_t (& bc)[dim])
 	{
 		// temporal vector
 		openfpm::vector<Box<dim,size_t>> tmp;
@@ -650,7 +682,7 @@ public:
 		openfpm::vector< openfpm::vector<size_t> > box_nn_processor;
 
 		// optimize
-		optimize<p_sub,p_id>(start_p,graph,-1,tmp, box_nn_processor,bc);
+		optimize<p_sub,p_id>(start_p,graph,-1,tmp, box_nn_processor,ghe,bc);
 	}
 
 	/*! \brief optimize the graph
@@ -667,7 +699,7 @@ public:
 	 * \param list of sub-domain boxes
 	 *
 	 */
-	template <unsigned int p_sub, unsigned int p_id> void optimize(Graph & graph, long int pr_id, openfpm::vector<Box<dim,size_t>> & lb, openfpm::vector< openfpm::vector<size_t> > & box_nn_processor, const size_t (& bc)[dim])
+	template <unsigned int p_sub, unsigned int p_id> void optimize(Graph & graph, long int pr_id, openfpm::vector<Box<dim,size_t>> & lb, openfpm::vector< openfpm::vector<size_t> > & box_nn_processor, const Ghost<dim,long int> & ghe, const size_t (& bc)[dim])
 	{
 		grid_key_dx<dim> key_seed;
 		key_seed.zero();
@@ -675,7 +707,7 @@ public:
 		// if processor is -1 call optimize with -1 to do on all processors and exit
 		if (pr_id == -1)
 		{
-			optimize<p_sub,p_id>(key_seed,graph,pr_id,lb,box_nn_processor,bc);
+			optimize<p_sub,p_id>(key_seed,graph,pr_id,lb,box_nn_processor,ghe,bc);
 			return;
 		}
 
@@ -689,7 +721,7 @@ public:
 		while (key_seed.isValid())
 		{
 			// optimize
-			sub_id = optimize<p_sub,p_id>(key_seed,graph,pr_id,lb,box_nn_processor,bc,false,sub_id);
+			sub_id = optimize<p_sub,p_id>(key_seed,graph,pr_id,lb,box_nn_processor,ghe,bc,false,sub_id);
 
 			// new seed
 			key_seed = search_seed<p_id,p_sub>(graph,pr_id);
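
construct_box_nn_processor, added above, replaces the old per-wavefront neighbor bookkeeping: for each finished sub-domain it enlarges the box by the ghost extension, walks the skin between the original and the enlarged box, and records the owner of every skin cell in a map so each neighbor processor appears exactly once. A simplified one-dimensional sketch of that dedup-over-the-skin idea; the ownership array, box, and ghost width are invented, and the real code runs a grid_skin_iterator_bc over the decomposition graph:

    #include <cstdio>
    #include <vector>
    #include <unordered_map>

    int main()
    {
        // owner[i] = processor owning sub-sub-domain i (illustrative)
        std::vector<long> owner = {1, 1, 0, 0, 0, 2, 2, 3};

        long pr_id  = 0;               // build the list for processor 0
        long sub_lo = 2, sub_hi = 4;   // its sub-domain: cells [2,4]
        long ghe    = 1;               // ghost extension in cells

        std::unordered_map<long, long> map;   // dedups neighbor ids

        // walk the skin: cells of the enlarged box outside the sub-domain
        for (long i = sub_lo - ghe; i <= sub_hi + ghe; i++)
        {
            if (i >= sub_lo && i <= sub_hi) continue;        // interior
            if (i < 0 || i >= (long)owner.size()) continue;  // non-periodic border
            if (owner[i] != pr_id)
                map[owner[i]] = owner[i];
        }

        printf("neighbor processors of [2,4]:");
        for (auto & kv : map) printf(" %ld", kv.first);      // order unspecified
        printf("\n");
        return 0;
    }
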
diff --git a/src/dec_optimizer_unit_test.hpp b/src/dec_optimizer_unit_test.hpp
index 6babf566..deaa903e 100644
--- a/src/dec_optimizer_unit_test.hpp
+++ b/src/dec_optimizer_unit_test.hpp
@@ -50,8 +50,10 @@ BOOST_AUTO_TEST_CASE( dec_optimizer_test_use_np)
 	// optimize
 	dec_optimizer<3,Graph_CSR<nm_v,nm_e>> d_o(g,sz);
 
+	Ghost<3,size_t> ghe(1);
+
 	grid_key_dx<3> keyZero(0,0,0);
-	d_o.optimize<nm_v::sub_id,nm_v::id>(keyZero,g,bc);
+	d_o.optimize<nm_v::sub_id,nm_v::id>(keyZero,g,ghe,bc);
 }
 
 BOOST_AUTO_TEST_CASE( dec_optimizer_test_use_p)
@@ -111,8 +113,10 @@ BOOST_AUTO_TEST_CASE( dec_optimizer_test_use_p)
 	// For each sub-domain check the neighborhood processors
 	openfpm::vector< openfpm::vector<size_t> > box_nn_processor;
 
+	Ghost<3,size_t> ghe(1);
+
 	// gp,p_id,loc_box,box_nn_processor,ghe,bc
-	d_o.optimize<nm_v::sub_id,nm_v::id>(g,-1,dec_o,box_nn_processor,bc);
+	d_o.optimize<nm_v::sub_id,nm_v::id>(g,-1,dec_o,box_nn_processor,ghe,bc);
 
 	BOOST_REQUIRE_EQUAL(box_nn_processor.size(),8ul);
 
@@ -187,7 +191,8 @@ BOOST_AUTO_TEST_CASE( dec_optimizer_disconnected_subdomains_np)
 	//! for each sub-domain, contain the list of the neighborhood processors
 	openfpm::vector<openfpm::vector<long unsigned int> > box_nn_processor;
 
-	d_o.optimize<nm_v::sub_id, nm_v::proc_id>(g, vcl.getProcessUnitID(), loc_box, box_nn_processor,bc);
+	Ghost<2,size_t> ghe(1);
+	d_o.optimize<nm_v::sub_id, nm_v::proc_id>(g, vcl.getProcessUnitID(), loc_box, box_nn_processor,ghe,bc);
 
 	std::stringstream str_g;
 	str_g << "dec_optimizer_disc_graph" << vcl.getProcessUnitID() << ".vtk";
-- 
GitLab