From ce8eb6ed51e6906b27facdb6b9a4183f6654faea Mon Sep 17 00:00:00 2001
From: Pietro Incardona <incardon@mpi-cbg.de>
Date: Wed, 20 Dec 2017 12:12:46 +0100
Subject: [PATCH] benchmark paper with C++ macro for fast stencil

---
 .../3_gray_scott_3d_vectorization/Makefile    |  2 +-
 .../3_gray_scott_3d_vectorization/main.cpp    | 65 ++++++++++++-------
 .../Iterators/grid_dist_id_iterator_sub.hpp   | 35 ++++++++++
 3 files changed, 76 insertions(+), 26 deletions(-)

diff --git a/example/Grid/3_gray_scott_3d_vectorization/Makefile b/example/Grid/3_gray_scott_3d_vectorization/Makefile
index 68d37df8c..9488bea01 100644
--- a/example/Grid/3_gray_scott_3d_vectorization/Makefile
+++ b/example/Grid/3_gray_scott_3d_vectorization/Makefile
@@ -10,7 +10,7 @@ OBJ = main.o update_new.o
 	mpif90 -ffree-line-length-none -fno-range-check -fno-second-underscore  -fimplicit-none  -mavx -O3 -c -g -o $@ $<
 
 %.o: %.cpp
-	$(CC) -O3 -mavx  -g -c --std=c++11 -Wno-ignored-attributes  -o  $@ $< $(INCLUDE_PATH) -I/scratch/p_ppm/VCDEVEL/include
+	$(CC) -O3 -mavx  -g -c --std=c++11 -Wno-ignored-attributes  -o  $@ $< $(INCLUDE_PATH) -I/home/i-bird/VC/include
 
 gray_scott: $(OBJ)
 	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) -L/home/i-bird/VC/lib  -lVc
diff --git a/example/Grid/3_gray_scott_3d_vectorization/main.cpp b/example/Grid/3_gray_scott_3d_vectorization/main.cpp
index d99e431b0..db8f5ea04 100644
--- a/example/Grid/3_gray_scott_3d_vectorization/main.cpp
+++ b/example/Grid/3_gray_scott_3d_vectorization/main.cpp
@@ -36,7 +36,7 @@
 
 //! \cond [constants] \endcond
 
-#define FORTRAN_UPDATE
+//#define FORTRAN_UPDATE
 
 constexpr int x = 0;
 constexpr int y = 1;
@@ -111,18 +111,17 @@ void step(grid_dist_id<3, double, aggregate<double>> & OldU,
 		  Vc::double_v uFactor, Vc::double_v vFactor, double deltaT, double F, double K)
 {
 #ifndef FORTRAN_UPDATE
-	for (size_t i = 0 ; i < OldU.getN_loc_grid() ; i++)
-	{
-		auto & U_old = OldU.get_loc_grid(i);
-		auto & V_old = OldV.get_loc_grid(i);
 
-		auto & U_new = NewU.get_loc_grid(i);
-		auto & V_new = NewV.get_loc_grid(i);
+	//! \cond [cpp_update] \endcond
 
-		auto it = OldU.get_loc_grid_iterator_stencil(i,star_stencil_3D);
+	WHILE_M(OldU,star_stencil_3D)
+			auto & U_old = GET_GRID_M(OldU);
+			auto & V_old = GET_GRID_M(OldV);
+
+			auto & U_new = GET_GRID_M(NewU);
+			auto & V_new = GET_GRID_M(NewV);
+	ITERATE_3D_M
 
-		while (it.isNext())
-		{
 			// center point
 			auto Cp = it.getStencil<0>();
 
@@ -168,13 +167,14 @@ void step(grid_dist_id<3, double, aggregate<double>> & OldU,
 
 			out1.store(&U_new.get<0>(Cp),Vc::Unaligned);
 			out2.store(&V_new.get<0>(Cp),Vc::Unaligned);
+	END_LOOP_M
+
+	//! \cond [cpp_update] \endcond
 
-			// Next point in the grid
-			it += Vc::double_v::Size;
-		}
-	}
 #else
 
+	//! \cond [fort_update] \endcond
+
 	double uFactor_s = uFactor[0];
 	double vFactor_s = vFactor[0];
 
@@ -209,6 +209,8 @@ void step(grid_dist_id<3, double, aggregate<double>> & OldU,
 				   &deltaT, &uFactor_s, &vFactor_s,&F,&K);
 	}
 
+	//! \cond [fort_update] \endcond
+
 #endif
 }
 
@@ -306,19 +308,30 @@ int main(int argc, char* argv[])
 		 * \page Grid_3_gs_3D_vector Gray Scott in 3D fast implementation with vectorization
 		 *
 		 * Alternate New and Old field to run one step, switch between old and new if the iteration
-		 * is even or odd. The function step is nothing else than the the implementation of Gray-Scott
-		 * 3D in the previous example.
+		 * is even or odd. The function step is nothing else than the implementation of Gray-Scott
+		 * 3D in the previous example but in a more optimized way.
 		 *
 		 * \snippet Grid/3_gray_scott_3d_vectorization/main.cpp alternate
 		 *
+		 * In this function we show two methods to optimize this function.
 		 *
-		 * The function has two main changes. The first is that we iterate
-		 * across local grids rather than a performance improvement is a convenient way
-		 * in case we have a lot of grid in this case we moved from 3 grid Old and New
-		 * to 4 grids. The second improvement is using Vc::double_v instead of double to
-		 * vectorize the code.
+		 * * We can use the macro **WHILE_M** passing the stencil definition, **ITERATE_3D** to define the loop,
+		 *  **END_LOOP** to close the loop, and use the function
+		 * function **getStencil<0>()** to retrieve the stencil points. Additionaly we can use Vc::double_v instead
+		 *  of double to vectorize the code. This method give the advantage to keep all the
+		 * code in C++.
 		 *
-		 * \snippet Grid/3_gray_scott_3d_vectorization/main.cpp vectorization
+		 * \snippet Grid/3_gray_scott_3d_vectorization/main.cpp cpp_update
+		 *
+		 * * Another possibility is to use FORTRAN. Because FORTRAN has better
+		 *  support for multi dimensional array another possibility is to process each local grid using
+		 *  FORTRAN, this also give us the opportunity to show hybrid code. We can switch between
+		 *   one and the other method commenting
+		 *  and uncommeting the line #define FORTRAN_UPDATE in the code.
+		 *
+		 * \snippet Grid/3_gray_scott_3d_vectorization/main.cpp fort_update
+		 *
+		 * \include Grid/3_gray_scott_3d_vectorization/update_new.f90
 		 *
 		 */
 
@@ -352,10 +365,10 @@ int main(int argc, char* argv[])
 
 		//! \cond [save hdf5] \endcond
 
-		// Every 500 time step we output the configuration on hdf5
+		// Every 2000 time step we output the configuration on hdf5
 		if (i % 2000 == 0)
 		{
-//			OldU.save("output_u_" + std::to_string(count));
+			OldU.save("output_u_" + std::to_string(count));
 			OldV.save("output_v_" + std::to_string(count));
 			count++;
 		}
@@ -364,7 +377,9 @@ int main(int argc, char* argv[])
 	}
 	
 	tot_sim.stop();
-	std::cout << "Total simulation: " << tot_sim.getwct() << std::endl;
+
+	if (create_vcluster().rank() == 0)
+	{std::cout << "Total simulation: " << tot_sim.getwct() << std::endl;}
 
 	// We frite the final configuration
 	OldV.write("final");
diff --git a/src/Grid/Iterators/grid_dist_id_iterator_sub.hpp b/src/Grid/Iterators/grid_dist_id_iterator_sub.hpp
index a1730e5dc..c249787f2 100644
--- a/src/Grid/Iterators/grid_dist_id_iterator_sub.hpp
+++ b/src/Grid/Iterators/grid_dist_id_iterator_sub.hpp
@@ -249,6 +249,41 @@ class grid_dist_iterator_sub
 	{
 		return stop;
 	}
+
+	/*! \brief Return the number of local grids
+	 *
+	 *
+	 */
+	inline size_t N_loc_grid()
+	{
+		return gList.size();
+	}
+
+	/*! \brief Return the component j of the starting point (P1) of the domain part
+	 *         for the local grid i
+	 *
+	 * \param i local grid
+	 * \param j dimension
+	 *
+	 *
+	 */
+	inline size_t loc_grid_info_start(size_t i,size_t j)
+	{
+		return gdb_ext.get(i).DBox.getLow(i);
+	}
+
+	/*! \brief Return the component j of the stop point (P2) of the domain part
+	 *         for the local grid i
+	 *
+	 * \param i local grid
+	 * \param j dimension
+	 *
+	 *
+	 */
+	inline size_t loc_grid_info_size(size_t i,size_t j)
+	{
+		return gdb_ext.get(i).GDBox.getHigh(i);
+	}
 };
 
 #endif /* SRC_GRID_GRID_DIST_ID_ITERATOR_SUB_HPP_ */
-- 
GitLab