From ec49d1d9c9d71a198de9f7a0d2e1fad664ff0cae Mon Sep 17 00:00:00 2001
From: Pietro Incardona <incardon@mpi-cbg.de>
Date: Tue, 28 Nov 2017 23:00:54 +0100
Subject: [PATCH] Added grayscott vectorization

---
 .../3_gray_scott_3d_vectorization/Makefile    |  27 ++
 .../3_gray_scott_3d_vectorization/config.cfg  |   2 +
 .../3_gray_scott_3d_vectorization/main.cpp    | 400 ++++++++++++++++++
 .../update_new.f90                            |  48 +++
 4 files changed, 477 insertions(+)
 create mode 100644 example/Grid/3_gray_scott_3d_vectorization/Makefile
 create mode 100644 example/Grid/3_gray_scott_3d_vectorization/config.cfg
 create mode 100644 example/Grid/3_gray_scott_3d_vectorization/main.cpp
 create mode 100644 example/Grid/3_gray_scott_3d_vectorization/update_new.f90

diff --git a/example/Grid/3_gray_scott_3d_vectorization/Makefile b/example/Grid/3_gray_scott_3d_vectorization/Makefile
new file mode 100644
index 000000000..9488bea01
--- /dev/null
+++ b/example/Grid/3_gray_scott_3d_vectorization/Makefile
@@ -0,0 +1,27 @@
+include ../../example.mk
+
+CC=mpic++
+
+LDIR =
+
+OBJ = main.o update_new.o
+
+%.o: %.f90
+	mpif90 -ffree-line-length-none -fno-range-check -fno-second-underscore  -fimplicit-none  -mavx -O3 -c -g -o $@ $<
+
+%.o: %.cpp
+	$(CC) -O3 -mavx  -g -c --std=c++11 -Wno-ignored-attributes  -o  $@ $< $(INCLUDE_PATH) -I/home/i-bird/VC/include
+
+gray_scott: $(OBJ)
+	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) -L/home/i-bird/VC/lib  -lVc
+
+all: gray_scott
+
+run: all
+	mpirun -np 4 ./gray_scott
+
+.PHONY: clean all run
+
+clean:
+	rm -f *.o *~ core gray_scott
+
diff --git a/example/Grid/3_gray_scott_3d_vectorization/config.cfg b/example/Grid/3_gray_scott_3d_vectorization/config.cfg
new file mode 100644
index 000000000..1eecbac35
--- /dev/null
+++ b/example/Grid/3_gray_scott_3d_vectorization/config.cfg
@@ -0,0 +1,2 @@
+[pack]
+files = main.cpp Makefile
diff --git a/example/Grid/3_gray_scott_3d_vectorization/main.cpp b/example/Grid/3_gray_scott_3d_vectorization/main.cpp
new file mode 100644
index 000000000..6720d6d41
--- /dev/null
+++ b/example/Grid/3_gray_scott_3d_vectorization/main.cpp
@@ -0,0 +1,400 @@
+#include "Grid/grid_dist_id.hpp"
+#include "data_type/aggregate.hpp"
+#include "timer.hpp"
+#include "Vc/Vc"
+
+/*!
+ *
+ * \page Grid_3_gs_3D_vector Gray Scott in 3D fast implementation with vectorization
+ *
+ * # Solving a gray scott-system in 3D # {#e3_gs_gray_scott_vector}
+ *
+ * This example is just an improved version of the previous 3D Gray scott example.
+ * In particular we do the following improvements we separate U and V in two grids
+ * in order to vectorize. Every loop now handle 4 double in case of AVX-256 and 2 double
+ * in case of SSE. We also avoid to use the function copy and we alternate the use of the
+ * fields New and Old. If at the first iteration we read from Old and we write on New in
+ * the second iteration we read from New and we write on Old. The last improvement is write
+ * on hdf5 rather that VTK. VTK writers are convenient but are slow for performances. HDF5
+ * files can be saved with **save()** reload with **load()** and after loading can be written
+ * on VTK with **write** this mean that HDF5 files can be easily converted into VTK in a second moment.
+ * Not only but because HDF5 files can be saved on multiple processors and reloaded on a different
+ * number of processors, you can use this method to stitch VTK files together.
+ *
+ *
+ * In figure is the final solution of the problem
+ *
+ * \htmlonly
+ * <img src="http://ppmcore.mpi-cbg.de/web/images/examples/gray_scott_3d/gs_alpha.png"/>
+ * \endhtmlonly
+ *
+ * \see \ref Grid_2_solve_eq
+ *
+ * \snippet Grid/3_gray_scott_3d_vectorization/main.cpp constants
+ * 
+ */
+
+//! \cond [constants] \endcond
+
+//#define FORTRAN_UPDATE
+
+constexpr int x = 0;
+constexpr int y = 1;
+constexpr int z = 2;
+
+extern "C" void update_new(const int* lo, const int* hi,
+                  double* u, const int* ulo, const int* uhi,
+                  double* v, const int* vlo, const int* vhi,
+                  double* flu, const int* fulo, const int* fuhi,
+                  double* flv, const int* fvlo, const int* fvhi,
+                  const double * dt, const double * uFactor, const double * vFactor, const double * F,
+                  const double * K);
+
+
+//! \cond [constants] \endcond
+
+void init(grid_dist_id<3,double,aggregate<double> > & OldU,
+		  grid_dist_id<3,double,aggregate<double> > & OldV,
+		  grid_dist_id<3,double,aggregate<double> > & NewU,
+		  grid_dist_id<3,double,aggregate<double> > & NewV,
+		  Box<3,double> & domain)
+{
+	auto it = OldU.getDomainIterator();
+
+	while (it.isNext())
+	{
+		// Get the local grid key
+		auto key = it.get();
+
+		// Old values U and V
+		OldU.get(key) = 1.0;
+		OldV.get(key) = 0.0;
+
+		// Old values U and V
+		NewU.get(key) = 0.0;
+		NewV.get(key) = 0.0;
+
+		++it;
+	}
+
+	long int x_start = OldU.size(0)*1.55f/domain.getHigh(0);
+	long int y_start = OldU.size(1)*1.55f/domain.getHigh(1);
+	long int z_start = OldU.size(1)*1.55f/domain.getHigh(2);
+
+	long int x_stop = OldU.size(0)*1.85f/domain.getHigh(0);
+	long int y_stop = OldU.size(1)*1.85f/domain.getHigh(1);
+	long int z_stop = OldU.size(1)*1.85f/domain.getHigh(2);
+
+	grid_key_dx<3> start({x_start,y_start,z_start});
+	grid_key_dx<3> stop ({x_stop,y_stop,z_stop});
+	auto it_init = OldU.getSubDomainIterator(start,stop);
+
+	while (it_init.isNext())
+	{
+		auto key = it_init.get();
+
+        OldU.get(key) = 0.5 + (((double)std::rand())/RAND_MAX -0.5)/10.0;
+        OldV.get(key) = 0.25 + (((double)std::rand())/RAND_MAX -0.5)/20.0;
+
+		++it_init;
+	}
+}
+
+
+//! \cond [vectorization] \endcond
+
+void step(grid_dist_id<3, double, aggregate<double>> & OldU,
+		  grid_dist_id<3, double, aggregate<double>> & OldV,
+		  grid_dist_id<3, double, aggregate<double>> & NewU,
+		  grid_dist_id<3, double, aggregate<double>> & NewV,
+		  grid_key_dx<3> (& star_stencil_3D)[7],
+		  Vc::double_v uFactor, Vc::double_v vFactor, double deltaT, double F, double K)
+{
+#ifndef FORTRAN_UPDATE
+	for (size_t i = 0 ; i < OldU.getN_loc_grid() ; i++)
+	{
+		auto & U_old = OldU.get_loc_grid(i);
+		auto & V_old = OldV.get_loc_grid(i);
+
+		auto & U_new = NewU.get_loc_grid(i);
+		auto & V_new = NewV.get_loc_grid(i);
+
+		auto it = OldU.get_loc_grid_iterator_stencil(i,star_stencil_3D);
+
+		while (it.isNext())
+		{
+			// center point
+			auto Cp = it.getStencil<0>();
+
+			// plus,minus X,Y,Z
+			auto mx = it.getStencil<1>();
+			auto px = it.getStencil<2>();
+			auto my = it.getStencil<3>();
+			auto py = it.getStencil<4>();
+			auto mz = it.getStencil<5>();
+			auto pz = it.getStencil<6>();
+
+			//
+			Vc::double_v u_c(&U_old.get<0>(Cp),Vc::Unaligned);
+			Vc::double_v u_mz(&U_old.get<0>(mz),Vc::Unaligned);
+			Vc::double_v u_pz(&U_old.get<0>(pz),Vc::Unaligned);
+			Vc::double_v u_my(&U_old.get<0>(my),Vc::Unaligned);
+			Vc::double_v u_py(&U_old.get<0>(py),Vc::Unaligned);
+			Vc::double_v u_mx(&U_old.get<0>(mx),Vc::Unaligned);
+			Vc::double_v u_px(&U_old.get<0>(px),Vc::Unaligned);
+
+
+			Vc::double_v v_c(&V_old.get<0>(Cp),Vc::Unaligned);
+			Vc::double_v v_mz(&V_old.get<0>(mz),Vc::Unaligned);
+			Vc::double_v v_pz(&V_old.get<0>(pz),Vc::Unaligned);
+			Vc::double_v v_my(&V_old.get<0>(my),Vc::Unaligned);
+			Vc::double_v v_py(&V_old.get<0>(py),Vc::Unaligned);
+			Vc::double_v v_mx(&V_old.get<0>(mx),Vc::Unaligned);
+			Vc::double_v v_px(&V_old.get<0>(px),Vc::Unaligned);
+
+			Vc::double_v out1 = u_c + uFactor * (u_mz + u_pz +
+					                            u_my + u_py +
+												u_mx + u_px +
+											    - 6.0 * u_c) +
+												- deltaT * u_c * v_c * v_c
+												- deltaT * F * (u_c - 1.0);
+
+			Vc::double_v out2 = v_c + vFactor * (v_mz + v_pz +
+					               v_my + v_py +
+								   v_mx + v_px +
+								   - 6.0 * v_c ) +
+								   deltaT * u_c * v_c * v_c +
+								   - deltaT * (F+K) * v_c;
+
+			out1.store(&U_new.get<0>(Cp),Vc::Unaligned);
+			out2.store(&V_new.get<0>(Cp),Vc::Unaligned);
+
+			// Next point in the grid
+			it += Vc::double_v::Size;
+		}
+	}
+#else
+
+	double uFactor_s = uFactor[0];
+	double vFactor_s = vFactor[0];
+
+	auto & ginfo = OldU.getLocalGridsInfo();
+
+	for (size_t i = 0 ; i < OldU.getN_loc_grid() ; i++)
+	{
+		auto & U_old = OldU.get_loc_grid(i);
+		auto & V_old = OldV.get_loc_grid(i);
+
+		auto & U_new = NewU.get_loc_grid(i);
+		auto & V_new = NewV.get_loc_grid(i);
+
+		int lo[3] = {(int)ginfo.get(i).Dbox.getLow(0),(int)ginfo.get(i).Dbox.getLow(1),(int)ginfo.get(i).Dbox.getLow(2)};
+		int hi[3] = {(int)ginfo.get(i).Dbox.getHigh(0),(int)ginfo.get(i).Dbox.getHigh(1),(int)ginfo.get(i).Dbox.getHigh(2)};
+
+		int ulo[3] = {0,0,0};
+		int uhi[3] = {(int)ginfo.get(i).GDbox.getHigh(0),(int)ginfo.get(i).GDbox.getHigh(1),(int)ginfo.get(i).GDbox.getHigh(2)};
+		int nulo[3] = {0,0,0};
+		int nuhi[3] = {(int)ginfo.get(i).GDbox.getHigh(0),(int)ginfo.get(i).GDbox.getHigh(1),(int)ginfo.get(i).GDbox.getHigh(2)};
+
+		int vlo[3] = {0,0,0};
+		int vhi[3] = {(int)ginfo.get(i).GDbox.getHigh(0),(int)ginfo.get(i).GDbox.getHigh(1),(int)ginfo.get(i).GDbox.getHigh(2)};
+		int nvlo[3] = {0,0,0};
+		int nvhi[3] = {(int)ginfo.get(i).GDbox.getHigh(0),(int)ginfo.get(i).GDbox.getHigh(1),(int)ginfo.get(i).GDbox.getHigh(2)};
+
+		update_new(lo,hi,
+				   (double *)U_old.getPointer(),ulo,uhi,
+				   (double *)V_old.getPointer(),vlo,vhi,
+				   (double *)U_new.getPointer(),nulo,nuhi,
+				   (double *)V_new.getPointer(),nulo,nvhi,
+				   &deltaT, &uFactor_s, &vFactor_s,&F,&K);
+	}
+
+#endif
+}
+
+//! \cond [vectorization] \endcond
+
+int main(int argc, char* argv[])
+{
+	openfpm_init(&argc,&argv);
+
+	// domain
+	Box<3,double> domain({0.0,0.0},{2.5,2.5,2.5});
+	
+	// grid size
+        size_t sz[3] = {128,128,128};
+
+	// Define periodicity of the grid
+	periodicity<3> bc = {PERIODIC,PERIODIC,PERIODIC};
+	
+	// Ghost in grid unit
+	Ghost<3,long int> g(1);
+	
+	// deltaT
+	double deltaT = 1;
+
+	// Diffusion constant for specie U
+	double du = 2*1e-5;
+
+	// Diffusion constant for specie V
+	double dv = 1*1e-5;
+
+	// Number of timesteps
+    size_t timeSteps = 5000;
+
+	// K and F (Physical constant in the equation)
+    double K = 0.053;
+    double F = 0.014;
+
+	//! \cond [init lib] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_vector Gray Scott in 3D fast implementation with vectorization
+	 *
+	 * Here we create 2 distributed grid in 3D Old and New splitting U and V in two different fields.
+	 *  In particular because we want that all the grids are distributed across processors in the same
+	 *   way we pass the decomposition of the first grid.
+	 *
+	 * \snippet Grid/3_gray_scott_3d_vectorization/main.cpp init grid
+	 *
+	 */
+
+	//! \cond [init grid] \endcond
+
+	grid_dist_id<3, double, aggregate<double>> OldU(sz,domain,g,bc);
+	grid_dist_id<3, double, aggregate<double>> OldV(OldU.getDecomposition(),sz,g);
+
+	OldU.getDecomposition().write("Test");
+
+	// New grid with the decomposition of the old grid
+    grid_dist_id<3, double, aggregate<double>> NewU(OldU.getDecomposition(),sz,g);
+    grid_dist_id<3, double, aggregate<double>> NewV(OldV.getDecomposition(),sz,g);
+	
+	// spacing of the grid on x and y
+	double spacing[3] = {OldU.spacing(0),OldU.spacing(1),OldU.spacing(2)};
+
+	init(OldU,OldV,NewU,NewV,domain);
+
+	//! \cond [init grid] \endcond
+
+	// sync the ghost
+	size_t count = 0;
+	OldU.template ghost_get<0>();
+	OldV.template ghost_get<0>();
+
+	// because we assume that spacing[x] == spacing[y] we use formula 2
+	// and we calculate the prefactor of Eq 2
+	Vc::double_v uFactor = deltaT * du/(spacing[x]*spacing[x]);
+	Vc::double_v vFactor = deltaT * dv/(spacing[x]*spacing[x]);
+
+	timer tot_sim;
+	tot_sim.start();
+
+	static grid_key_dx<3> star_stencil_3D[7] = {{0,0,0},
+                                         	    {0,0,-1},
+						    {0,0,1},
+						    {0,-1,0},
+						    {0,1,0},
+						    {-1,0,0},
+						    {1,0,0}};
+
+	for (size_t i = 0; i < timeSteps; ++i)
+	{
+		if (i % 300 == 0)
+			std::cout << "STEP: " << i << std::endl;
+
+		/*!
+		 * \page Grid_3_gs_3D_vector Gray Scott in 3D fast implementation with vectorization
+		 *
+		 * Alternate New and Old field to run one step, switch between old and new if the iteration
+		 * is even or odd. The function step is nothing else than the the implementation of Gray-Scott
+		 * 3D in the previous example.
+		 *
+		 * \snippet Grid/3_gray_scott_3d_vectorization/main.cpp alternate
+		 *
+		 *
+		 * The function has two main changes. The first is that we iterate
+		 * across local grids rather than a performance improvement is a convenient way
+		 * in case we have a lot of grid in this case we moved from 3 grid Old and New
+		 * to 4 grids. The second improvement is using Vc::double_v instead of double to
+		 * vectorize the code.
+		 *
+		 * \snippet Grid/3_gray_scott_3d_vectorization/main.cpp vectorization
+		 *
+		 */
+
+		//! \cond [alternate] \endcond
+
+		if (i % 2 == 0)
+		{
+			step(OldU,OldV,NewU,NewV,star_stencil_3D,uFactor,vFactor,deltaT,F,K);
+
+			NewU.ghost_get<0>();
+			NewV.ghost_get<0>();
+
+		}
+		else
+		{
+			step(NewU,NewV,OldU,OldV,star_stencil_3D,uFactor,vFactor,deltaT,F,K);
+
+			OldU.ghost_get<0>();
+			OldV.ghost_get<0>();
+		}
+
+		//! \cond [alternate] \endcond
+
+		/*!
+		 * \page Grid_3_gs_3D_vector Gray Scott in 3D fast implementation with vectorization
+		 *
+		 * Instead of using the function **write** we use the function **save** to save on HDF5
+		 *
+		 * \snippet Grid/3_gray_scott_3d_vectorization/main.cpp save hdf5
+		 *
+		 */
+
+		//! \cond [save hdf5] \endcond
+
+		// Every 500 time step we output the configuration on hdf5
+		if (i % 500 == 0)
+		{
+			OldU.save("output_u_" + std::to_string(count));
+			OldV.save("output_v_" + std::to_string(count));
+			count++;
+		}
+
+		//! \cond [save hdf5] \endcond
+	}
+	
+	tot_sim.stop();
+	std::cout << "Total simulation: " << tot_sim.getwct() << std::endl;
+
+	//! \cond [time stepping] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_vector Gray Scott in 3D fast implementation with vectorization
+	 *
+	 * ## Finalize ##
+	 *
+	 * Deinitialize the library
+	 *
+	 * \snippet Grid/3_gray_scott_3d_vectorization/main.cpp finalize
+	 *
+	 */
+
+	//! \cond [finalize] \endcond
+
+	openfpm_finalize();
+
+	//! \cond [finalize] \endcond
+
+	/*!
+	 * \page Grid_3_gs_3D_vector Gray Scott in 3D fast implementation with vectorization
+	 *
+	 * # Full code # {#code}
+	 *
+	 * \include Grid/3_gray_scott_3d_vectorization/main.cpp
+	 *
+	 */
+}
+
+
diff --git a/example/Grid/3_gray_scott_3d_vectorization/update_new.f90 b/example/Grid/3_gray_scott_3d_vectorization/update_new.f90
new file mode 100644
index 000000000..5728a019e
--- /dev/null
+++ b/example/Grid/3_gray_scott_3d_vectorization/update_new.f90
@@ -0,0 +1,48 @@
+subroutine update_new ( &
+                              lo, hi, &
+                               u, ulo, uhi, &
+                               v, vlo, vhi, &
+                               u_new, nulo, nuhi, &
+                               v_new, nvlo, nvhi, &
+                               dt, uFactor, vFactor,F,Kf) bind(C, name="update_new")
+
+  implicit none
+
+  integer, intent(in) :: lo(3), hi(3)
+  integer, intent(in) :: ulo(3), uhi(3)
+  integer, intent(in) :: vlo(3), vhi(3)
+  integer, intent(in) :: nulo(3), nuhi(3), nvlo(3), nvhi(3)
+  real*8, intent(in)    :: u  (ulo(1):uhi(1),ulo(2):uhi(2),ulo(3):uhi(3))
+  real*8, intent(in)    :: v  (vlo(1):vhi(1),vlo(2):vhi(2),vlo(3):vhi(3))
+  real*8, intent(inout) :: u_new( nulo(1): nuhi(1), nulo(2): nuhi(2), nulo(3): nuhi(3))
+  real*8, intent(inout) :: v_new( nvlo(1): nvhi(1), nvlo(2): nvhi(2), nvlo(3): nvhi(3))
+  real*8, intent(in) :: dt, F, Kf, uFactor, vFactor
+
+  ! local variables
+  integer i,j,k
+
+  ! x-fluxes
+  do       k = lo(3), hi(3)
+     do    j = lo(2), hi(2)
+        do i = lo(1), hi(1)
+           u_new(i,j,k) = u(i,j,k) + uFactor * ( u(i+1,j,k) + u(i-1,j,k) + &
+                                    u(i,j+1,k) + u(i,j-1,k) + &
+                                    u(i,j,k-1) + u(i,j,k+1) - &
+                                    6.0*u(i,j,k) ) - &
+                                    dt * u(i,j,k)*v(i,j,k)*v(i,j,k) - &
+                                    dt * F * (u(i,j,k) - 1.0)
+
+
+           v_new(i,j,k) = v(i,j,k) + vFactor * ( v(i+1,j,k) + v(i-1,j,k) + &
+                                    v(i,j+1,k) + v(i,j-1,k) + &
+                                    v(i,j,k-1) + v(i,j,k+1) - &
+                                    6.0*v(i,j,k) ) + &
+                                    dt * u(i,j,k)*v(i,j,k)*v(i,j,k) - &
+                                    dt * (F+Kf) * v(i,j,k)
+        end do
+     end do
+  end do
+
+
+end subroutine update_new
+
-- 
GitLab