From ec49d1d9c9d71a198de9f7a0d2e1fad664ff0cae Mon Sep 17 00:00:00 2001 From: Pietro Incardona <incardon@mpi-cbg.de> Date: Tue, 28 Nov 2017 23:00:54 +0100 Subject: [PATCH] Added grayscott vectorization --- .../3_gray_scott_3d_vectorization/Makefile | 27 ++ .../3_gray_scott_3d_vectorization/config.cfg | 2 + .../3_gray_scott_3d_vectorization/main.cpp | 400 ++++++++++++++++++ .../update_new.f90 | 48 +++ 4 files changed, 477 insertions(+) create mode 100644 example/Grid/3_gray_scott_3d_vectorization/Makefile create mode 100644 example/Grid/3_gray_scott_3d_vectorization/config.cfg create mode 100644 example/Grid/3_gray_scott_3d_vectorization/main.cpp create mode 100644 example/Grid/3_gray_scott_3d_vectorization/update_new.f90 diff --git a/example/Grid/3_gray_scott_3d_vectorization/Makefile b/example/Grid/3_gray_scott_3d_vectorization/Makefile new file mode 100644 index 000000000..9488bea01 --- /dev/null +++ b/example/Grid/3_gray_scott_3d_vectorization/Makefile @@ -0,0 +1,27 @@ +include ../../example.mk + +CC=mpic++ + +LDIR = + +OBJ = main.o update_new.o + +%.o: %.f90 + mpif90 -ffree-line-length-none -fno-range-check -fno-second-underscore -fimplicit-none -mavx -O3 -c -g -o $@ $< + +%.o: %.cpp + $(CC) -O3 -mavx -g -c --std=c++11 -Wno-ignored-attributes -o $@ $< $(INCLUDE_PATH) -I/home/i-bird/VC/include + +gray_scott: $(OBJ) + $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) -L/home/i-bird/VC/lib -lVc + +all: gray_scott + +run: all + mpirun -np 4 ./gray_scott + +.PHONY: clean all run + +clean: + rm -f *.o *~ core gray_scott + diff --git a/example/Grid/3_gray_scott_3d_vectorization/config.cfg b/example/Grid/3_gray_scott_3d_vectorization/config.cfg new file mode 100644 index 000000000..1eecbac35 --- /dev/null +++ b/example/Grid/3_gray_scott_3d_vectorization/config.cfg @@ -0,0 +1,2 @@ +[pack] +files = main.cpp Makefile diff --git a/example/Grid/3_gray_scott_3d_vectorization/main.cpp b/example/Grid/3_gray_scott_3d_vectorization/main.cpp new file mode 100644 index 000000000..6720d6d41 --- /dev/null +++ b/example/Grid/3_gray_scott_3d_vectorization/main.cpp @@ -0,0 +1,400 @@ +#include "Grid/grid_dist_id.hpp" +#include "data_type/aggregate.hpp" +#include "timer.hpp" +#include "Vc/Vc" + +/*! + * + * \page Grid_3_gs_3D_vector Gray Scott in 3D fast implementation with vectorization + * + * # Solving a gray scott-system in 3D # {#e3_gs_gray_scott_vector} + * + * This example is just an improved version of the previous 3D Gray scott example. + * In particular we do the following improvements we separate U and V in two grids + * in order to vectorize. Every loop now handle 4 double in case of AVX-256 and 2 double + * in case of SSE. We also avoid to use the function copy and we alternate the use of the + * fields New and Old. If at the first iteration we read from Old and we write on New in + * the second iteration we read from New and we write on Old. The last improvement is write + * on hdf5 rather that VTK. VTK writers are convenient but are slow for performances. HDF5 + * files can be saved with **save()** reload with **load()** and after loading can be written + * on VTK with **write** this mean that HDF5 files can be easily converted into VTK in a second moment. + * Not only but because HDF5 files can be saved on multiple processors and reloaded on a different + * number of processors, you can use this method to stitch VTK files together. + * + * + * In figure is the final solution of the problem + * + * \htmlonly + * <img src="http://ppmcore.mpi-cbg.de/web/images/examples/gray_scott_3d/gs_alpha.png"/> + * \endhtmlonly + * + * \see \ref Grid_2_solve_eq + * + * \snippet Grid/3_gray_scott_3d_vectorization/main.cpp constants + * + */ + +//! \cond [constants] \endcond + +//#define FORTRAN_UPDATE + +constexpr int x = 0; +constexpr int y = 1; +constexpr int z = 2; + +extern "C" void update_new(const int* lo, const int* hi, + double* u, const int* ulo, const int* uhi, + double* v, const int* vlo, const int* vhi, + double* flu, const int* fulo, const int* fuhi, + double* flv, const int* fvlo, const int* fvhi, + const double * dt, const double * uFactor, const double * vFactor, const double * F, + const double * K); + + +//! \cond [constants] \endcond + +void init(grid_dist_id<3,double,aggregate<double> > & OldU, + grid_dist_id<3,double,aggregate<double> > & OldV, + grid_dist_id<3,double,aggregate<double> > & NewU, + grid_dist_id<3,double,aggregate<double> > & NewV, + Box<3,double> & domain) +{ + auto it = OldU.getDomainIterator(); + + while (it.isNext()) + { + // Get the local grid key + auto key = it.get(); + + // Old values U and V + OldU.get(key) = 1.0; + OldV.get(key) = 0.0; + + // Old values U and V + NewU.get(key) = 0.0; + NewV.get(key) = 0.0; + + ++it; + } + + long int x_start = OldU.size(0)*1.55f/domain.getHigh(0); + long int y_start = OldU.size(1)*1.55f/domain.getHigh(1); + long int z_start = OldU.size(1)*1.55f/domain.getHigh(2); + + long int x_stop = OldU.size(0)*1.85f/domain.getHigh(0); + long int y_stop = OldU.size(1)*1.85f/domain.getHigh(1); + long int z_stop = OldU.size(1)*1.85f/domain.getHigh(2); + + grid_key_dx<3> start({x_start,y_start,z_start}); + grid_key_dx<3> stop ({x_stop,y_stop,z_stop}); + auto it_init = OldU.getSubDomainIterator(start,stop); + + while (it_init.isNext()) + { + auto key = it_init.get(); + + OldU.get(key) = 0.5 + (((double)std::rand())/RAND_MAX -0.5)/10.0; + OldV.get(key) = 0.25 + (((double)std::rand())/RAND_MAX -0.5)/20.0; + + ++it_init; + } +} + + +//! \cond [vectorization] \endcond + +void step(grid_dist_id<3, double, aggregate<double>> & OldU, + grid_dist_id<3, double, aggregate<double>> & OldV, + grid_dist_id<3, double, aggregate<double>> & NewU, + grid_dist_id<3, double, aggregate<double>> & NewV, + grid_key_dx<3> (& star_stencil_3D)[7], + Vc::double_v uFactor, Vc::double_v vFactor, double deltaT, double F, double K) +{ +#ifndef FORTRAN_UPDATE + for (size_t i = 0 ; i < OldU.getN_loc_grid() ; i++) + { + auto & U_old = OldU.get_loc_grid(i); + auto & V_old = OldV.get_loc_grid(i); + + auto & U_new = NewU.get_loc_grid(i); + auto & V_new = NewV.get_loc_grid(i); + + auto it = OldU.get_loc_grid_iterator_stencil(i,star_stencil_3D); + + while (it.isNext()) + { + // center point + auto Cp = it.getStencil<0>(); + + // plus,minus X,Y,Z + auto mx = it.getStencil<1>(); + auto px = it.getStencil<2>(); + auto my = it.getStencil<3>(); + auto py = it.getStencil<4>(); + auto mz = it.getStencil<5>(); + auto pz = it.getStencil<6>(); + + // + Vc::double_v u_c(&U_old.get<0>(Cp),Vc::Unaligned); + Vc::double_v u_mz(&U_old.get<0>(mz),Vc::Unaligned); + Vc::double_v u_pz(&U_old.get<0>(pz),Vc::Unaligned); + Vc::double_v u_my(&U_old.get<0>(my),Vc::Unaligned); + Vc::double_v u_py(&U_old.get<0>(py),Vc::Unaligned); + Vc::double_v u_mx(&U_old.get<0>(mx),Vc::Unaligned); + Vc::double_v u_px(&U_old.get<0>(px),Vc::Unaligned); + + + Vc::double_v v_c(&V_old.get<0>(Cp),Vc::Unaligned); + Vc::double_v v_mz(&V_old.get<0>(mz),Vc::Unaligned); + Vc::double_v v_pz(&V_old.get<0>(pz),Vc::Unaligned); + Vc::double_v v_my(&V_old.get<0>(my),Vc::Unaligned); + Vc::double_v v_py(&V_old.get<0>(py),Vc::Unaligned); + Vc::double_v v_mx(&V_old.get<0>(mx),Vc::Unaligned); + Vc::double_v v_px(&V_old.get<0>(px),Vc::Unaligned); + + Vc::double_v out1 = u_c + uFactor * (u_mz + u_pz + + u_my + u_py + + u_mx + u_px + + - 6.0 * u_c) + + - deltaT * u_c * v_c * v_c + - deltaT * F * (u_c - 1.0); + + Vc::double_v out2 = v_c + vFactor * (v_mz + v_pz + + v_my + v_py + + v_mx + v_px + + - 6.0 * v_c ) + + deltaT * u_c * v_c * v_c + + - deltaT * (F+K) * v_c; + + out1.store(&U_new.get<0>(Cp),Vc::Unaligned); + out2.store(&V_new.get<0>(Cp),Vc::Unaligned); + + // Next point in the grid + it += Vc::double_v::Size; + } + } +#else + + double uFactor_s = uFactor[0]; + double vFactor_s = vFactor[0]; + + auto & ginfo = OldU.getLocalGridsInfo(); + + for (size_t i = 0 ; i < OldU.getN_loc_grid() ; i++) + { + auto & U_old = OldU.get_loc_grid(i); + auto & V_old = OldV.get_loc_grid(i); + + auto & U_new = NewU.get_loc_grid(i); + auto & V_new = NewV.get_loc_grid(i); + + int lo[3] = {(int)ginfo.get(i).Dbox.getLow(0),(int)ginfo.get(i).Dbox.getLow(1),(int)ginfo.get(i).Dbox.getLow(2)}; + int hi[3] = {(int)ginfo.get(i).Dbox.getHigh(0),(int)ginfo.get(i).Dbox.getHigh(1),(int)ginfo.get(i).Dbox.getHigh(2)}; + + int ulo[3] = {0,0,0}; + int uhi[3] = {(int)ginfo.get(i).GDbox.getHigh(0),(int)ginfo.get(i).GDbox.getHigh(1),(int)ginfo.get(i).GDbox.getHigh(2)}; + int nulo[3] = {0,0,0}; + int nuhi[3] = {(int)ginfo.get(i).GDbox.getHigh(0),(int)ginfo.get(i).GDbox.getHigh(1),(int)ginfo.get(i).GDbox.getHigh(2)}; + + int vlo[3] = {0,0,0}; + int vhi[3] = {(int)ginfo.get(i).GDbox.getHigh(0),(int)ginfo.get(i).GDbox.getHigh(1),(int)ginfo.get(i).GDbox.getHigh(2)}; + int nvlo[3] = {0,0,0}; + int nvhi[3] = {(int)ginfo.get(i).GDbox.getHigh(0),(int)ginfo.get(i).GDbox.getHigh(1),(int)ginfo.get(i).GDbox.getHigh(2)}; + + update_new(lo,hi, + (double *)U_old.getPointer(),ulo,uhi, + (double *)V_old.getPointer(),vlo,vhi, + (double *)U_new.getPointer(),nulo,nuhi, + (double *)V_new.getPointer(),nulo,nvhi, + &deltaT, &uFactor_s, &vFactor_s,&F,&K); + } + +#endif +} + +//! \cond [vectorization] \endcond + +int main(int argc, char* argv[]) +{ + openfpm_init(&argc,&argv); + + // domain + Box<3,double> domain({0.0,0.0},{2.5,2.5,2.5}); + + // grid size + size_t sz[3] = {128,128,128}; + + // Define periodicity of the grid + periodicity<3> bc = {PERIODIC,PERIODIC,PERIODIC}; + + // Ghost in grid unit + Ghost<3,long int> g(1); + + // deltaT + double deltaT = 1; + + // Diffusion constant for specie U + double du = 2*1e-5; + + // Diffusion constant for specie V + double dv = 1*1e-5; + + // Number of timesteps + size_t timeSteps = 5000; + + // K and F (Physical constant in the equation) + double K = 0.053; + double F = 0.014; + + //! \cond [init lib] \endcond + + /*! + * \page Grid_3_gs_3D_vector Gray Scott in 3D fast implementation with vectorization + * + * Here we create 2 distributed grid in 3D Old and New splitting U and V in two different fields. + * In particular because we want that all the grids are distributed across processors in the same + * way we pass the decomposition of the first grid. + * + * \snippet Grid/3_gray_scott_3d_vectorization/main.cpp init grid + * + */ + + //! \cond [init grid] \endcond + + grid_dist_id<3, double, aggregate<double>> OldU(sz,domain,g,bc); + grid_dist_id<3, double, aggregate<double>> OldV(OldU.getDecomposition(),sz,g); + + OldU.getDecomposition().write("Test"); + + // New grid with the decomposition of the old grid + grid_dist_id<3, double, aggregate<double>> NewU(OldU.getDecomposition(),sz,g); + grid_dist_id<3, double, aggregate<double>> NewV(OldV.getDecomposition(),sz,g); + + // spacing of the grid on x and y + double spacing[3] = {OldU.spacing(0),OldU.spacing(1),OldU.spacing(2)}; + + init(OldU,OldV,NewU,NewV,domain); + + //! \cond [init grid] \endcond + + // sync the ghost + size_t count = 0; + OldU.template ghost_get<0>(); + OldV.template ghost_get<0>(); + + // because we assume that spacing[x] == spacing[y] we use formula 2 + // and we calculate the prefactor of Eq 2 + Vc::double_v uFactor = deltaT * du/(spacing[x]*spacing[x]); + Vc::double_v vFactor = deltaT * dv/(spacing[x]*spacing[x]); + + timer tot_sim; + tot_sim.start(); + + static grid_key_dx<3> star_stencil_3D[7] = {{0,0,0}, + {0,0,-1}, + {0,0,1}, + {0,-1,0}, + {0,1,0}, + {-1,0,0}, + {1,0,0}}; + + for (size_t i = 0; i < timeSteps; ++i) + { + if (i % 300 == 0) + std::cout << "STEP: " << i << std::endl; + + /*! + * \page Grid_3_gs_3D_vector Gray Scott in 3D fast implementation with vectorization + * + * Alternate New and Old field to run one step, switch between old and new if the iteration + * is even or odd. The function step is nothing else than the the implementation of Gray-Scott + * 3D in the previous example. + * + * \snippet Grid/3_gray_scott_3d_vectorization/main.cpp alternate + * + * + * The function has two main changes. The first is that we iterate + * across local grids rather than a performance improvement is a convenient way + * in case we have a lot of grid in this case we moved from 3 grid Old and New + * to 4 grids. The second improvement is using Vc::double_v instead of double to + * vectorize the code. + * + * \snippet Grid/3_gray_scott_3d_vectorization/main.cpp vectorization + * + */ + + //! \cond [alternate] \endcond + + if (i % 2 == 0) + { + step(OldU,OldV,NewU,NewV,star_stencil_3D,uFactor,vFactor,deltaT,F,K); + + NewU.ghost_get<0>(); + NewV.ghost_get<0>(); + + } + else + { + step(NewU,NewV,OldU,OldV,star_stencil_3D,uFactor,vFactor,deltaT,F,K); + + OldU.ghost_get<0>(); + OldV.ghost_get<0>(); + } + + //! \cond [alternate] \endcond + + /*! + * \page Grid_3_gs_3D_vector Gray Scott in 3D fast implementation with vectorization + * + * Instead of using the function **write** we use the function **save** to save on HDF5 + * + * \snippet Grid/3_gray_scott_3d_vectorization/main.cpp save hdf5 + * + */ + + //! \cond [save hdf5] \endcond + + // Every 500 time step we output the configuration on hdf5 + if (i % 500 == 0) + { + OldU.save("output_u_" + std::to_string(count)); + OldV.save("output_v_" + std::to_string(count)); + count++; + } + + //! \cond [save hdf5] \endcond + } + + tot_sim.stop(); + std::cout << "Total simulation: " << tot_sim.getwct() << std::endl; + + //! \cond [time stepping] \endcond + + /*! + * \page Grid_3_gs_3D_vector Gray Scott in 3D fast implementation with vectorization + * + * ## Finalize ## + * + * Deinitialize the library + * + * \snippet Grid/3_gray_scott_3d_vectorization/main.cpp finalize + * + */ + + //! \cond [finalize] \endcond + + openfpm_finalize(); + + //! \cond [finalize] \endcond + + /*! + * \page Grid_3_gs_3D_vector Gray Scott in 3D fast implementation with vectorization + * + * # Full code # {#code} + * + * \include Grid/3_gray_scott_3d_vectorization/main.cpp + * + */ +} + + diff --git a/example/Grid/3_gray_scott_3d_vectorization/update_new.f90 b/example/Grid/3_gray_scott_3d_vectorization/update_new.f90 new file mode 100644 index 000000000..5728a019e --- /dev/null +++ b/example/Grid/3_gray_scott_3d_vectorization/update_new.f90 @@ -0,0 +1,48 @@ +subroutine update_new ( & + lo, hi, & + u, ulo, uhi, & + v, vlo, vhi, & + u_new, nulo, nuhi, & + v_new, nvlo, nvhi, & + dt, uFactor, vFactor,F,Kf) bind(C, name="update_new") + + implicit none + + integer, intent(in) :: lo(3), hi(3) + integer, intent(in) :: ulo(3), uhi(3) + integer, intent(in) :: vlo(3), vhi(3) + integer, intent(in) :: nulo(3), nuhi(3), nvlo(3), nvhi(3) + real*8, intent(in) :: u (ulo(1):uhi(1),ulo(2):uhi(2),ulo(3):uhi(3)) + real*8, intent(in) :: v (vlo(1):vhi(1),vlo(2):vhi(2),vlo(3):vhi(3)) + real*8, intent(inout) :: u_new( nulo(1): nuhi(1), nulo(2): nuhi(2), nulo(3): nuhi(3)) + real*8, intent(inout) :: v_new( nvlo(1): nvhi(1), nvlo(2): nvhi(2), nvlo(3): nvhi(3)) + real*8, intent(in) :: dt, F, Kf, uFactor, vFactor + + ! local variables + integer i,j,k + + ! x-fluxes + do k = lo(3), hi(3) + do j = lo(2), hi(2) + do i = lo(1), hi(1) + u_new(i,j,k) = u(i,j,k) + uFactor * ( u(i+1,j,k) + u(i-1,j,k) + & + u(i,j+1,k) + u(i,j-1,k) + & + u(i,j,k-1) + u(i,j,k+1) - & + 6.0*u(i,j,k) ) - & + dt * u(i,j,k)*v(i,j,k)*v(i,j,k) - & + dt * F * (u(i,j,k) - 1.0) + + + v_new(i,j,k) = v(i,j,k) + vFactor * ( v(i+1,j,k) + v(i-1,j,k) + & + v(i,j+1,k) + v(i,j-1,k) + & + v(i,j,k-1) + v(i,j,k+1) - & + 6.0*v(i,j,k) ) + & + dt * u(i,j,k)*v(i,j,k)*v(i,j,k) - & + dt * (F+Kf) * v(i,j,k) + end do + end do + end do + + +end subroutine update_new + -- GitLab