From d507b33eb55d927f409aa68dcd2dbf9a02fa732b Mon Sep 17 00:00:00 2001 From: Incardona Pietro <incardon@mpi-cbg.de> Date: Tue, 16 Feb 2021 14:14:23 +0100 Subject: [PATCH] Adding weak scaling test --- .../Makefile | 45 ++ .../main.cu | 504 ++++++++++++++++++ src/Grid/grid_dist_id_comm.hpp | 6 +- 3 files changed, 552 insertions(+), 3 deletions(-) create mode 100644 example/SparseGrid/1_gray_scott_3d_sparse_gpu_opt_weak_scal/Makefile create mode 100644 example/SparseGrid/1_gray_scott_3d_sparse_gpu_opt_weak_scal/main.cu diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_gpu_opt_weak_scal/Makefile b/example/SparseGrid/1_gray_scott_3d_sparse_gpu_opt_weak_scal/Makefile new file mode 100644 index 000000000..9085ea424 --- /dev/null +++ b/example/SparseGrid/1_gray_scott_3d_sparse_gpu_opt_weak_scal/Makefile @@ -0,0 +1,45 @@ +include ../../example.mk + +### internally the example disable with the preprocessor its code if not compiled with nvcc +CUDA_CC= +CUDA_CC_LINK= +ifdef CUDA_ON_CPU + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) + INCLUDE_PATH_NVCC= + CUDA_CC_LINK=mpic++ +else + ifeq (, $(shell which nvcc)) + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) + INCLUDE_PATH_NVCC= + CUDA_CC_LINK=mpic++ + else + CUDA_CC=nvcc -ccbin=mpic++ + CUDA_CC_LINK=nvcc -ccbin=mpic++ + endif +endif + +gray_scott_sparse_gpu_test: OPT += -DTEST_RUN +gray_scott_sparse_gpu_test: gray_scott_sparse_gpu + +CC=mpic++ + +LDIR = + +OBJ = main.o + +%.o: %.cu + $(CUDA_CC) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC) + +gray_scott_sparse_gpu: $(OBJ) + $(CUDA_CC_LINK) -o $@ $^ $(LIBS_PATH) $(LIBS) + +all: gray_scott_sparse_gpu + +run: gray_scott_sparse_gpu_test + mpirun -np 4 ./gray_scott_sparse_gpu + +.PHONY: clean all run + +clean: + rm -f *.o *~ core gray_scott_sparse_gpu + diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_gpu_opt_weak_scal/main.cu b/example/SparseGrid/1_gray_scott_3d_sparse_gpu_opt_weak_scal/main.cu new file mode 100644 index 000000000..0e683a8d6 --- /dev/null +++ b/example/SparseGrid/1_gray_scott_3d_sparse_gpu_opt_weak_scal/main.cu @@ -0,0 +1,504 @@ +#include "Decomposition/Distribution/BoxDistribution.hpp" +#include "Grid/grid_dist_id.hpp" +#include "data_type/aggregate.hpp" +#include "timer.hpp" + +/*! + * + * \page Grid_3_gs_3D_sparse_gpu Gray Scott in 3D using sparse grids on GPU + * + * [TOC] + * + * # Solving a gray scott-system in 3D using Sparse grids on gpu # {#e3_gs_gray_scott_gpu} + * + * This example show how to solve a Gray-Scott system in 3D using sparse grids on gpu + * + * In figure is the final solution of the problem + * + * \htmlonly + * <img src="http://ppmcore.mpi-cbg.de/web/images/examples/gray_scott_3d/gs_alpha.png"/> + * \endhtmlonly + * + * More or less this example is the adaptation of the dense example in 3D + * + * \see \ref Grid_3_gs_3D + * + * # Initializetion + * + * On gpu we can add points using the function addPoints this function take 2 lamda functions the first take 3 arguments (in 3D) + * i,j,k these are the global coordinates for a point. We can return either true either false. In case of true the point is + * created in case of false the point is not inserted. The second lamda is instead used to initialize the point inserted. + * The arguments of the second lambda are the data argument we use to initialize the point and the global coordinates i,j,k + * + * After we add the points we have to flush the added points. This us achieved using the function flush the template parameters + * indicate how we have to act on the points. Consider infact we are adding points already exist ... do we have to add it using the max + * or the min. **FLUSH_ON_DEVICE** say instead that the operation is performed using the GPU + * + * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu create points + * + * The function can also called with a specified range + * + * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu create points sub + * + * # Update + * + * to calculate the right-hand-side we use the function **conv2** this function can be used to do a convolution that involve + * two properties + * + * The function accept a lambda function where the first 2 arguments are the output of the same type of the two property choosen. + * + * The arguments 3 and 4 contain the properties of two selected properties. while i,j,k are the coordinates we have to calculate the + * convolution. The call **conv2** also accept template parameters the first two indicate the source porperties, the other two are the destination properties. While the + * last is the extension of the stencil. In this case we use 1. + * + * The lambda function is defined as + * + * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu lambda + * + * and used in the body loop + * + * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu body + * + */ + +#ifdef __NVCC__ + +constexpr int U = 0; +constexpr int V = 1; + +constexpr int U_next = 2; +constexpr int V_next = 3; + +constexpr int x = 0; +constexpr int y = 1; +constexpr int z = 2; + +typedef CartDecomposition<3,float, CudaMemory, memory_traits_inte, BoxDistribution<3,float> > Dec; + +typedef sgrid_dist_id_gpu<3,float,aggregate<float,float,float,float>,CudaMemory, Dec> SparseGridType; + +void init(SparseGridType & grid, Box<3,float> & domain, size_t (& div)[3]) +{ + //! \cond [create points] \endcond + + double spacing_x = grid.spacing(0); + double spacing_y = grid.spacing(1); + double spacing_z = grid.spacing(2); + + typedef typename GetAddBlockType<SparseGridType>::type InsertBlockT; + + // Get the processor domain in continuos + + for (int i = 0 ; i < div[0] ; i++) + { + for (int j = 0 ; j < div[1] ; j++) + { + for (int k = 0 ; k < div[2] ; k++) + { + Point<3,double> p({0.5+i*1.0,0.5+j*1.0,0.5+k*1.0}); + Sphere<3,double> sph(p,0.3); + + Box<3,size_t> bx; + + for (int s = 0 ; s < 3 ; s++) + { + bx.setLow(s,(size_t)((sph.center(i) - 0.31)/grid.spacing(i))); + bx.setHigh(s,(size_t)((sph.center(i) + 0.31)/grid.spacing(i))); + } + + grid.addPoints([spacing_x,spacing_y,spacing_z,sph] __device__ (int i, int j, int k) + { + Point<3,double> pc({i*spacing_x,j*spacing_y,k*spacing_z}); + + // Check if the point is in the domain + if (sph.isInside(pc) ) + {return true;} + return false; + }, + [] __device__ (InsertBlockT & data, int i, int j, int k) + { + data.template get<U>() = 1.0; + data.template get<V>() = 0.0; + } + ); + + grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE); + grid.removeUnusedBuffers(); + } + } + } + + for (int i = 0 ; i < div[0] ; i++) + { + for (int j = 0 ; j < div[1] ; j++) + { + Point<3,double> u({0.0,0.0,1.0}); + Point<3,double> c({0.5+i,0.5+j,0.0}); + + Box<3,size_t> bx; + + bx.setLow(0,(0.4+i)/spacing_x); + bx.setHigh(0,(0.6+i)/spacing_x); + + bx.setLow(1,(0.4+j)/spacing_y); + bx.setHigh(1,(0.6+j)/spacing_y); + + bx.setLow(2,0); + bx.setHigh(2,(size_t)grid.size(2)); + + grid.addPoints(bx.getKP1(),bx.getKP2(),[spacing_x,spacing_y,spacing_z,u,c] __device__ (int i, int j, int k) + { + Point<3,double> pc({i*spacing_x,j*spacing_y,k*spacing_z}); + Point<3,double> pcs({i*spacing_x,j*spacing_y,k*spacing_z}); + Point<3,double> vp; + + // shift + pc -= c; + + // calculate the distance from the diagonal + vp.get(0) = pc.get(1)*u.get(2) - pc.get(2)*u.get(1); + vp.get(1) = pc.get(2)*u.get(0) - pc.get(0)*u.get(2); + vp.get(2) = pc.get(0)*u.get(1) - pc.get(1)*u.get(0); + + double distance = vp.norm(); + + // Check if the point is in the domain + if (distance < 0.1 ) + {return true;} + + return false; + }, + [] __device__ (InsertBlockT & data, int i, int j, int k) + { + data.template get<U>() = 1.0; + data.template get<V>() = 0.0; + } + ); + + grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE); + grid.removeUnusedBuffers(); + } + } + + for (int i = 0 ; i < div[0] ; i++) + { + for (int k = 0 ; k < div[2] ; k++) + { + Point<3,double> u({0.0,1.0,0.0}); + Point<3,double> c({0.5+i,0.0,0.5+k}); + + Box<3,size_t> bx; + + bx.setLow(0,(0.4+i)/spacing_x); + bx.setHigh(0,(0.6+i)/spacing_x); + + bx.setLow(2,(0.4+k)/spacing_z); + bx.setHigh(2,(0.6+k)/spacing_z); + + bx.setLow(1,0); + bx.setHigh(1,(size_t)grid.size(1)); + + grid.addPoints(bx.getKP1(),bx.getKP2(),[spacing_x,spacing_y,spacing_z,u,c] __device__ (int i, int j, int k) + { + Point<3,double> pc({i*spacing_x,j*spacing_y,k*spacing_z}); + Point<3,double> pcs({i*spacing_x,j*spacing_y,k*spacing_z}); + Point<3,double> vp; + + // shift + pc -= c; + + // calculate the distance from the diagonal + vp.get(0) = pc.get(1)*u.get(2) - pc.get(2)*u.get(1); + vp.get(1) = pc.get(2)*u.get(0) - pc.get(0)*u.get(2); + vp.get(2) = pc.get(0)*u.get(1) - pc.get(1)*u.get(0); + + double distance = vp.norm(); + + // Check if the point is in the domain + if (distance < 0.1 ) + {return true;} + + return false; + }, + [] __device__ (InsertBlockT & data, int i, int j, int k) + { + data.template get<U>() = 1.0; + data.template get<V>() = 0.0; + } + ); + + grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE); + grid.removeUnusedBuffers(); + } + } + + for (int j = 0 ; j < div[1] ; j++) + { + for (int k = 0 ; k < div[2] ; k++) + { + Point<3,double> u({1.0,0.0,0.0}); + Point<3,double> c({0.0,0.5+j,0.5+k}); + + Box<3,size_t> bx; + + bx.setLow(1,(0.4+j)/spacing_y); + bx.setHigh(1,(0.6+j)/spacing_y); + + bx.setLow(2,(0.4+k)/spacing_z); + bx.setHigh(2,(0.6+k)/spacing_z); + + bx.setLow(0,0); + bx.setHigh(0,(size_t)grid.size(0)); + + grid.addPoints(bx.getKP1(),bx.getKP2(),[spacing_x,spacing_y,spacing_z,u,c] __device__ (int i, int j, int k) + { + Point<3,double> pc({i*spacing_x,j*spacing_y,k*spacing_z}); + Point<3,double> pcs({i*spacing_x,j*spacing_y,k*spacing_z}); + Point<3,double> vp; + + // shift + pc -= c; + + // calculate the distance from the diagonal + vp.get(0) = pc.get(1)*u.get(2) - pc.get(2)*u.get(1); + vp.get(1) = pc.get(2)*u.get(0) - pc.get(0)*u.get(2); + vp.get(2) = pc.get(0)*u.get(1) - pc.get(1)*u.get(0); + + double distance = vp.norm(); + + // Check if the point is in the domain + if (distance < 0.1 ) + {return true;} + + return false; + }, + [] __device__ (InsertBlockT & data, int i, int j, int k) + { + data.template get<U>() = 1.0; + data.template get<V>() = 0.0; + } + ); + + grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE); + grid.removeUnusedBuffers(); + } + } + + //! \cond [create points] \endcond + + long int x_start = grid.size(0)*0.4f/domain.getHigh(0); + long int y_start = grid.size(1)*0.4f/domain.getHigh(1); + long int z_start = grid.size(1)*0.4f/domain.getHigh(2); + + long int x_stop = grid.size(0)*0.6f/domain.getHigh(0); + long int y_stop = grid.size(1)*0.6f/domain.getHigh(1); + long int z_stop = grid.size(1)*0.6f/domain.getHigh(2); + + //! \cond [create points sub] \endcond + + grid_key_dx<3> start({x_start,y_start,z_start}); + grid_key_dx<3> stop ({x_stop,y_stop,z_stop}); + + grid.addPoints(start,stop,[] __device__ (int i, int j, int k) + { + return true; + }, + [] __device__ (InsertBlockT & data, int i, int j, int k) + { + data.template get<U>() = 0.5; + data.template get<V>() = 0.24; + } + ); + + grid.template flush<smin_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE); + + //! \cond [create points sub] \endcond +} + + +int main(int argc, char* argv[]) +{ + openfpm_init(&argc,&argv); + + // First we check which type of decomposition BoxDistritubion prodice + auto & v_cl = create_vcluster(); + + openfpm::vector<int> facts; + getPrimeFactors(v_cl.size(),facts); + + size_t div[3]; + + for (int i = 0 ; i < 3 ; i++) + {div[i] = 1;} + + for (int i = 0 ; i < facts.size() ; i++) + {div[i % 3] *= facts.get(i);} + + grid_sm<3,void> gdist(div); + + // domain + Box<3,float> domain({0.0,0.0,0.0},{div[0]*1.0f,div[1]*1.0f,div[2]*1.0f}); + + // grid size + size_t sz[3] = {64*div[0],64*div[1],64*div[2]}; + + // Define periodicity of the grid + periodicity<3> bc = {PERIODIC,PERIODIC,PERIODIC}; + + // Ghost in grid unit + Ghost<3,long int> g(1); + + // deltaT + float deltaT = 0.25; + + // Diffusion constant for specie U + float du = 2*1e-5; + + // Diffusion constant for specie V + float dv = 1*1e-5; + + // Number of timesteps +#ifdef TEST_RUN + size_t timeSteps = 300; +#else + size_t timeSteps = 15000; +#endif + + // K and F (Physical constant in the equation) + float K = 0.053; + float F = 0.014; + + SparseGridType grid(sz,domain,g,bc,0,gdist); + + // spacing of the grid on x and y + float spacing[3] = {grid.spacing(0),grid.spacing(1),grid.spacing(2)}; + + init(grid,domain,div); + + grid.deviceToHost<U,V,U_next,V_next>(); + grid.write("final"); + + openfpm_finalize(); + return 0; + + // sync the ghost + grid.template ghost_get<U,V>(RUN_ON_DEVICE); + + // because we assume that spacing[x] == spacing[y] we use formula 2 + // and we calculate the prefactor of Eq 2 + float uFactor = deltaT * du/(spacing[x]*spacing[x]); + float vFactor = deltaT * dv/(spacing[x]*spacing[x]); + + timer tot_sim; + tot_sim.start(); + + for (size_t i = 0; i < timeSteps ; ++i) + { + if (v_cl.rank() == 0) + {std::cout << "STEP: " << i << std::endl;} +/* if (i % 300 == 0) + { + std::cout << "STEP: " << i << std::endl; + grid.write_frame("out",i,VTK_WRITER); + }*/ + + //! \cond [stencil get and use] \endcond + + typedef typename GetCpBlockType<decltype(grid),0,1>::type CpBlockType; + + //! \cond [lambda] \endcond + + auto func = [uFactor,vFactor,deltaT,F,K] __device__ (float & u_out, float & v_out, + CpBlockType & u, CpBlockType & v, + int i, int j, int k){ + + float uc = u(i,j,k); + float vc = v(i,j,k); + + u_out = uc + uFactor *(u(i-1,j,k) + u(i+1,j,k) + + u(i,j-1,k) + u(i,j+1,k) + + u(i,j,k-1) + u(i,j,k+1) - 6.0f*uc) - deltaT * uc*vc*vc + - deltaT * F * (uc - 1.0f); + + + v_out = vc + vFactor *(v(i-1,j,k) + v(i+1,j,k) + + v(i,j+1,k) + v(i,j-1,k) + + v(i,j,k-1) + v(i,j,k+1) - 6.0f*vc) + deltaT * uc*vc*vc + - deltaT * (F+K) * vc; + }; + + //! \cond [lambda] \endcond + + //! \cond [body] \endcond + + if (i % 2 == 0) + { + grid.conv2<U,V,U_next,V_next,1>({0,0,0},{(long int)sz[0]-1,(long int)sz[1]-1,(long int)sz[2]-1},func); + + // After copy we synchronize again the ghost part U and V + + grid.ghost_get<U_next,V_next>(RUN_ON_DEVICE | SKIP_LABELLING); + } + else + { + grid.conv2<U_next,V_next,U,V,1>({0,0,0},{(long int)sz[0]-1,(long int)sz[1]-1,(long int)sz[2]-1},func); + + // After copy we synchronize again the ghost part U and V + grid.ghost_get<U,V>(RUN_ON_DEVICE | SKIP_LABELLING); + } + + //! \cond [body] \endcond + + // Every 500 time step we output the configuration for + // visualization +// if (i % 500 == 0) +// { +// grid.save("output_" + std::to_string(count)); +// count++; +// } + } + + tot_sim.stop(); + std::cout << "Total simulation: " << tot_sim.getwct() << std::endl; + + grid.deviceToHost<U,V,U_next,V_next>(); + grid.write("final"); + + //! \cond [time stepping] \endcond + + /*! + * \page Grid_3_gs_3D_sparse Gray Scott in 3D + * + * ## Finalize ## + * + * Deinitialize the library + * + * \snippet Grid/3_gray_scott/main.cpp finalize + * + */ + + //! \cond [finalize] \endcond + + openfpm_finalize(); + + //! \cond [finalize] \endcond + + /*! + * \page Grid_3_gs_3D_sparse Gray Scott in 3D + * + * # Full code # {#code} + * + * \include Grid/3_gray_scott_3d/main.cpp + * + */ +} + +#else + +int main(int argc, char* argv[]) +{ + return 0; +} + +#endif + diff --git a/src/Grid/grid_dist_id_comm.hpp b/src/Grid/grid_dist_id_comm.hpp index c30f2b97a..061492be2 100644 --- a/src/Grid/grid_dist_id_comm.hpp +++ b/src/Grid/grid_dist_id_comm.hpp @@ -444,9 +444,9 @@ class grid_dist_id_comm if (send_prc_queue.size() == 0) { - v_cl.sendrecvMultipleMessagesNBX(send_prc_queue.size(),NULL, - NULL,NULL, - receive_dynamic,this); + v_cl.sendrecvMultipleMessagesNBX(send_prc_queue.size(),NULL, + NULL,NULL, + receive_dynamic,this); } else { -- GitLab