diff --git a/example/Vector/0_simple/main.cpp b/example/Vector/0_simple/main.cpp
index aea942c9ff33014ec4f9502acc96fd8b62183ac8..cb4059b2b9e83058adb008c589b98370743db236 100644
--- a/example/Vector/0_simple/main.cpp
+++ b/example/Vector/0_simple/main.cpp
@@ -19,6 +19,7 @@
  * \subpage Vector_7_sph_dlb_opt
  * \subpage Vector_7_sph_dlb_gpu
  * \subpage Vector_7_sph_dlb_gpu_opt
+ * \subpage Vector_7_sph_dlb_gpu_more_opt
  * \subpage Vector_8_DEM
  * \subpage Vector_9_gpu_cuda_interop
  *
diff --git a/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu b/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu
index 99f011623aaa8a3cceefb5677aa62a661bdf6e19..2d823d7f4178b32fdf5aa3e3e74896ea73074fb3 100644
--- a/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu
+++ b/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu
@@ -1,15 +1,17 @@
-/*! \page Vector_7_sph_dlb_gpu_opt Vector 7 SPH Dam break simulation with Dynamic load balacing on Multi-GPU (optimized version)
+/*! \page Vector_7_sph_dlb_gpu_more_opt Vector 7 SPH Dam break simulation with Dynamic load balancing on Multi-GPU (more optimized version)
  *
  *
  * [TOC]
  *
  *
- * # SPH with Dynamic load Balancing on GPU (Optimized) # {#SPH_dlb_gpu_opt}
+ * # SPH with Dynamic load Balancing on GPU (More Optimized) # {#SPH_dlb_gpu_more_opt}
  *
  *
  * This example show the classical SPH Dam break simulation with load balancing and dynamic load balancing. The main difference with
- * \ref{SPH_dlb} is that here we use GPU and 1.2 Millions particles. Simulate 1.5 second should be duable on a 1050Ti within a couple
- * of hours.
+ * \ref{SPH_dlb_gpu_opt} is that here we use two kernels to calculate the forces: one for the fluid particles and one for the boundary particles.
+ * We use the function get_indexes_by_type to get the indexes of the fluid and of the boundary particles, and we use these two sets
+ * to launch two distinct kernels (one over the fluid and one over the boundary) to calculate the forces and the density change.
+ * Simulating 1.5 seconds should be doable on a mobile 1050Ti in about 1 hour and 7 minutes.
  *
  * \htmlonly
  * <a href="#" onclick="hide_show('vector-video-3')" >Simulation video 1</a><br>
@@ -27,10 +29,19 @@
  * \endhtmlonly
  *
  *
- * ## GPU ## {#e7_sph_inclusion}
+ * ## get_indexes_by_type ## {#e7_sph_more_opt_gibt}
  *
- * This example is an optimization of the example \ref SPH_dlb_gpu all the optimization operated on this example has been explained
- * here \ref e3_md_gpu_opt so we will not go into the details
+ * This function can be used to get the indexes of the particles of a certain type in a particle set and to save those indexes in an
+ * openfpm::vector<aggregate<unsigned int>>. The constructed set of indexes can then be used to run a kernel on that specific subset of particles.
+ *
+ * \snippet Vector/7_SPH_dlb_gpu_more_opt/main.cu get indexes by type
+ *
+ * The function get_indexes_by_type has four arguments. The first is the vector of the properties of the particles; in
+ * this case, because we use the sorted particles to calculate the forces, we have to get the indexes of the sorted
+ * particles with vd.getPropVectorSort(), while in case we want to use the non-sorted ones we use vd.getPropVector(). The second
+ * argument is the output vector containing the indexes of the particles of the type we want to select. Because the vector can contain
+ * both ghost particles and real particles, with the third argument we indicate that we want only the real particles and no ghost particles.
+ * The last argument is the GPU context handle.
  *
  * we report the full code here
  *
@@ -480,19 +491,22 @@ template<typename CellList> inline void calc_forces(particles & vd, CellList & N
 	// Update the cell-list
 	vd.updateCellList<type,rho,Pressure,velocity>(NN);
 
+	//! \cond [get indexes by type] \endcond
+
 	// get the particles fluid ids
 	get_indexes_by_type<type,type_is_fluid>(vd.getPropVectorSort(),fluid_ids,vd.size_local(),vd.getVC().getmgpuContext());
 
 	// get the particles border ids
 	get_indexes_by_type<type,type_is_border>(vd.getPropVectorSort(),border_ids,vd.size_local(),vd.getVC().getmgpuContext());
 
-
 	auto part = fluid_ids.getGPUIterator(96);
 
 	CUDA_LAUNCH(calc_forces_fluid_gpu,part,vd.toKernel_sorted(),fluid_ids.toKernel(),NN.toKernel(),W_dap,cbar);
 
 	part = border_ids.getGPUIterator(96);
 	CUDA_LAUNCH(calc_forces_border_gpu,part,vd.toKernel_sorted(),border_ids.toKernel(),NN.toKernel(),W_dap,cbar);
 
+	//! \cond [get indexes by type] \endcond
+
 	vd.merge_sort<force,drho,red>(NN);
 
 	max_visc = reduce_local<red,_max_>(vd);
diff --git a/script/discover_package_manager b/script/discover_package_manager
index f8a9ebfd39b58867a4ebfe4a768bcb4365e0ae7c..f0002597ccf14fc91076a1a4ef50604e23d9b814 100755
--- a/script/discover_package_manager
+++ b/script/discover_package_manager
@@ -15,16 +15,6 @@ if [ x"$1" == x"osx" ]; then
         discover_package_ret="sudo brew"
         echo 'Package Manager: '"$discover_package_manager_ret"
         return
-    else
-        if [ ! -w $brew_idir ]; then
-            echo -e "\033[43;30;1;5mWARNING: \033[0m $brew_idir is not writtable, brew require that $brew_idir is writtable and $brew_idir/bin is in your PATH, otherwise it will be not possible to install with brew"
-            commands[0]="sudo chown -R $USER $brew_idir && chmod -R u+w $brew_idir"
-            possible_solutions "${commands[@]}"
-        fi
-    fi
-    if [ ! -w ]; then
-        echo -e "\033[43,33;5mWARNING: \033[0m $brew_idir is not writtable, brew require that $brew_idir is writtable and $brew_idir/bin is in your PATH, otherwise it will be not possible to install with brew"
-        sleep 10
     fi
     echo
     echo 'Package Manager: '"$discover_package_manager_ret"
     return
diff --git a/src/Grid/grid_dist_id_comm.hpp b/src/Grid/grid_dist_id_comm.hpp
index c3a061d3f6f84c62daed9334927bbb18eb73f463..2a11a37972a2507c3418370018e102d28b92811f 100644
--- a/src/Grid/grid_dist_id_comm.hpp
+++ b/src/Grid/grid_dist_id_comm.hpp
@@ -137,7 +137,7 @@ template<unsigned int dim, typename St, typename T, typename Decomposition = Car
 class grid_dist_id_comm
 {
 	//! VCluster
-	Vcluster<> & v_cl;
+	Vcluster<Memory> & v_cl;
 
 	//! Maps the processor id with the communication request into map procedure
 	openfpm::vector<size_t> p_map_req;
@@ -1108,14 +1108,14 @@ public:
 		queue_recv_data_get<prp_object>(eg_box,prp_recv,prRecv_prp);
 
 		for (size_t i = 0 ; i < loc_grid.size() ; i++)
-		{loc_grid.get(i).removeCopyReset();}
+		{loc_grid.get(i).removeAddUnpackReset();}
 
 		ghost_get_local<prp...>(loc_ig_box,loc_eg_box,gdb_ext,loc_grid,g_id_to_external_ghost_box,ginfo,use_bx_def);
 
 		merge_received_data_get<prp ...>(loc_grid,eg_box,prp_recv,prRecv_prp,g_id_to_external_ghost_box,eb_gid_list,opt);
 
 		for (size_t i = 0 ; i < loc_grid.size() ; i++)
-		{loc_grid.get(i).removeCopyFinalize(v_cl.getmgpuContext());}
+		{loc_grid.get(i).template removeAddUnpackFinalize<prp ...>(v_cl.getmgpuContext());}
 	}
 
 	/*! \brief It merge the information in the ghost with the
@@ -1249,7 +1249,7 @@ public:
 	 *
 	 */
 	grid_dist_id_comm()
-	:v_cl(create_vcluster())
+	:v_cl(create_vcluster<Memory>())
 	{
 
 	}
diff --git a/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu b/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
index 3ef536e5b9438b34bfcd161d915d78bde45f5224..eb1ecd75206570df67df8af9c2e87327ebaaf9f9 100644
--- a/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
+++ b/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
@@ -220,9 +220,12 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_ghost_get )
 	gdist.template flush<smax_<0>>(flush_type::FLUSH_ON_DEVICE);
 
 	gdist.template deviceToHost<0>();
-//	gdist.write("broken");
+	gdist.write("before_ghost");
 
-//	gdist.template ghost_get<0>(RUN_ON_DEVICE);
+	gdist.template ghost_get<0>(RUN_ON_DEVICE);
+
+	gdist.template deviceToHost<0>();
+	gdist.write("after_ghost");
 }
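The hunk in calc_forces above shows only the host-side launches; the bodies of calc_forces_fluid_gpu and calc_forces_border_gpu are not part of this diff. As a rough sketch of how a kernel launched over such an index set consumes the indexes produced by get_indexes_by_type: the kernel name process_fluid_gpu is hypothetical, and the sketch assumes the openfpm GPU-vector kernel interface (size() and get<0>()) on the object returned by fluid_ids.toKernel().

// Sketch only (not code from this commit): a CUDA kernel launched over the
// compacted fluid index set built by get_indexes_by_type. The real kernels
// in main.cu are calc_forces_fluid_gpu / calc_forces_border_gpu.
template<typename vector_dist_ker, typename ids_vector_ker>
__global__ void process_fluid_gpu(vector_dist_ker vd, ids_vector_ker fluid_ids)
{
	// one thread per entry of the compacted fluid index set
	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

	if (i >= fluid_ids.size())	{return;}

	// translate the compacted position into the real (sorted) particle index
	unsigned int a = fluid_ids.template get<0>(i);

	// ... per-fluid-particle work on particle a, e.g. force and drho accumulation ...
}

Such a kernel would be launched the same way as the calls visible in the hunk, e.g. auto part = fluid_ids.getGPUIterator(96); CUDA_LAUNCH(process_fluid_gpu,part,vd.toKernel_sorted(),fluid_ids.toKernel());, where 96 is the number of threads per block used to build the GPU iterator.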