diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7aea218cc196fc229b2a562824dc4ea6667ec3a0..e3bc64a9708191b1d38fd9c2e563d294a5942c98 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -74,8 +74,6 @@ ubuntu_run: - ./run.sh $CI_PROJECT_DIR unused 1 pdata 0 $CI_COMMIT_REF_NAME - ./run.sh $CI_PROJECT_DIR unused 2 pdata 0 $CI_COMMIT_REF_NAME - ./run.sh $CI_PROJECT_DIR unused 3 pdata 0 $CI_COMMIT_REF_NAME - - ./run.sh $CI_PROJECT_DIR unused 4 pdata 0 $CI_COMMIT_REF_NAME - - ./run.sh $CI_PROJECT_DIR unused 5 pdata 0 $CI_COMMIT_REF_NAME - cd openfpm_numerics - ./run.sh $CI_PROJECT_DIR unused 1 0 0 numerics - ./run.sh $CI_PROJECT_DIR unused 2 0 0 numerics diff --git a/.gitmodules b/.gitmodules index 6b30b305dd992cbc0eae8c5c1da296699017a1a4..aa6b50bf8fad1606872c2249313fc6dc2b9de64a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -13,3 +13,6 @@ [submodule "openfpm_numerics"] path = openfpm_numerics url = ssh://git@git.mpi-cbg.de/openfpm/openfpm_numerics.git +[submodule "gdbgui"] + path = gdbgui + url = https://github.com/incardon/gdbgui.git diff --git a/CHANGELOG.md b/CHANGELOG.md index afe6649ba91f03cf05853a9882aa24973044aeea..d5ed2e9d81077b8c78277cc4964c9ba659354d93 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,38 @@ # Change Log All notable changes to this project will be documented in this file. -## [3.1.0] 2020 (Codename New Horizons) +## [3.3.0] April 2021 (Codename Vega) + +- Adding support for HIP and AMD GPU. (Only particles) 1_gpu_first/7_sph_dlb_gpu/7_sph_dlb_gpu_opt are compatible with HIP +Additional Notes: +- WARNING: AMD GPUs are tested manually and not in CI. This mean that out this release stuff can break at least + until I do not convince my working place to buy one for me ... and is gonna be hard because or rule here and there ... 
or who + is reading this message does not want to buy one for me :-) +- SparseGridGPU is unsupported until AMD fixes the bug reported here: + https://github.com/ROCm-Developer-Tools/HIP/issues/2260 + +### Changes + +- None + +### Fixed + +- Uninitialized variables in the SPH example on GPU, and other fixes necessary for AMD GPUs + +## [3.2.0] January 2021 (Codename Hopper) + +- Adding CUDA_ON_CPU option to run CUDA code on CPU +- Adding gdb-gui debugger + +### Fixed + +- Minor bugs + +### Changes + +- A compiler implementing the C++14 standard is now required to compile OpenFPM + +## [3.1.0] October 2020 (Codename New Horizons) - Adding GPU support for ghost_put - Adding support for CUDA 11 @@ -14,7 +45,7 @@ All notable changes to this project will be documented in this file. - None -## [3.0.0] 2020 (Codename Sparsity) +## [3.0.0] July 2020 (Codename Sparsity) - Upgrading all the dependencies: BOOST,PETSC,SUITESPARSE,OPENBLAS - Adding CPU and GPU sparse grids. Look at the examples SparseGrid in the forlder examples diff --git a/CMakeLists.txt b/CMakeLists.txt index 3b1e858bc0b1f888af6c5602cb05a35ce76d0977..3ee76a524b1a1826eb1655d02f2df73ceca763e2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,6 +11,8 @@ if (POLICY CMP0074) cmake_policy(SET CMP0074 NEW) endif() +set(openfpm_VERSION 3.3.0) + list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake_modules/) set(BOOST_INCLUDE ${Boost_INCLUDE_DIR} CACHE PATH "Include directory for BOOST") @@ -35,7 +37,23 @@ set(METIS_DIR ${METIS_ROOT}) set(PARMETIS_DIR ${PARMETIS_ROOT}) set(OPENBLAS_ROOT CACHE PATH "Root path for blas library") set(SuiteSparse_ROOT ${SUITESPARSE_ROOT}) - +set(CUDA_ON_CPU CACHE BOOL "Make Cuda work on heap") +set(CPACK_RUN_INSTALL_DEPENDENCIES CACHE BOOL "Set to true if we are creating deb or RPM packages") +set(ENABLE_GARBAGE_INJECTOR CACHE BOOL "Enable the injector of garbage in the memory allocator") +set(ENABLE_VCLUSTER_GARBAGE_INJECTOR CACHE BOOL "Enable the injector of 
garbage in the vcluster memory buffers") +set(HIP_ENABLE CACHE BOOL "Enable HIP compiler") +set(AMD_ARCH_COMPILE "gfx900" CACHE STRING "AMD gpu architecture used to compile kernels") + +# Enabling real GPU is stronger than using CUDA_ON_CPU +if (ENABLE_GPU) + set(CUDA_ON_CPU OFF) + # Test CLang + if (NOT HIP_ENABLE) + find_package(CUDA) + set(CMAKE_CUDA_COMPILER_LIBRARY_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) + enable_language(CUDA) + endif() +endif() set (CMAKE_CXX_STANDARD 14) set (CMAKE_CUDA_STANDARD 14) @@ -51,50 +69,45 @@ endif() set(ENV{PATH} "$ENV{PATH}:${HDF5_ROOT}/bin") set(HDF5_PREFER_PARALLEL TRUE) -if(ENABLE_GPU) - enable_language(CUDA) - find_package(CUDA) - - if (CUDA_VERSION_MAJOR EQUAL 9 AND CUDA_VERSION_MINOR EQUAL 2) - message("CUDA is compatible 9.2") - set(WARNING_SUPPRESSION_AND_OPTION_NVCC -Xcudafe "--display_error_number --diag_suppress=611 --diag_suppress=2885 --diag_suppress=2886 --diag_suppress=2887 --diag_suppress=2888 --diag_suppress=186 --diag_suppress=111" --expt-extended-lambda) - FILE(WRITE cuda_options " -Xcudafe \"--display_error_number --diag_suppress=611 --diag_suppress=2885 --diag_suppress=2886 --diag_suppress=2887 --diag_suppress=2888 --diag_suppress=186 --diag_suppress=111\" --expt-extended-lambda ") - elseif ( CUDA_VERSION_MAJOR EQUAL 10 AND CUDA_VERSION_MINOR EQUAL 1 ) - message("CUDA is compatible 10.1") - set(WARNING_SUPPRESSION_AND_OPTION_NVCC -Xcudafe "--display_error_number --diag_suppress=2915 --diag_suppress=2912 --diag_suppress=2913 --diag_suppress=111 --diag_suppress=186 --diag_suppress=611 " --expt-extended-lambda ) - FILE(WRITE cuda_options "-Xcudafe \"--display_error_number --diag_suppress=2915 --diag_suppress=2914 --diag_suppress=2912 --diag_suppress=2913 --diag_suppress=111 --diag_suppress=186 --diag_suppress=611 \" --expt-extended-lambda") - elseif ( CUDA_VERSION_MAJOR EQUAL 10 AND CUDA_VERSION_MINOR EQUAL 2 ) - message("CUDA is compatible 10.2") - set(WARNING_SUPPRESSION_AND_OPTION_NVCC -Xcudafe 
"--display_error_number --diag_suppress=2976 --diag_suppress=2977 --diag_suppress=2978 --diag_suppress=2979 --diag_suppress=1835 --diag_suppress=611 --diag_suppress=186 --diag_suppress=128" --expt-extended-lambda) - set(WARNING_SUPPRESSION_AND_OPTION_NVCC_TEXT "-Xcudafe \"--display_error_number --diag_suppress=2976 --diag_suppress=2977 --diag_suppress=2978 --diag_suppress=2979 --diag_suppress=1835 --diag_suppress=611 --diag_suppress=186 --diag_suppress=128\" --expt-extended-lambda") - FILE(WRITE cuda_options "-Xcudafe \"--display_error_number --diag_suppress=2976 --diag_suppress=2977 --diag_suppress=2978 --diag_suppress=2979 --diag_suppress=1835 --diag_suppress=611 --diag_suppress=186 --diag_suppress=128\" --expt-extended-lambda") - elseif ( CUDA_VERSION_MAJOR EQUAL 11 AND CUDA_VERSION_MINOR EQUAL 0 ) - message("CUDA is compatible 11.0") - set(WARNING_SUPPRESSION_AND_OPTION_NVCC -Xcudafe "--display_error_number --diag_suppress=3056 --diag_suppress=3057 --diag_suppress=3058 --diag_suppress=3059 --diag_suppress=611 --diag_suppress=186 --diag_suppress=128" --expt-extended-lambda) - set(WARNING_SUPPRESSION_AND_OPTION_NVCC_TEXT "-Xcudafe \"--display_error_number --diag_suppress=3056 --diag_suppress=3057 --diag_suppress=3058 --diag_suppress=3059 --diag_suppress=611 --diag_suppress=186 --diag_suppress=128\" --expt-extended-lambda") - FILE(WRITE cuda_options "-Xcudafe \"--display_error_number --diag_suppress=3056 --diag_suppress=3058 --diag_suppress=3058 --diag_suppress=3059 --diag_suppress=611 --diag_suppress=186 --diag_suppress=128\" --expt-extended-lambda") - else() - message(FATAL_ERROR "CUDA is incompatible, version 9.2 10.1 10.2 and 11.0 is only supported") - endif() -endif() - set(Vc_DIR "${Vc_ROOT}/lib/cmake/Vc/") message("Searching Vc in ${Vc_DIR}") -find_package(Boost 1.68.0 COMPONENTS unit_test_framework iostreams program_options system filesystem) +find_package(Boost 1.72.0 COMPONENTS unit_test_framework iostreams program_options system filesystem 
OPTIONAL_COMPONENTS fiber context) find_package(MPI) find_package(PETSc) find_package(HDF5) find_package(Eigen3) find_package(LibHilbert) -find_package(Metis) -find_package(ParMetis) +find_package(METIS) +find_package(ParMETIS) find_package(TinyObjLoader ) find_package(BLAS) find_package(LAPACK) find_package(Eigen3) find_package(SuiteSparse OPTIONAL_COMPONENTS UMFPACK) find_package(Vc) +find_package(OpenMP) +find_package(HIP) + +set(CMAKE_SKIP_BUILD_RPATH TRUE) + +if(HIP_ENABLE AND HIP_FOUND) + set(DEFINE_HIP_GPU "#define HIP_GPU") + set(DEFINE_CUDIFY_USE_HIP "#define CUDIFY_USE_HIP") + file(WRITE hip_enabled 1) +else() + file(WRITE hip_enabled 0) +endif() + +if(HIP_ENABLE AND HIP_FOUND) + set(DEFINE_CUDA_GPU "#define CUDA_GPU") +endif() + +if (OPENMP_FOUND) + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") +endif() if(PROFILE_WITH_SCOREP) set(CMAKE_CXX_COMPILER_LAUNCHER "scorep") @@ -102,7 +115,8 @@ if(PROFILE_WITH_SCOREP) set(CMAKE_CUDA_COMPILER_LAUNCHER "scorep") endif() -if(CUDA_FOUND) + +if(ENABLE_GPU AND (CUDA_FOUND OR HIP_FOUND)) set(OPENFPM_INIT_FILE "initialize/initialize_wrapper_cuda.cu") else() set(OPENFPM_INIT_FILE "initialize/initialize_wrapper_cpu.cpp") @@ -138,10 +152,6 @@ if(TEST_PERFORMANCE) set(DEFINE_PERFORMANCE_TEST "#define PERFORMANCE_TEST") endif() -if(CUDA_FOUND) - set(DEFINE_CUDA_GPU "#define CUDA_GPU") -endif() - if (METIS_FOUND) set(DEFINE_HAVE_METIS "#define HAVE_METIS 1") else() @@ -156,7 +166,11 @@ else() message( FATAL_ERROR "ParMetis is required in order to install OpenFPM") endif() + if(MPI_FOUND) + get_filename_component(OPENFPM_MPI_DEP "${MPI_C_INCLUDE_DIRS}" DIRECTORY) + file(READ ${CMAKE_SOURCE_DIR}/src/cmake/openfpmConfig-configure.cmake CMAKE_OPENFPM_CONFIG_VARS) + file(WRITE ${CMAKE_SOURCE_DIR}/src/cmake/openfpmConfigVars-configure.cmake "${CMAKE_OPENFPM_CONFIG_VARS}\nset(CMAKE_PREFIX_PATH 
${OPENFPM_MPI_DEP}/)") set(DEFINE_HAVE_MPI "#define HAVE_MPI") else() file(WRITE error_code "200") @@ -164,15 +178,35 @@ else() endif() if (Boost_FOUND) - set(DEFINE_HAVE_BOOST "#define HAVE_BOOST") - set(DEFINE_HAVE_BOOST_IOSTREAMS "#define HAVE_BOOST_IOSTREAMS") - set(DEFINE_HAVE_BOOST_PROGRAM_OPTIONS "#define HAVE_BOOST_PROGRAM_OPTIONS") - set(DEFINE_HAVE_BOOST_UNIT_TEST_FRAMEWORK "#define HAVE_BOOST_UNIT_TEST_FRAMEWORK") + set(DEFINE_HAVE_BOOST "#define HAVE_BOOST") + set(DEFINE_HAVE_BOOST_IOSTREAMS "#define HAVE_BOOST_IOSTREAMS") + set(DEFINE_HAVE_BOOST_PROGRAM_OPTIONS "#define HAVE_BOOST_PROGRAM_OPTIONS") + set(DEFINE_HAVE_BOOST_UNIT_TEST_FRAMEWORK "#define HAVE_BOOST_UNIT_TEST_FRAMEWORK") + if (Boost_CONTEXT_FOUND) + set(DEFINE_HAVE_BOOST_CONTEXT "#define HAVE_BOOST_CONTEXT") + set(OPTIONAL_BOOST_LIBS "-lboost_context") + else() + #if context is not there CUDA_ON_CPU cannot be activated + set(CUDA_ON_CPU OFF) + endif() + if (Boost_FIBER_FOUND) + set(DEFINE_HAVE_BOOST_FIBER "#define HAVE_BOOST_FIBER") + string(CONCAT OPTIONAL_BOOST_LIBS ${OPTIONAL_BOOST_LIBS} " -lboost_fiber") + endif() + file(WRITE optional_boost_libs "${OPTIONAL_BOOST_LIBS}") else() file(WRITE error_code "202") message( FATAL_ERROR "BOOST is required in order to install OpenFPM" ) endif() +if(ENABLE_GPU AND CUDA_FOUND) + set(DEFINE_CUDA_GPU "#define CUDA_GPU") +endif() + +if(CUDA_ON_CPU) + set(DEFINE_CUDA_GPU "#define CUDA_GPU") +endif() + if(HDF5_FOUND) if (HDF5_IS_PARALLEL) set(DEFINE_HAVE_HDF5 "#define HAVE_HDF5") @@ -210,6 +244,14 @@ else() message( FATAL_ERROR "Vc is required in roder to install OpenFPM") endif() +if (ENABLE_GARBAGE_INJECTOR) + set(DEFINE_GARBAGE_INJECTOR "#define GARBAGE_INJECTOR") +endif() + +if (ENABLE_VCLUSTER_GARBAGE_INJECTOR) + set(DEFINE_VCLUSTER_GARBAGE_INJECTOR "#define VCLUSTER_GARBAGE_INJECTOR") +endif() + if(APPLE) set(DEFINE_HAVE_OSX "#define HAVE_OSX") endif() @@ -220,17 +262,109 @@ endif() file(WRITE error_code "0") file(WRITE cuda_lib 
"${CUDA_cudart_static_LIBRARY} ${CUDA_cudadevrt_LIBRARY}") -file(WRITE cuda_include "-I${CUDA_INCLUDE_DIRS}") +if(CUDA_ON_CPU) + file(WRITE cuda_include "-I${CUDA_INCLUDE_DIRS} -D__NVCC__ -DCUDART_VERSION=11000 -DCUDA_ON_CPU") + file(WRITE cuda_on_cpu "YES") +else() + file(WRITE cuda_include "-I${CUDA_INCLUDE_DIRS}") + file(WRITE cuda_on_cpu "NO") +endif() file(WRITE mpi_include "-I${MPI_C_INCLUDE_DIRS}") file(WRITE mpi_libs "${MPI_C_LINK_FLAGS} ${MPI_C_LIBRARIES}") -file(WRITE cuda_options "${WARNING_SUPPRESSION_AND_OPTION_NVCC_TEXT}") configure_file(${CMAKE_CURRENT_SOURCE_DIR}/src/config/config_cmake.h.in ${CMAKE_CURRENT_SOURCE_DIR}/src/config/config.h) -add_subdirectory (src) -add_subdirectory (openfpm_devices) -add_subdirectory (openfpm_data) add_subdirectory (openfpm_io) -add_subdirectory (openfpm_vcluster) add_subdirectory (openfpm_numerics) +file(WRITE cuda_options "${WARNING_SUPPRESSION_AND_OPTION_NVCC_TEXT}") + +add_subdirectory (src) + +#################### CPack to create auto installing packages + +include(InstallRequiredSystemLibraries) + +string(REPLACE "." 
";" VERSION_LIST ${openfpm_VERSION}) +list(GET VERSION_LIST 0 OPENFPM_VERSION_MAJOR) +list(GET VERSION_LIST 1 OPENFPM_VERSION_MINOR) +list(GET VERSION_LIST 2 OPENFPM_VERSION_PATCH) + +if (CPACK_RUN_INSTALL_DEPENDENCIES) + + set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "OpenFPM distributed data-structures") + set(CPACK_PACKAGE_VENDOR "IBirdSoft") + set(CPACK_PACKAGE_DESCRIPTION_FILE "${CMAKE_CURRENT_SOURCE_DIR}/README.txt") + set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/gpl-3.0.txt") + set(CPACK_PACKAGE_VERSION_MAJOR ${OPENFPM_VERSION_MAJOR}) + set(CPACK_PACKAGE_VERSION_MINOR ${OPENFPM_VERSION_MINOR}) + set(CPACK_PACKAGE_VERSION_PATCH ${OPENFPM_VERSION_PATCH}) + set(CPACK_PACKAGE_INSTALL_DIRECTORY "CMake ${CMake_VERSION_MAJOR}.${CMake_VERSION_MINOR}") + set(CPACK_PACKAGE_INSTALL_DIRECTORY /usr/local/openfpm) + set(CPACK_PACKAGING_INSTALL_PREFIX /usr/local/openfpm) + set(CPACK_PACKAGE_HOMEPAGE_URL http://openfpm.mpi-cbg.de) + set(CPACK_RPM_PACKAGE_AUTOREQPROV NO) + set(CPACK_PACKAGE_CONTACT incardon@mpi-cbg.de) + set(CPACK_DEBIAN_PACKAGE_MAINTAINER "Pietro Incardona") + set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE ${CMAKE_CURRENT_SOURCE_DIR}/src/scripts/postinst) + set(CPACK_POSTFLIGHT_OPENFPM_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/src/scripts/postflight) + set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_SOURCE_DIR}/src/scripts/postinst") + set(CPACK_RESOURCE_FILE_README "${CMAKE_CURRENT_SOURCE_DIR}/README.txt") + + install(FILES $ENV{DEP_PACKING}/openfpm_vars + DESTINATION source + COMPONENT OpenFPM) + + install(DIRECTORY $ENV{DEP_PACKING}/BOOST + DESTINATION dependencies/ + COMPONENT OpenFPM) + + install(DIRECTORY $ENV{DEP_PACKING}/EIGEN + DESTINATION dependencies/ + COMPONENT OpenFPM) + + install(DIRECTORY $ENV{DEP_PACKING}/HDF5 + DESTINATION dependencies/ + COMPONENT OpenFPM) + + install(DIRECTORY $ENV{DEP_PACKING}/LIBHILBERT + DESTINATION dependencies/ + COMPONENT OpenFPM) + + install(DIRECTORY $ENV{DEP_PACKING}/METIS + DESTINATION dependencies/ + COMPONENT 
OpenFPM) + + install(DIRECTORY $ENV{DEP_PACKING}/MPI + DESTINATION dependencies/ + COMPONENT OpenFPM) + + install(DIRECTORY $ENV{DEP_PACKING}/OPENBLAS + DESTINATION dependencies/ + COMPONENT OpenFPM) + + install(DIRECTORY $ENV{DEP_PACKING}/PARMETIS + DESTINATION dependencies/ + COMPONENT OpenFPM) + + install(DIRECTORY $ENV{DEP_PACKING}/PETSC + DESTINATION dependencies/ + COMPONENT OpenFPM) + + install(DIRECTORY $ENV{DEP_PACKING}/SUITESPARSE + DESTINATION dependencies/ + COMPONENT OpenFPM) + + install(DIRECTORY $ENV{DEP_PACKING}/VCDEVEL + DESTINATION dependencies/ + COMPONENT OpenFPM) + +endif() + +include(CPack) + +cpack_add_component(OpenFPM + DISPLAY_NAME OpenFPM + DESCRIPTION "OpenFPM binary files") + + diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..01b4977e08a09a9040e8108b10aad9f4809eab31 --- /dev/null +++ b/Makefile @@ -0,0 +1,17 @@ +all: + $(MAKE) -C build $@ + +clean: + $(MAKE) -C build $@ + +install: + $(MAKE) -C build $@ + script/install_parallel_debugger + +pdata: + $(MAKE) -C build $@ + +numerics: + $(MAKE) -C build $@ + +.PHONY: all clean install diff --git a/README.txt b/README.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d10c8605e4414c054827bfb5d7adc3200dcd397 --- /dev/null +++ b/README.txt @@ -0,0 +1,25 @@ +# OpenFPM + +OpenFPM is a scalable and open C++ framework for particles and mesh simulation + +You can build scalable: Molecular dynamic,SPH, Granular flow,Finite differences, Particle Mesh codes on CPUs and GPUs. We provide examples for each case here. Despite our main interest in simulations the distributed data-structures OpenFPM provides are not limited to simulations. 
The work is released under GPL 3.0 + + +If you use OpenFPM please cite this paper + +https://www.sciencedirect.com/science/article/pii/S0010465519300852?via%3Dihub + +## Installation + +To install, run ./install and follow the instructions + +At the end of the installation remember to run make install + +# Examples and documentation + +Examples and documentation can be found at openfpm.mpi-cbg.de + + + + + diff --git a/build.sh b/build.sh index f565a95193ec51969c7adf67c7c81c63e9b83910..f26c0a6d1ccad1aae8b45701d2cdb31d514bbd20 100755 --- a/build.sh +++ b/build.sh @@ -30,6 +30,7 @@ if [ x"$hostname" == x"cifarm-ubuntu-node" ]; then # rm -rf $HOME/openfpm_dependencies/openfpm_pdata/$branch/ echo "Ubuntu node" ./install_MPI_mpich.sh $HOME/openfpm_dependencies/openfpm_pdata/$branch/ 4 + export PATH="/opt/bin:$PATH" fi if [ x"$hostname" == x"cifarm-mac-node.mpi-cbg.de" ]; then @@ -44,6 +45,9 @@ if [ x"$hostname" == x"falcon1" ]; then if [ x"$comp_type" == x"intel" ]; then module load parallel_studio_xe/2019u1 dependency_dir=/projects/ppm/rundeck/openfpm_dependencies_intel/ + elif [ x"$with_gpu" == x"" ]; then + mkdir /projects/ppm/rundeck/openfpm_dependencies_${branch}_no_cuda/ + dependency_dir=/projects/ppm/rundeck/openfpm_dependencies_${branch}_no_cuda/ else mkdir /projects/ppm/rundeck/openfpm_dependencies_$branch/ dependency_dir=/projects/ppm/rundeck/openfpm_dependencies_$branch/ @@ -72,31 +76,37 @@ mkdir openfpm_numerics/src/config echo "Compiling general" source ~/.bashrc - -installation_dir="--prefix=$HOME/openfpm_install/$branch" + +if [ x"$comp_type" != x"full" ]; then + installation_dir=" " +else + installation_dir="--prefix=$HOME/openfpm_install/$branch" +fi # force ssh to not use HostKey verification #echo "StrictHostKeyChecking=no" > $HOME/.ssh/config #chmod 600 $HOME/.ssh/config -foward_options= +foward_options="--enable-cuda-on-cpu" install_options= if [ x"$comp_type" == x"full" ]; then - install_options="-s" + install_options="-s " elif [ x"$comp_type" == 
x"intel" ]; then - install_options=" " + install_options="-s " else - install_options="-s -m" + install_options="-s -m " fi if [ x"$comp_type" == x"se_class" ]; then - foward_options="--enable-se-class1 --with-action-on-error=THROW_ON_ERROR" + foward_options="$foward_options --enable-se-class1 --with-action-on-error=THROW_ON_ERROR" +elif [ x"$comp_type" == x"garbageinjv" ]; then + foward_options="$foward_options --enable-garbageinjv" elif [ x"$comp_type" == x"asan" ]; then foward_options="$foward_options --enable-asan" fi echo "Installing with: ./install $gpu_support -i $dependency_dir $install_options -c \"$installation_dir $foward_options \" " -./install $gpu_support -i $dependency_dir $install_options -c "$installation_dir $foward_options " +nice -n 19 ./install $gpu_support -i $dependency_dir $install_options -c "$installation_dir $foward_options " if [ $? -ne 0 ]; then echo "Fail to ./install" exit 1 ; @@ -104,16 +114,24 @@ fi # Check of we have to do a make install if [ x"$comp_type" == x"full" ]; then + mv $HOME/openfpm_vars $HOME/openfpm_vars_$branch make install + if [ x"$?" 
!= x"0" ]; then + exit 1 + fi else echo "Make install partial" - mv $HOME/openfpm_vars $HOME/openfpm_vars_$branch + if [ x"$comp_type" == x"intel" ]; then + mv $HOME/openfpm_vars $HOME/openfpm_vars_intel + else + mv $HOME/openfpm_vars $HOME/openfpm_vars_$branch + fi source $HOME/openfpm_vars_$branch if [ x"$hostname" == x"suitcase" ]; then echo "Running make on 1 cores" make VERBOSE=1 -j 1 else - make VERBOSE=1 -j 8 + nice -n 19 make VERBOSE=1 -j 8 fi fi diff --git a/cmake_modules/FindMetis.cmake b/cmake_modules/FindMETIS.cmake similarity index 100% rename from cmake_modules/FindMetis.cmake rename to cmake_modules/FindMETIS.cmake diff --git a/cmake_modules/FindParMetis.cmake b/cmake_modules/FindParMETIS.cmake similarity index 100% rename from cmake_modules/FindParMetis.cmake rename to cmake_modules/FindParMETIS.cmake diff --git a/configure b/configure index 14c562cfe2797169944c0825802751cda6b033ca..7853c20d07d671ea69ae78490bebac5dea937ed3 100755 --- a/configure +++ b/configure @@ -100,6 +100,7 @@ enable_debug with_metis with_hdf5 with_libhilbert +enable_cuda_on_cpu enable_scan_coverty enable_test_performance enable_test_coverage @@ -107,6 +108,7 @@ with_parmetis enable_se_class1 enable_se_class2 enable_se_class3 +with_alpaka with_action_on_error with_boost with_boost_libdir @@ -120,7 +122,10 @@ with_petsc with_eigen with_vcdevel enable_gpu +enable_hip enable_asan +enable_garbageinj +enable_garbageinjv ' rm -rf build @@ -237,20 +242,34 @@ do scan_coverty) conf_options="$conf_options -DSCAN_COVERTY=ON" ;; + cuda_on_cpu) + conf_options="$conf_options -DCUDA_ON_CPU=ON" + ;; test_performance) conf_options="$conf_options -DTEST_PERFORMANCE=ON" ;; gpu) - if [ x"$CXX" == x"" ]; then + conf_options="$conf_options -DCMAKE_CUDA_HOST_COMPILER=$(which mpic++) " + if [ x"$CXXCUDA" == x"" ]; then conf_options="$conf_options" else - conf_options="$conf_options -DCMAKE_CUDA_HOST_COMPILER=$(which $CXX)" + conf_options="$conf_options -DCMAKE_CUDA_COMPILER=$(which $CXXCUDA)" fi 
conf_options="$conf_options -DENABLE_GPU=ON" ;; + hip) + conf_options="$conf_options -DHIP_ENABLE=ON -DENABLE_GPU=ON" + enable_hip_conf=1 + ;; asan) conf_options="$conf_options -DENABLE_ASAN=ON" ;; + garbageinj) + conf_options="$conf_options -DENABLE_GARBAGE_INJECTOR=ON" + ;; + garbageinjv) + conf_options="$conf_options -DENABLE_VCLUSTER_GARBAGE_INJECTOR=ON" + ;; *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--enable-$ac_useropt_orig" ac_unrecognized_sep=', ' ;; @@ -493,6 +512,9 @@ do vcdevel) conf_options="$conf_options -DVc_ROOT=$ac_optarg" ;; + alpaka) + conf_options="$conf_options -DALPAKA_ROOT=$ac_optarg" + ;; *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--with-$ac_useropt_orig" ac_unrecognized_sep=', ';; esac @@ -564,7 +586,12 @@ fi cd build ## remove enerything +if [ x"$enable_hip_conf" != x"1" ]; then + conf_options="$conf_options -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpic++" +fi + echo "Calling cmake ../. $conf_options" +printf '%s' "cmake ../. $conf_options" > cmake_build_options rm ../error_code DYLD_LIBRARY_PATH=$ld_lib_pathopt cmake ../. $conf_options if [ $? 
!= 0 ]; then @@ -593,6 +620,7 @@ clean: install: \$(MAKE) -C build \$@ + script/install_parallel_debugger pdata: \$(MAKE) -C build \$@ diff --git a/example/Grid/0_simple/Makefile b/example/Grid/0_simple/Makefile index 26bd7de19ab527d116d6ee67307b07d016111c44..3310e13cd60792dacaa16edd0721e562db1fb3d4 100644 --- a/example/Grid/0_simple/Makefile +++ b/example/Grid/0_simple/Makefile @@ -7,7 +7,7 @@ LDIR = OBJ = main.o %.o: %.cpp - $(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH) grid: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) @@ -15,7 +15,7 @@ grid: $(OBJ) all: grid run: all - mpirun -np 2 ./grid + mpirun --oversubscribe -np 2 ./grid .PHONY: clean all run diff --git a/example/Grid/1_stencil/Makefile b/example/Grid/1_stencil/Makefile index bb7b0ec2e27f77da6ea3ebccae9f34147e128bf4..94a73917d47e6e112f60c4b6e549929ccaf26e3c 100644 --- a/example/Grid/1_stencil/Makefile +++ b/example/Grid/1_stencil/Makefile @@ -7,7 +7,7 @@ LDIR = OBJ = main.o %.o: %.cpp - $(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH) stencil: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) @@ -15,7 +15,7 @@ stencil: $(OBJ) all: stencil run: all - mpirun -np 3 ./stencil + mpirun --oversubscribe -np 3 ./stencil .PHONY: clean all run diff --git a/example/Grid/2_solve_eq/Makefile b/example/Grid/2_solve_eq/Makefile index 9f4b7160aaedb1c193c95ca1d4bdf594176e6a97..cab7ab23bacccffa070046aa36d775ec8c0152a4 100644 --- a/example/Grid/2_solve_eq/Makefile +++ b/example/Grid/2_solve_eq/Makefile @@ -7,7 +7,7 @@ LDIR = OBJ = main.o %.o: %.cpp - $(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH) periodic: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) @@ -15,7 +15,7 @@ periodic: $(OBJ) all: periodic run: all - mpirun -np 4 ./periodic + mpirun --oversubscribe -np 4 ./periodic .PHONY: clean all run diff --git a/example/Grid/3_gray_scott/Makefile 
b/example/Grid/3_gray_scott/Makefile index 170e428569ba200362dfa7bbc4a5d533f4006696..35f9eb4aff157ab02d66c25c171bf79dc440e209 100644 --- a/example/Grid/3_gray_scott/Makefile +++ b/example/Grid/3_gray_scott/Makefile @@ -7,7 +7,7 @@ LDIR = OBJ = main.o %.o: %.cpp - $(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH) gray_scott: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) @@ -15,7 +15,7 @@ gray_scott: $(OBJ) all: gray_scott run: all - mpirun -np 4 ./gray_scott + mpirun --oversubscribe -np 4 ./gray_scott .PHONY: clean all run diff --git a/example/Grid/3_gray_scott_3d/Makefile b/example/Grid/3_gray_scott_3d/Makefile index 04db7e20fa78c9a519f254e71d61f27e47df69f8..d27878771713269caf29430989fdd7d1fcd9510b 100644 --- a/example/Grid/3_gray_scott_3d/Makefile +++ b/example/Grid/3_gray_scott_3d/Makefile @@ -7,7 +7,7 @@ LDIR = OBJ = main.o %.o: %.cpp - $(CC) -O3 -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH) gray_scott: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) @@ -15,7 +15,7 @@ gray_scott: $(OBJ) all: gray_scott run: all - mpirun -np 4 ./gray_scott + mpirun --oversubscribe -np 4 ./gray_scott .PHONY: clean all run diff --git a/example/Grid/3_gray_scott_3d/main.cpp b/example/Grid/3_gray_scott_3d/main.cpp index a72029fc694b2ae24f3b560e45c1a78d6e305b40..82ea1fd77b2cde05b6954fc8e63d608791104a02 100644 --- a/example/Grid/3_gray_scott_3d/main.cpp +++ b/example/Grid/3_gray_scott_3d/main.cpp @@ -230,7 +230,7 @@ int main(int argc, char* argv[]) // visualization if (i % 500 == 0) { - Old.save("output_" + std::to_string(count)); +// Old.save("output_" + std::to_string(count)); count++; } } diff --git a/example/Grid/3_gray_scott_3d_vectorization/Makefile b/example/Grid/3_gray_scott_3d_vectorization/Makefile index 4993b0c8aed55bfb12cfa516360601b7d1ca1ade..525a4be088c34ac37e0a8f85840b9d86cd46fc39 100644 --- a/example/Grid/3_gray_scott_3d_vectorization/Makefile +++ 
b/example/Grid/3_gray_scott_3d_vectorization/Makefile @@ -10,7 +10,7 @@ OBJ = main.o update_new.o mpif90 -ffree-line-length-none -fno-range-check -fno-second-underscore -fimplicit-none -mavx -O3 -c -g -o $@ $< %.o: %.cpp - $(CC) -O3 -mavx -g -c --std=c++11 -Wno-ignored-attributes -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -mavx -g -c --std=c++14 -Wno-ignored-attributes -o $@ $< $(INCLUDE_PATH) gray_scott: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) @@ -18,7 +18,7 @@ gray_scott: $(OBJ) all: gray_scott run: all - mpirun -np 4 ./gray_scott + mpirun --oversubscribe -np 4 ./gray_scott .PHONY: clean all run diff --git a/example/Numerics/PS-CMA-ES/Makefile b/example/Numerics/PS-CMA-ES/Makefile index 73ca664f745ea72ea51936a457c3afc17a572bb0..9771a523c8c76fc374491bcf899cb8c2b2d1e47d 100644 --- a/example/Numerics/PS-CMA-ES/Makefile +++ b/example/Numerics/PS-CMA-ES/Makefile @@ -7,7 +7,7 @@ LDIR = OBJ = main.o %.o: %.cpp - $(CC) -I/usr/local/cuda/include -O3 -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -I/usr/local/cuda/include -O3 -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH) ps_cma_es: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) @@ -15,7 +15,7 @@ ps_cma_es: $(OBJ) all: ps_cma_es run: all - mpirun -np 2 ./ps_cma_es + mpirun --oversubscribe -np 2 ./ps_cma_es .PHONY: clean all run diff --git a/example/Numerics/PSE/0_Derivative_approx_1D/Makefile b/example/Numerics/PSE/0_Derivative_approx_1D/Makefile index da76a97b081f678675d3c9ca416664098dcdd671..a0931408a15e36353f4d6e078c4ae0188eeb672d 100644 --- a/example/Numerics/PSE/0_Derivative_approx_1D/Makefile +++ b/example/Numerics/PSE/0_Derivative_approx_1D/Makefile @@ -7,7 +7,7 @@ LDIR = OBJ = main.o %.o: %.cpp - $(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH) pse_1d: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) diff --git a/example/Numerics/PSE/1_Derivative_approx_1D_mp/Makefile b/example/Numerics/PSE/1_Derivative_approx_1D_mp/Makefile index 
a89794a539aa80a4703f3d8a138b3eff72242635..b71fe61f2a212a0eaaee58c17be6abb23467dac2 100644 --- a/example/Numerics/PSE/1_Derivative_approx_1D_mp/Makefile +++ b/example/Numerics/PSE/1_Derivative_approx_1D_mp/Makefile @@ -10,7 +10,7 @@ LDIR = OBJ_128 = main_float128.o %.o: %.cpp - $(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH) # pse_1d_128: $(OBJ_128) # $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) -lquadmath diff --git a/example/Numerics/PSE/1_Diffusion_1D/Makefile b/example/Numerics/PSE/1_Diffusion_1D/Makefile index 502affc0c3379bdd1e290d47424388bf2aa333ba..a1485efa1ab9e9729f9486c0444075fc9c60cd2c 100644 --- a/example/Numerics/PSE/1_Diffusion_1D/Makefile +++ b/example/Numerics/PSE/1_Diffusion_1D/Makefile @@ -7,7 +7,7 @@ LDIR = OBJ = main.o %.o: %.cpp - $(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH) diff_1d: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) @@ -15,7 +15,7 @@ diff_1d: $(OBJ) all: diff_1d run: all - mpirun -np 4 ./diff_1d + mpirun --oversubscribe -np 4 ./diff_1d .PHONY: clean all diff --git a/example/Numerics/Stoke_flow/0_2D_incompressible/Makefile b/example/Numerics/Stoke_flow/0_2D_incompressible/Makefile index 1378bd05620783aa79a98e4e354aa19131c86921..7fd1a1819764a5cef54dbd74b751e6fd6b7829d3 100644 --- a/example/Numerics/Stoke_flow/0_2D_incompressible/Makefile +++ b/example/Numerics/Stoke_flow/0_2D_incompressible/Makefile @@ -8,7 +8,7 @@ OBJ_EIGEN = main_eigen.o OBJ_PETSC = main_petsc.o %.o: %.cpp - $(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH) all: stokes_2d_eigen stokes_2d_petsc @@ -19,7 +19,7 @@ stokes_2d_petsc: $(OBJ_PETSC) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) run: all - mpirun -np 3 ./stokes_2d_eigen && mpirun -np 3 ./stokes_2d_petsc + mpirun --oversubscribe -np 3 ./stokes_2d_eigen && mpirun --oversubscribe -np 3 ./stokes_2d_petsc .PHONY: clean all diff --git 
a/example/Numerics/Stoke_flow/1_3D_incompressible/Makefile b/example/Numerics/Stoke_flow/1_3D_incompressible/Makefile index 5186ac565a358b610c6ed1df82c9dd83fb310bd4..0b6058ade2e92cd24fd22fcf7fdec6c3fa528d4e 100644 --- a/example/Numerics/Stoke_flow/1_3D_incompressible/Makefile +++ b/example/Numerics/Stoke_flow/1_3D_incompressible/Makefile @@ -8,7 +8,7 @@ OBJ_EIGEN = main_eigen.o OBJ_PETSC = main_petsc.o %.o: %.cpp - $(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH) all: stokes_3d_eigen stokes_3d_petsc @@ -19,7 +19,7 @@ stokes_3d_petsc: $(OBJ_PETSC) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) run: all - mpirun -np 3 ./stokes_3d_eigen && mpirun -np 3 ./stokes_3d_petsc + mpirun --oversubscribe -np 3 ./stokes_3d_eigen && mpirun --oversubscribe -np 3 ./stokes_3d_petsc .PHONY: clean all run diff --git a/example/Numerics/Sussman_redistancing/example_sussman_circle/Makefile b/example/Numerics/Sussman_redistancing/example_sussman_circle/Makefile index 495aa0d896a024280e1d64e7bff9eb0681ba0c6e..78f653991dbbe9895941ed52ff7dc9b1e87fcfb0 100644 --- a/example/Numerics/Sussman_redistancing/example_sussman_circle/Makefile +++ b/example/Numerics/Sussman_redistancing/example_sussman_circle/Makefile @@ -7,7 +7,7 @@ LDIR = OBJ = main.o %.o: %.cpp - $(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH) example_sussman_circle: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) @@ -15,7 +15,7 @@ example_sussman_circle: $(OBJ) all: example_sussman_circle run: all - mpirun -np 2 ./example_sussman_circle + mpirun --oversubscribe -np 2 ./example_sussman_circle .PHONY: clean all run diff --git a/example/Numerics/Sussman_redistancing/example_sussman_circle/main.cpp b/example/Numerics/Sussman_redistancing/example_sussman_circle/main.cpp index 5ae63429601373b5f6888592f0187fd08fe999e4..b19546737a5117277bb1417dd2de5128db88093e 100644 --- 
a/example/Numerics/Sussman_redistancing/example_sussman_circle/main.cpp +++ b/example/Numerics/Sussman_redistancing/example_sussman_circle/main.cpp @@ -76,7 +76,6 @@ * Once we have received the Phi_SDF from the redistancing, particles can be placed on narrow band around the interface. * * * Creates filled 2D circle with -1/+1 indicator function - * * Runs gaussian filter if gradient at interface too steep or if user sets redist_options.sigma >= 1 * * Runs Sussman redistancing (see @ref RedistancingSussman.hpp) * * Places particles on narrow band around interface * @@ -226,10 +225,6 @@ int main(int argc, char* argv[]) * For the redistancing, we can choose some options. These options will then be passed bundled as a structure to * the redistancing function. Setting these options is optional, since they all have a Default value as well. In * particular the following options can be set by the user: - * * \p sigma: Sigma of the gaussian kernel, which is used for gaussian smooting Phi_0. If the initial gradient of - * phi_0 at the interface is too large and no sigma is chosen or chosen too small, gauss smoothing will - * automatically be applied until phi gradient magnitude <= 12, regardless of which sigma is chosen by - * the user. Default = 0. * * \p min_iter: Minimum number of iterations before steady state in narrow band will be checked (Default: 100). * * \p max_iter: Maximum number of iterations you want to run the redistancing, even if steady state might not yet * have been reached (Default: 1e6). @@ -260,7 +255,6 @@ int main(int argc, char* argv[]) // Now we want to convert the initial Phi into a signed distance function (SDF) with magnitude of gradient = 1. // For the initial re-distancing we use the Sussman method. First of all, we can set some redistancing options. 
Redist_options redist_options; - redist_options.sigma = 0; redist_options.min_iter = 100; redist_options.max_iter = 10000; @@ -330,7 +324,8 @@ int main(int argc, char* argv[]) // the magnitude of the gradient typedef aggregate<double, double[grid_dim], double> props_nb; typedef vector_dist<grid_dim, double, props_nb> vd_type; - vd_type vd_narrow_band(0, box, bc, ghost); + Ghost<grid_dim, double> ghost_vd(0); + vd_type vd_narrow_band(0, box, bc, ghost_vd); vd_narrow_band.setPropNames({"Phi_SDF", "Phi_grad", "Phi_magnOfGrad"}); //! @cond [Initialize narrow band] @endcond diff --git a/example/Numerics/Sussman_redistancing/example_sussman_images_2D/Makefile b/example/Numerics/Sussman_redistancing/example_sussman_images_2D/Makefile index 54c26c019feace480d5d52e1b9c0ab6c45a9b0b2..4ad44cdf7ade19319cdd041820ea386d5a9a2590 100644 --- a/example/Numerics/Sussman_redistancing/example_sussman_images_2D/Makefile +++ b/example/Numerics/Sussman_redistancing/example_sussman_images_2D/Makefile @@ -7,7 +7,7 @@ LDIR = OBJ = main.o %.o: %.cpp - $(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH) example_sussman_images: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) @@ -15,7 +15,7 @@ example_sussman_images: $(OBJ) all: example_sussman_images run: all - mpirun -np 2 ./example_sussman_images + mpirun --oversubscribe -np 2 ./example_sussman_images .PHONY: clean all run diff --git a/example/Numerics/Sussman_redistancing/example_sussman_images_2D/main.cpp b/example/Numerics/Sussman_redistancing/example_sussman_images_2D/main.cpp index 18574a4b329457ebfa2d43352b6e392f57a7e04e..70668cee7273fe42abc0272232813a798dd1e025 100644 --- a/example/Numerics/Sussman_redistancing/example_sussman_images_2D/main.cpp +++ b/example/Numerics/Sussman_redistancing/example_sussman_images_2D/main.cpp @@ -197,7 +197,6 @@ int main(int argc, char* argv[]) // For the initial re-distancing we use the Sussman method // 1.) 
Set some redistancing options Redist_options redist_options; - redist_options.sigma = 0; // if the initial gradient of phi at the interface is too large, gauss smoothing will automatically be applied until phi gradient magnitude <= 12, regardless of the given sigma in the main redist_options.min_iter = 100; // min. number of iterations before steady state in narrow band will be checked (default: 100) redist_options.max_iter = 10000; // max. number of iterations you want to run the // redistancing, even if steady state might not yet have been reached (default: 1e6) @@ -229,7 +228,8 @@ int main(int argc, char* argv[]) // the magnitude of the gradient typedef aggregate<double, double[grid_dim], double> props_nb; typedef vector_dist<grid_dim, double, props_nb> vd_type; - vd_type vd_narrow_band(0, box, bc, ghost); + Ghost<grid_dim, double> ghost_vd(0); + vd_type vd_narrow_band(0, box, bc, ghost_vd); vd_narrow_band.setPropNames({"Phi_SDF", "Phi_grad", "Phi_magnOfGrad"}); NarrowBand<grid_in_type> narrowBand(g_dist, redist_options.width_NB_in_grid_points); // Instantiation of NarrowBand class diff --git a/example/Numerics/Sussman_redistancing/example_sussman_images_3D/Makefile b/example/Numerics/Sussman_redistancing/example_sussman_images_3D/Makefile index 54c26c019feace480d5d52e1b9c0ab6c45a9b0b2..4ad44cdf7ade19319cdd041820ea386d5a9a2590 100644 --- a/example/Numerics/Sussman_redistancing/example_sussman_images_3D/Makefile +++ b/example/Numerics/Sussman_redistancing/example_sussman_images_3D/Makefile @@ -7,7 +7,7 @@ LDIR = OBJ = main.o %.o: %.cpp - $(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH) example_sussman_images: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) @@ -15,7 +15,7 @@ example_sussman_images: $(OBJ) all: example_sussman_images run: all - mpirun -np 2 ./example_sussman_images + mpirun --oversubscribe -np 2 ./example_sussman_images .PHONY: clean all run diff --git 
a/example/Numerics/Sussman_redistancing/example_sussman_images_3D/main.cpp b/example/Numerics/Sussman_redistancing/example_sussman_images_3D/main.cpp index 7e229fb9513b4086da19f6124c240aba75de44fb..fcd1cbdd1aefa4fab0c24cdcb76b14b382948364 100644 --- a/example/Numerics/Sussman_redistancing/example_sussman_images_3D/main.cpp +++ b/example/Numerics/Sussman_redistancing/example_sussman_images_3D/main.cpp @@ -191,7 +191,6 @@ int main(int argc, char* argv[]) // For the initial re-distancing we use the Sussman method // 1.) Set some redistancing options Redist_options redist_options; - redist_options.sigma = 0; // if the initial gradient of phi at the interface is too large, gauss smoothing will automatically be applied until phi gradient magnitude <= 12, regardless of the given sigma in the main redist_options.min_iter = 100; // min. number of iterations before steady state in narrow band will be checked (default: 100) redist_options.max_iter = 10000; // max. number of iterations you want to run the // redistancing, even if steady state might not yet @@ -225,8 +224,9 @@ int main(int argc, char* argv[]) // the magnitude of the gradient typedef aggregate<double, double[grid_dim], double> props_nb; typedef vector_dist<grid_dim, double, props_nb> vd_type; - vd_type vd_narrow_band(0, box, bc, ghost); - vd_narrow_band.setPropNames({"Phi_SDF", "Phi_grad", "Phi_magnOfGrad"}); + Ghost<grid_dim, double> ghost_vd(0); + vd_type vd_narrow_band(0, box, bc, ghost_vd); + vd_narrow_band.setPropNames({"Phi_SDF", "Phi_grad", "Phi_magnOfGrad"}); NarrowBand<grid_in_type> narrowBand(g_dist, redist_options.width_NB_in_grid_points); // Instantiation of NarrowBand class diff --git a/example/Numerics/Sussman_redistancing/example_sussman_sphere/Makefile b/example/Numerics/Sussman_redistancing/example_sussman_sphere/Makefile index 714fdcb07f0f4ba0c2b68c3afa616f7637b5846f..eb52eba53cd7592ebc8cc29eb5d6a2b0591e50b4 100644 --- a/example/Numerics/Sussman_redistancing/example_sussman_sphere/Makefile +++ 
b/example/Numerics/Sussman_redistancing/example_sussman_sphere/Makefile @@ -7,7 +7,7 @@ LDIR = OBJ = main.o %.o: %.cpp - $(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH) example_sussman_sphere: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) @@ -15,7 +15,7 @@ example_sussman_sphere: $(OBJ) all: example_sussman_sphere run: all - mpirun -np 2 ./example_sussman_sphere + mpirun --oversubscribe -np 2 ./example_sussman_sphere .PHONY: clean all run diff --git a/example/Numerics/Sussman_redistancing/example_sussman_sphere/main.cpp b/example/Numerics/Sussman_redistancing/example_sussman_sphere/main.cpp index c3286722a15f8f24667e3595029190922e55b308..392f9dc36abd5b3e5d690bd90ff40186d7381c61 100644 --- a/example/Numerics/Sussman_redistancing/example_sussman_sphere/main.cpp +++ b/example/Numerics/Sussman_redistancing/example_sussman_sphere/main.cpp @@ -35,7 +35,6 @@ * Once we have received the Phi_SDF from the redistancing, particles can be placed on narrow band around the interface. * * * Creates filled 3D sphere with -1/+1 indicator function - * * Runs gaussian filter if gradient at interface too steep or if user sets redist_options.sigma >= 1 * * Runs Sussman redistancing (see @ref RedistancingSussman.hpp) * * Places particles on narrow band around interface * @@ -187,10 +186,6 @@ int main(int argc, char* argv[]) * For the redistancing, we can choose some options. These options will then be passed bundled as a structure to * the redistancing function. Setting these options is optional, since they all have a Default value as well. In * particular the following options can be set by the user: - * * \p sigma: Sigma of the gaussian kernel, which is used for gaussian smooting Phi_0. 
If the initial gradient of - * phi_0 at the interface is too large and no sigma is chosen or chosen too small, gauss smoothing will - * automatically be applied until phi gradient magnitude <= 12, regardless of which sigma is chosen by - * the user. Default = 0. * * \p min_iter: Minimum number of iterations before steady state in narrow band will be checked (Default: 100). * * \p max_iter: Maximum number of iterations you want to run the redistancing, even if steady state might not yet * have been reached (Default: 1e6). @@ -221,9 +216,8 @@ int main(int argc, char* argv[]) // Now we want to convert the initial Phi into a signed distance function (SDF) with magnitude of gradient = 1. // For the initial re-distancing we use the Sussman method. First of all, we can set some redistancing options. Redist_options redist_options; - redist_options.sigma = 0; redist_options.min_iter = 100; - redist_options.max_iter = 10000; + redist_options.max_iter = 1000; redist_options.convTolChange.value = 1e-12; redist_options.convTolChange.check = true; @@ -290,7 +284,8 @@ int main(int argc, char* argv[]) // the magnitude of the gradient typedef aggregate<double, double[grid_dim], double> props_nb; typedef vector_dist<grid_dim, double, props_nb> vd_type; - vd_type vd_narrow_band(0, box, bc, ghost); + Ghost<grid_dim, double> ghost_vd(0); + vd_type vd_narrow_band(0, box, bc, ghost_vd); vd_narrow_band.setPropNames({"Phi_SDF", "Phi_grad", "Phi_magnOfGrad"}); //! 
@cond [Initialize narrow band] @endcond diff --git a/example/Numerics/Vortex_in_cell/Makefile b/example/Numerics/Vortex_in_cell/Makefile index 0bebd2fc1a1d03617988ffba29ffec48ef2f0255..109cf30c80190734366c589cdfa43b78a48e239b 100644 --- a/example/Numerics/Vortex_in_cell/Makefile +++ b/example/Numerics/Vortex_in_cell/Makefile @@ -14,7 +14,7 @@ vic_petsc_test: OPT += -DTEST_RUN vic_petsc_test: vic_petsc vic_petsc_opt %.o: %.cpp - $(CC) -O3 $(OPT) -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH) vic_petsc_opt: $(OBJ_VIC_PETSC_OPT) $(CC) -o $@ $^ $(LIBS_PATH) $(LIBS) @@ -23,7 +23,7 @@ vic_petsc: $(OBJ_VIC_PETSC) $(CC) -o $@ $^ $(LIBS_PATH) $(LIBS) run: vic_petsc_test - mpirun -np 4 ./vic_petsc && mpirun -np 4 ./vic_petsc_opt + mpirun --oversubscribe -np 4 ./vic_petsc && mpirun --oversubscribe -np 4 ./vic_petsc_opt .PHONY: clean all diff --git a/example/Numerics/Vortex_in_cell/main_vic_petsc_opt.cpp b/example/Numerics/Vortex_in_cell/main_vic_petsc_opt.cpp index 5c4d29a81579e50baa687071f2d78f281ba18cbd..0eb8c9d43ab591f2ce1eee4304ed1090f279edc5 100644 --- a/example/Numerics/Vortex_in_cell/main_vic_petsc_opt.cpp +++ b/example/Numerics/Vortex_in_cell/main_vic_petsc_opt.cpp @@ -505,9 +505,9 @@ template<typename grid> void calc_rhs(grid & g_vort, grid & g_vel, grid & g_dwp) // calculate several pre-factors for the stencil finite // difference - float fac1 = 1.0f*nu/(g_vort.spacing(0)*g_vort.spacing(0)); - float fac2 = 1.0f*nu/(g_vort.spacing(1)*g_vort.spacing(1)); - float fac3 = 1.0f*nu/(g_vort.spacing(2)*g_vort.spacing(2)); + float fac1 = 2.0f*nu/(g_vort.spacing(0)*g_vort.spacing(0)); + float fac2 = 2.0f*nu/(g_vort.spacing(1)*g_vort.spacing(1)); + float fac3 = 2.0f*nu/(g_vort.spacing(2)*g_vort.spacing(2)); float fac4 = 0.5f/(g_vort.spacing(0)); float fac5 = 0.5f/(g_vort.spacing(1)); diff --git a/example/Plot/0_simple_graph/Makefile b/example/Plot/0_simple_graph/Makefile index 
e2885ae2a21a84bd8bb036284446869a47318c85..e8f9e113e526ff1b5110d2551599850a6c6b6417 100644 --- a/example/Plot/0_simple_graph/Makefile +++ b/example/Plot/0_simple_graph/Makefile @@ -7,7 +7,7 @@ LDIR = OBJ = main.o %.o: %.cpp - $(CC) -O3 -g3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -g3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH) plot: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) diff --git a/example/SparseGrid/1_gray_scott_3d_sparse/Makefile b/example/SparseGrid/1_gray_scott_3d_sparse/Makefile index 192287e8e9fad169168e1b4d50d20ea0e1c66ede..f1b36f4371e58cf581469ab6a349aa96f2a45b23 100644 --- a/example/SparseGrid/1_gray_scott_3d_sparse/Makefile +++ b/example/SparseGrid/1_gray_scott_3d_sparse/Makefile @@ -9,7 +9,7 @@ gray_scott_sparse_test: OPT += -DTEST_RUN gray_scott_sparse_test: gray_scott_sparse %.o: %.cpp - $(CC) -O3 -g $(OPT) -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -g $(OPT) -c --std=c++14 -o $@ $< $(INCLUDE_PATH) gray_scott_sparse: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) @@ -17,7 +17,7 @@ gray_scott_sparse: $(OBJ) all: gray_scott_sparse run: gray_scott_sparse_test - mpirun -np 4 ./gray_scott_sparse + mpirun --oversubscribe -np 4 ./gray_scott_sparse .PHONY: clean all run diff --git a/example/SparseGrid/1_gray_scott_3d_sparse/main.cpp b/example/SparseGrid/1_gray_scott_3d_sparse/main.cpp index 872edcb83abb8fb30a11bde967f3436f6d0585ac..f26892a96a14a064139c9ef7fcae07d755c5f29a 100644 --- a/example/SparseGrid/1_gray_scott_3d_sparse/main.cpp +++ b/example/SparseGrid/1_gray_scott_3d_sparse/main.cpp @@ -1,9 +1,10 @@ /*! \page SparseGrid SparseGrid * * \subpage Grid_3_gs_3D_sparse - * \subpage Grid_3_gs_3D_sparse_cs - * \subpage Grid_3_gs_3D_sparse_opt * \subpage Grid_3_gs_3D_sparse_gpu + * \subpage Grid_3_gs_3D_sparse_opt + * \subpage Grid_3_gs_3D_sparse_gpu_opt + * \subpage Grid_3_gs_3D_sparse_cs * \subpage Grid_3_gs_3D_sparse_gpu_cs * */ @@ -14,14 +15,13 @@ /*! 
* - * \page Grid_3_gs_3D_sparse Gray Scott in 3D using sparse grids + * \page Grid_3_gs_3D_sparse Gray Scott in 3D using sparse grids * * [TOC] * - * # Solving a gray scott-system in 3D using Sparse grids# {#e3_gs_gray_scott_sparse} + * # Solving a gray scott-system in 3D using Sparse grids on GPU optimized # {#e3_gs_gray_scott_sparse_gpu_opt} * - * This example show how to solve a Gray-Scott system in 3D using sparse grids in this case we well use a more - * complex geometry + * This example shows how to solve a Gray-Scott system in 3D using sparse grids on gpu (optimized). The problem is the same as \ref Grid_3_gs_3D * * In figure is the final solution of the problem * @@ -34,16 +34,7 @@ * \see \ref Grid_3_gs_3D * * - * We recall here the main differences between sparse and dense. - * - * * **get** function return now constant values, so cannot be used to get values, a get in write is an insert - * a get on a point position that has not been inserted return the background value - * - * * **insert** function create/overwrite the points value - * - * * **getDomainIterator** return an iterator on the existing points - * - * * **getGridIterator** return an iterator on the dense version of the grid + * * * * diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_gpu/Makefile b/example/SparseGrid/1_gray_scott_3d_sparse_gpu/Makefile index c0c9bee799cd7ff6adb7939c66f76eb09e6495ae..d815f8d56145b1bbe9bb064359549e4943477684 100644 --- a/example/SparseGrid/1_gray_scott_3d_sparse_gpu/Makefile +++ b/example/SparseGrid/1_gray_scott_3d_sparse_gpu/Makefile @@ -3,15 +3,27 @@ include ../../example.mk ### internally the example disable with the preprocessor its code if not compiled with nvcc CUDA_CC= CUDA_CC_LINK= -ifeq (, $(shell which nvcc)) +ifdef CUDA_ON_CPU CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) INCLUDE_PATH_NVCC= CUDA_CC_LINK=mpic++ + CUDA_OPTIONS=-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000 + LIBS_SELECT=$(LIBS_CUDA_ON_CPU) else - CUDA_CC=nvcc
-ccbin=mpic++ - CUDA_CC_LINK=nvcc -ccbin=mpic++ + ifeq (, $(shell which nvcc)) + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) + INCLUDE_PATH_NVCC= + CUDA_CC_LINK=mpic++ + CUDA_OPTIONS= + else + CUDA_CC=nvcc -ccbin=mpic++ + CUDA_CC_LINK=nvcc -ccbin=mpic++ + CUDA_OPTIONS=-use_fast_math -arch=sm_61 -lineinfo + endif + LIBS_SELECT=$(LIBS) endif + gray_scott_sparse_gpu_test: OPT += -DTEST_RUN gray_scott_sparse_gpu_test: gray_scott_sparse_gpu @@ -25,12 +37,12 @@ OBJ = main.o $(CUDA_CC) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC) gray_scott_sparse_gpu: $(OBJ) - $(CUDA_CC_LINK) -o $@ $^ $(LIBS_PATH) $(LIBS) + $(CUDA_CC_LINK) -o $@ $^ $(LIBS_PATH) $(LIBS_SELECT) all: gray_scott_sparse_gpu run: gray_scott_sparse_gpu_test - mpirun -np 4 ./gray_scott_sparse_gpu + mpirun --oversubscribe -np 4 ./gray_scott_sparse_gpu .PHONY: clean all run diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu b/example/SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu index 1ccd9d4a9dfa4692e9b868294fdbe21a07289555..794f5bc0815106c18ae19a2a69034a5fed835bd5 100644 --- a/example/SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu +++ b/example/SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu @@ -1,4 +1,3 @@ -#include "util/cuda/cuda_launch.hpp" #include "Grid/grid_dist_id.hpp" #include "data_type/aggregate.hpp" #include "timer.hpp" @@ -73,8 +72,12 @@ constexpr int x = 0; constexpr int y = 1; constexpr int z = 2; +//! \cond [grid definition] \endcond + typedef sgrid_dist_id_gpu<3,float,aggregate<float,float,float,float> > SparseGridType; +//! \cond [grid definition] \endcond + void init(SparseGridType & grid, Box<3,float> & domain) { //! 
\cond [create points] \endcond diff --git a/example/SparseGrid/2_gray_scott_3d_sparse_gpu_opt/Makefile b/example/SparseGrid/2_gray_scott_3d_sparse_gpu_opt/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..7c80e2810f9bc30ccf4df3c880e5d1a58876fca8 --- /dev/null +++ b/example/SparseGrid/2_gray_scott_3d_sparse_gpu_opt/Makefile @@ -0,0 +1,52 @@ +include ../../example.mk + +### internally the example disable with the preprocessor its code if not compiled with nvcc +CUDA_CC= +CUDA_CC_LINK= +ifdef CUDA_ON_CPU + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) + INCLUDE_PATH_NVCC= + CUDA_CC_LINK=mpic++ + CUDA_OPTIONS=-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000 + LIBS_SELECT=$(LIBS_CUDA_ON_CPU) +else + ifeq (, $(shell which nvcc)) + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) + INCLUDE_PATH_NVCC= + CUDA_CC_LINK=mpic++ + CUDA_OPTIONS= + else + CUDA_CC=nvcc -ccbin=mpic++ + CUDA_CC_LINK=nvcc -ccbin=mpic++ + CUDA_OPTIONS=-use_fast_math -arch=sm_61 -lineinfo + endif + LIBS_SELECT=$(LIBS) +endif + + + +gray_scott_sparse_gpu_test: OPT += -DTEST_RUN +gray_scott_sparse_gpu_test: gray_scott_sparse_gpu + +CC=mpic++ + +LDIR = + +OBJ = main.o + +%.o: %.cu + $(CUDA_CC) $(CUDA_OPTIONS) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC) + +gray_scott_sparse_gpu: $(OBJ) + $(CUDA_CC_LINK) -o $@ $^ $(LIBS_PATH) $(LIBS_SELECT) + +all: gray_scott_sparse_gpu + +run: gray_scott_sparse_gpu_test + mpirun --oversubscribe -np 4 ./gray_scott_sparse_gpu + +.PHONY: clean all run + +clean: + rm -f *.o *~ core gray_scott_sparse_gpu + diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_gpu_cs/config.cfg b/example/SparseGrid/2_gray_scott_3d_sparse_gpu_opt/config.cfg similarity index 100% rename from example/SparseGrid/1_gray_scott_3d_sparse_gpu_cs/config.cfg rename to example/SparseGrid/2_gray_scott_3d_sparse_gpu_opt/config.cfg diff --git a/example/SparseGrid/2_gray_scott_3d_sparse_gpu_opt/main.cu b/example/SparseGrid/2_gray_scott_3d_sparse_gpu_opt/main.cu new file mode 100644 index
0000000000000000000000000000000000000000..116a76a96fd3279ad56a451a33655a3f2d1400e0 --- /dev/null +++ b/example/SparseGrid/2_gray_scott_3d_sparse_gpu_opt/main.cu @@ -0,0 +1,286 @@ +//#define VCLUSTER_PERF_REPORT <------ Activate telemetry for the VCluster data-structure +//#define SYNC_BEFORE_TAKE_TIME <------ Force synchronization of the kernels every time we take the time with the structure timer. +// Use this option for telemetry and GPU otherwise the results are unreliable +//#define ENABLE_GRID_DIST_ID_PERF_STATS <------ Activate telemetry for the grid data-structure + +#include "Decomposition/Distribution/BoxDistribution.hpp" +#include "Grid/grid_dist_id.hpp" +#include "data_type/aggregate.hpp" +#include "timer.hpp" + +/*! + * + * \page Grid_3_gs_3D_sparse_gpu_opt Gray Scott in 3D using sparse grids on GPU (Optimized) + * + * [TOC] + * + * # Solving a gray scott-system in 3D using Sparse grids on gpu (Optimized) # {#e3_gs_gray_scott_gpu} + * + * This example shows how to solve a Gray-Scott system in 3D using sparse grids on gpu + * + * In figure is the final solution of the problem + * + * \htmlonly + * <img src="http://ppmcore.mpi-cbg.de/web/images/examples/gray_scott_3d/gs_alpha.png"/> + * \endhtmlonly + * + * More or less this example is the adaptation of the dense example in 3D + * + * \see \ref Grid_3_gs_3D + * + * # Optimizations + * + * Instead of using the default decomposition algorithm based on parmetis we use BoxDistribution. This decomposition divides the space equally + * across processors. The way to use a different algorithm for decomposing the sparse grid is given by changing the type of the Sparse grid + * + * \snippet SparseGrid/2_gray_scott_3d_sparse_gpu_opt/main.cu grid definition + * + * Because the geometry is fixed we are also using the option SKIP_LABELLING. With this option active after a normal ghost_get we are able to + * activate certain optimization patterns in constructions of the sending buffers and merging data.
+ * + */ +#ifdef __NVCC__ + +constexpr int U = 0; +constexpr int V = 1; + +constexpr int U_next = 2; +constexpr int V_next = 3; + +constexpr int x = 0; +constexpr int y = 1; +constexpr int z = 2; + +//! \cond [grid definition] \endcond + +typedef CartDecomposition<3,float, CudaMemory, memory_traits_inte, BoxDistribution<3,float> > Dec; + +typedef sgrid_dist_id_gpu<3,float,aggregate<float,float,float,float>,CudaMemory, Dec> SparseGridType; + +//! \cond [grid definition] \endcond + +void init(SparseGridType & grid, Box<3,float> & domain) +{ + //! \cond [create points] \endcond + + typedef typename GetAddBlockType<SparseGridType>::type InsertBlockT; + + grid.addPoints([] __device__ (int i, int j, int k) + { + return true; + }, + [] __device__ (InsertBlockT & data, int i, int j, int k) + { + data.template get<U>() = 1.0; + data.template get<V>() = 0.0; + } + ); + + + grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE); + + //! \cond [create points] \endcond + + long int x_start = grid.size(0)*1.55f/domain.getHigh(0); + long int y_start = grid.size(1)*1.55f/domain.getHigh(1); + long int z_start = grid.size(2)*1.55f/domain.getHigh(2); + + long int x_stop = grid.size(0)*1.85f/domain.getHigh(0); + long int y_stop = grid.size(1)*1.85f/domain.getHigh(1); + long int z_stop = grid.size(2)*1.85f/domain.getHigh(2); + + //! \cond [create points sub] \endcond + + grid_key_dx<3> start({x_start,y_start,z_start}); + grid_key_dx<3> stop ({x_stop,y_stop,z_stop}); + + grid.addPoints(start,stop,[] __device__ (int i, int j, int k) + { + return true; + }, + [] __device__ (InsertBlockT & data, int i, int j, int k) + { + data.template get<U>() = 0.5; + data.template get<V>() = 0.24; + } + ); + + grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE); + + //!
\cond [create points sub] \endcond +} + + +int main(int argc, char* argv[]) +{ + openfpm_init(&argc,&argv); + + // domain + Box<3,float> domain({0.0,0.0,0.0},{2.5,2.5,2.5}); + + // grid size + size_t sz[3] = {256,256,256}; + + // Define periodicity of the grid + periodicity<3> bc = {PERIODIC,PERIODIC,PERIODIC}; + + // Ghost in grid unit + Ghost<3,long int> g(1); + + // deltaT + float deltaT = 0.25; + + // Diffusion constant for specie U + float du = 2*1e-5; + + // Diffusion constant for specie V + float dv = 1*1e-5; + + // Number of timesteps +#ifdef TEST_RUN + size_t timeSteps = 300; +#else + size_t timeSteps = 15000; +#endif + + // K and F (Physical constant in the equation) + float K = 0.053; + float F = 0.014; + + SparseGridType grid(sz,domain,g,bc); + + // spacing of the grid on x and y + float spacing[3] = {grid.spacing(0),grid.spacing(1),grid.spacing(2)}; + + init(grid,domain); + + // sync the ghost + grid.template ghost_get<U,V>(RUN_ON_DEVICE); + + // because we assume that spacing[x] == spacing[y] we use formula 2 + // and we calculate the prefactor of Eq 2 + float uFactor = deltaT * du/(spacing[x]*spacing[x]); + float vFactor = deltaT * dv/(spacing[x]*spacing[x]); + + auto & v_cl = create_vcluster(); + + timer tot_sim; + tot_sim.start(); + + for (size_t i = 0; i < timeSteps ; ++i) + { + if (v_cl.rank() == 0) + {std::cout << "STEP: " << i << std::endl;} +/* if (i % 300 == 0) + { + std::cout << "STEP: " << i << std::endl; + grid.write_frame("out",i,VTK_WRITER); + }*/ + + //! \cond [stencil get and use] \endcond + + typedef typename GetCpBlockType<decltype(grid),0,1>::type CpBlockType; + + //! 
\cond [lambda] \endcond + + auto func = [uFactor,vFactor,deltaT,F,K] __device__ (float & u_out, float & v_out, + CpBlockType & u, CpBlockType & v, + int i, int j, int k){ + + float uc = u(i,j,k); + float vc = v(i,j,k); + + u_out = uc + uFactor *(u(i-1,j,k) + u(i+1,j,k) + + u(i,j-1,k) + u(i,j+1,k) + + u(i,j,k-1) + u(i,j,k+1) - 6.0f*uc) - deltaT * uc*vc*vc + - deltaT * F * (uc - 1.0f); + + + v_out = vc + vFactor *(v(i-1,j,k) + v(i+1,j,k) + + v(i,j+1,k) + v(i,j-1,k) + + v(i,j,k-1) + v(i,j,k+1) - 6.0f*vc) + deltaT * uc*vc*vc + - deltaT * (F+K) * vc; + }; + + //! \cond [lambda] \endcond + + //! \cond [body] \endcond + + if (i % 2 == 0) + { + cudaDeviceSynchronize(); + timer tconv; + tconv.start(); + grid.conv2<U,V,U_next,V_next,1>({0,0,0},{(long int)sz[0]-1,(long int)sz[1]-1,(long int)sz[2]-1},func); + cudaDeviceSynchronize(); + tconv.stop(); + std::cout << "Conv " << tconv.getwct() << std::endl; + + // After copy we synchronize again the ghost part U and V + + grid.ghost_get<U_next,V_next>(RUN_ON_DEVICE | SKIP_LABELLING); + } + else + { + grid.conv2<U_next,V_next,U,V,1>({0,0,0},{(long int)sz[0]-1,(long int)sz[1]-1,(long int)sz[2]-1},func); + + // After copy we synchronize again the ghost part U and V + grid.ghost_get<U,V>(RUN_ON_DEVICE | SKIP_LABELLING); + } + + //! \cond [body] \endcond + + // Every 500 time step we output the configuration for + // visualization +// if (i % 500 == 0) +// { +// grid.save("output_" + std::to_string(count)); +// count++; +// } + } + + tot_sim.stop(); + std::cout << "Total simulation: " << tot_sim.getwct() << std::endl; + + grid.deviceToHost<U,V,U_next,V_next>(); + grid.write("final"); + + //! \cond [time stepping] \endcond + + /*! + * \page Grid_3_gs_3D_sparse Gray Scott in 3D + * + * ## Finalize ## + * + * Deinitialize the library + * + * \snippet Grid/3_gray_scott/main.cpp finalize + * + */ + + //! \cond [finalize] \endcond + + openfpm_finalize(); + + //! \cond [finalize] \endcond + + /*! 
+ * \page Grid_3_gs_3D_sparse Gray Scott in 3D + * + * # Full code # {#code} + * + * \include Grid/3_gray_scott_3d/main.cpp + * + */ +} + +#else + +int main(int argc, char* argv[]) +{ + return 0; +} + +#endif + diff --git a/example/SparseGrid/2_gray_scott_3d_sparse_opt/Makefile b/example/SparseGrid/2_gray_scott_3d_sparse_opt/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..9119f712c01f3220515c3e2d2cc348dabb5489d9 --- /dev/null +++ b/example/SparseGrid/2_gray_scott_3d_sparse_opt/Makefile @@ -0,0 +1,30 @@ +include ../../example.mk + +CC=mpic++ + +LDIR = + +OBJ = main.o +OBJ_FLOAT = main_float.o +gray_scott_sparse_opt_test: OPT += -DTEST_RUN +gray_scott_sparse_opt_test: gray_scott_sparse_opt + +%.o: %.cpp + $(CC) -mavx -O3 -g $(OPT) -c --std=c++14 -o $@ $< $(INCLUDE_PATH) + +gray_scott_sparse_opt: $(OBJ) + $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) + +gray_scott_sparse_opt_float: $(OBJ_FLOAT) + $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) + +all: gray_scott_sparse_opt gray_scott_sparse_opt_float + +run: gray_scott_sparse_opt_test + mpirun --oversubscribe -np 4 ./gray_scott_sparse_opt + +.PHONY: clean all run + +clean: + rm -f *.o *~ core gray_scott_sparse_opt gray_scott_sparse_opt_float + diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_cs/config.cfg b/example/SparseGrid/2_gray_scott_3d_sparse_opt/config.cfg similarity index 100% rename from example/SparseGrid/1_gray_scott_3d_sparse_cs/config.cfg rename to example/SparseGrid/2_gray_scott_3d_sparse_opt/config.cfg diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp b/example/SparseGrid/2_gray_scott_3d_sparse_opt/main.cpp similarity index 92% rename from example/SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp rename to example/SparseGrid/2_gray_scott_3d_sparse_opt/main.cpp index 93d90c53f196e65f7e4779c4743ca59209dc612d..6c90b1d81f4a32e22bcebe7e9af8618d987e7a20 100644 --- a/example/SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp +++ b/example/SparseGrid/2_gray_scott_3d_sparse_opt/main.cpp @@ -1,5
+1,3 @@ - -#include "util/cuda/cuda_launch.hpp" #include "Grid/grid_dist_id.hpp" #include "data_type/aggregate.hpp" #include "timer.hpp" @@ -28,7 +26,7 @@ * * Two optimization has been done. The first is to change the layout to struct of arrays defining the grid with * - * \snippet SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp grid definition + * \snippet SparseGrid/2_gray_scott_3d_sparse_opt/main.cpp grid definition * * The second is using the function **conv_cross2** to calculate the right-hand-side * this function can be used to do a convolution that involve points in a cross stencil like in figure that involve @@ -44,8 +42,8 @@ \endverbatim * - * The function accept a lambda function where the first 2 arguments are the output in form of Vc::double_v. If we use float - * we have to use Vc::float_v or Vc::int_v in case the property is an integer. Vc variables come from the Vc library that is + * The function accept a lambda function where the first 2 arguments are the output in form of Vc::double_v. If we use double + * we have to use Vc::double_v or Vc::int_v in case the property is an integer. Vc variables come from the Vc library that is * now integrated in openfpm. 
* *\htmlonly @@ -62,11 +60,11 @@ * * The lambda function is defined as * - * \snippet SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp lambda + * \snippet SparseGrid/2_gray_scott_3d_sparse_opt/main.cpp lambda * * and used in the body loop * - * \snippet SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp body + * \snippet SparseGrid/2_gray_scott_3d_sparse_opt/main.cpp body * * To note that instead of copy we split the properties where we are acting at every iteration * @@ -202,7 +200,7 @@ int main(int argc, char* argv[]) auto func = [uFactor,vFactor,deltaT,F,K](Vc::double_v & u_out,Vc::double_v & v_out, Vc::double_v & u,Vc::double_v & v, - cross_stencil_v & uc,cross_stencil_v & vc, + cross_stencil_v<double> & uc,cross_stencil_v<double> & vc, unsigned char * mask){ u_out = u + uFactor *(uc.xm + uc.xp + @@ -222,7 +220,12 @@ int main(int argc, char* argv[]) if (i % 2 == 0) { + + timer ts; + ts.start(); grid.conv_cross2<U,V,U_next,V_next,1>({0,0,0},{(long int)sz[0]-1,(long int)sz[1]-1,(long int)sz[2]-1},func); + ts.stop(); + std::cout << ts.getwct() << std::endl; // After copy we synchronize again the ghost part U and V grid.ghost_get<U_next,V_next>(); @@ -260,7 +263,7 @@ int main(int argc, char* argv[]) * * Deinitialize the library * - * \snippet SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp finalize + * \snippet SparseGrid/2_gray_scott_3d_sparse_opt/main.cpp finalize * */ diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_cs/Makefile b/example/SparseGrid/3_gray_scott_3d_sparse_cs/Makefile similarity index 76% rename from example/SparseGrid/1_gray_scott_3d_sparse_cs/Makefile rename to example/SparseGrid/3_gray_scott_3d_sparse_cs/Makefile index 159c4698f378d6109e77fb3c893891262d119be8..8de8ab607e994f1d656eeb1a87898b8f39bf6fcc 100644 --- a/example/SparseGrid/1_gray_scott_3d_sparse_cs/Makefile +++ b/example/SparseGrid/3_gray_scott_3d_sparse_cs/Makefile @@ -9,7 +9,7 @@ gray_scott_sparse_cs_test: OPT += -DTEST_RUN gray_scott_sparse_cs_test: gray_scott_sparse_cs %.o: %.cpp - 
$(CC) -O3 -g $(OPT) -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -g $(OPT) -c --std=c++14 -o $@ $< $(INCLUDE_PATH) gray_scott_sparse_cs: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) @@ -17,7 +17,7 @@ gray_scott_sparse_cs: $(OBJ) all: gray_scott_sparse_cs run: gray_scott_sparse_cs_test - mpirun -np 4 ./gray_scott_sparse_cs + mpirun --oversubscribe -np 4 ./gray_scott_sparse_cs .PHONY: clean all run diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_opt/config.cfg b/example/SparseGrid/3_gray_scott_3d_sparse_cs/config.cfg similarity index 100% rename from example/SparseGrid/1_gray_scott_3d_sparse_opt/config.cfg rename to example/SparseGrid/3_gray_scott_3d_sparse_cs/config.cfg diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_cs/main.cpp b/example/SparseGrid/3_gray_scott_3d_sparse_cs/main.cpp similarity index 98% rename from example/SparseGrid/1_gray_scott_3d_sparse_cs/main.cpp rename to example/SparseGrid/3_gray_scott_3d_sparse_cs/main.cpp index 9b468421dfece788ba622ea1c56c0335a1f1193f..98444f92f0f165ed6f2a88b5a6d4883f791f19e5 100644 --- a/example/SparseGrid/1_gray_scott_3d_sparse_cs/main.cpp +++ b/example/SparseGrid/3_gray_scott_3d_sparse_cs/main.cpp @@ -51,18 +51,18 @@ * The initialization involve the creation of 3 sphere and one cylinder channel connecting them in order to do it we * create an iterator over the grid (inserted and not inserted) point with **getGridIterator** * - * \snippet SparseGrid/1_gray_scott_3d_sparse_cs/main.cpp init sphere channel + * \snippet SparseGrid/3_gray_scott_3d_sparse_cs/main.cpp init sphere channel * * After creating the domain we make a perturbation in the up sphere * - * \snippet SparseGrid/1_gray_scott_3d_sparse_cs/main.cpp perturbation + * \snippet SparseGrid/3_gray_scott_3d_sparse_cs/main.cpp perturbation * * # Boundary conditions * * For this example we use mirror on direction X Y Z If the point is missing. 
If the point is missing in both direction than * the second derivative is considered zero * - * \snippet SparseGrid/1_gray_scott_3d_sparse_cs/main.cpp boundary condition + * \snippet SparseGrid/3_gray_scott_3d_sparse_cs/main.cpp boundary condition * */ diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_gpu_cs/Makefile b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs/Makefile similarity index 65% rename from example/SparseGrid/1_gray_scott_3d_sparse_gpu_cs/Makefile rename to example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs/Makefile index 50373464fbc7429a088540b11eeff1bfa3dd6f29..5fde3c84c1318de6c32db3d79f46e1f5f0fe1809 100644 --- a/example/SparseGrid/1_gray_scott_3d_sparse_gpu_cs/Makefile +++ b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs/Makefile @@ -3,13 +3,19 @@ include ../../example.mk ### internally the example disable with the preprocessor its code if not compiled with nvcc CUDA_CC= CUDA_CC_LINK= -ifeq (, $(shell which nvcc)) - CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) +ifdef CUDA_ON_CPU + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) INCLUDE_PATH_NVCC= CUDA_CC_LINK=mpic++ else - CUDA_CC=nvcc -ccbin=mpic++ - CUDA_CC_LINK=nvcc -ccbin=mpic++ + ifeq (, $(shell which nvcc)) + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) + INCLUDE_PATH_NVCC= + CUDA_CC_LINK=mpic++ + else + CUDA_CC=nvcc -ccbin=mpic++ + CUDA_CC_LINK=nvcc -ccbin=mpic++ + endif endif gray_scott_sparse_gpu_test: OPT += -DTEST_RUN @@ -30,7 +36,7 @@ gray_scott_sparse_gpu: $(OBJ) all: gray_scott_sparse_gpu run: gray_scott_sparse_gpu_test - mpirun -np 4 ./gray_scott_sparse_gpu + mpirun --oversubscribe -np 4 ./gray_scott_sparse_gpu .PHONY: clean all run diff --git a/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs/config.cfg b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs/config.cfg new file mode 100644 index 0000000000000000000000000000000000000000..699be429e147cd40187be6ce345ef2f060f59fbc --- /dev/null +++ b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs/config.cfg @@ -0,0 +1,2 @@ +[pack] +files = main.cu 
Makefile diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_gpu_cs/main.cu b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs/main.cu similarity index 99% rename from example/SparseGrid/1_gray_scott_3d_sparse_gpu_cs/main.cu rename to example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs/main.cu index 529ca24572c707c5469d086d2abaa628170a9d5c..cc621f0bd8e7833f04a9e31d4628c423b6c7c486 100644 --- a/example/SparseGrid/1_gray_scott_3d_sparse_gpu_cs/main.cu +++ b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs/main.cu @@ -83,7 +83,7 @@ void init(sgrid_type & grid, Box<3,double> & domain) vp.get(1) = pc.get(2)*u.get(0) - pc.get(0)*u.get(2); vp.get(2) = pc.get(0)*u.get(1) - pc.get(1)*u.get(0); - double distance = vp.norm() / sqrt(3); + double distance = vp.norm() / sqrt(3.0f); // Check if the point is in the domain if (sph1.isInside(pc) || sph2.isInside(pc) || sph3.isInside(pc) || (distance < 0.1 && channel_box.isInside(pc)) ) diff --git a/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/Makefile b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..15133370060405ef2c77e59de99de4d9ecd6eac5 --- /dev/null +++ b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/Makefile @@ -0,0 +1,52 @@ +include ../../example.mk + +### internally the example disable with the preprocessor its code if not compiled with nvcc + +CUDA_CC= +CUDA_CC_LINK= +ifdef CUDA_ON_CPU + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) + INCLUDE_PATH_NVCC= + CUDA_CC_LINK=mpic++ + CUDA_OPTIONS=-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000 + LIBS_SELECT=$(LIBS_CUDA_ON_CPU) +else + ifeq (, $(shell which nvcc)) + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) + INCLUDE_PATH_NVCC= + CUDA_CC_LINK=mpic++ + CUDA_OPTIONS= + else + CUDA_CC=nvcc -ccbin=mpic++ + CUDA_CC_LINK=nvcc -ccbin=mpic++ + CUDA_OPTIONS=-use_fast_math -arch=sm_61 -lineinfo + endif + LIBS_SELECT=$(LIBS) +endif + + +gray_scott_sparse_gpu_test: OPT += -DTEST_RUN 
+gray_scott_sparse_gpu_test: gray_scott_sparse_gpu + +CC=mpic++ + +LDIR = + +OBJ = main.o + +%.o: %.cu + $(CUDA_CC) $(OPT) -O3 -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC) + +gray_scott_sparse_gpu: $(OBJ) + $(CUDA_CC_LINK) -o $@ $^ $(LIBS_PATH) $(LIBS_SELECT) + +all: gray_scott_sparse_gpu + +run: gray_scott_sparse_gpu_test + mpirun --oversubscribe -np 4 ./gray_scott_sparse_gpu + +.PHONY: clean all run + +clean: + rm -f *.o *~ core gray_scott_sparse_gpu + diff --git a/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/config.cfg b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/config.cfg new file mode 100644 index 0000000000000000000000000000000000000000..699be429e147cd40187be6ce345ef2f060f59fbc --- /dev/null +++ b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/config.cfg @@ -0,0 +1,2 @@ +[pack] +files = main.cu Makefile diff --git a/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/main.cu b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/main.cu new file mode 100644 index 0000000000000000000000000000000000000000..71e6019c76f15121e03378b86b5a7733db0a84bd --- /dev/null +++ b/example/SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/main.cu @@ -0,0 +1,544 @@ +//#define VCLUSTER_PERF_REPORT +//#define SYNC_BEFORE_TAKE_TIME +//#define ENABLE_GRID_DIST_ID_PERF_STATS +#include "Grid/grid_dist_id.hpp" +#include "data_type/aggregate.hpp" +#include "timer.hpp" + +/*! 
+ * + * \page Grid_3_gs_3D_sparse_gpu_cs_opt Gray Scott in 3D using sparse grids on gpu in complex geometry + * + * [TOC] + * + * # Solving a gray scott-system in 3D using Sparse grids# {#e3_gs_gray_scott} + * + * This example show how to solve a Gray-Scott system in 3D using sparse grids on gpu with complex geometry + * + * In figure is the final solution of the problem + * + * \htmlonly +<table border="1" bgcolor="black"> + <tr> + <td> + <img src="http://ppmcore.mpi-cbg.de/web/images/examples/1_gray_scott_3d_sparse_cs/gs_3d_sparse_cs_section.png" style="width: 500px;" /> + </td> + <td> + <img src="http://ppmcore.mpi-cbg.de/web/images/examples/1_gray_scott_3d_sparse_cs/gs_3d_sparse_cs.png" style="width: 500px;" /> + </td> + </tr> +</table> +\endhtmlonly + * + * More or less this example is the same of \ref e3_gs_gray_scott_cs on gpu using what we learned in \ref e3_gs_gray_scott_gpu + * + * + */ + +#ifdef __NVCC__ + +constexpr int U = 0; +constexpr int V = 1; +constexpr int U_next = 2; +constexpr int V_next = 3; + +typedef sgrid_dist_id_gpu<3,double,aggregate<double,double,double,double> > sgrid_type; + +void init(sgrid_type & grid, Box<3,double> & domain) +{ + auto it = grid.getGridIterator(); + Point<3,double> p[8]= {{0.35,0.35,0.35}, + {0.35,2.0,2.0}, + {2.0,0.35,2.0}, + {2.0,2.0,0.35}, + {0.35,0.35,2.0}, + {0.35,2.0,0.35}, + {2.0,0.35,0.35}, + {2.0,2.0,2.0}}; + + +// Point<3,double> u({1.0,0.0,0.0}); +// Box<3,double> channel_box(p3,p1); + + double spacing_x = grid.spacing(0); + double spacing_y = grid.spacing(1); + double spacing_z = grid.spacing(2); + + typedef typename GetAddBlockType<sgrid_type>::type InsertBlockT; + + // Draw spheres + for (int i = 0 ; i < 8 ; i++) + { + Sphere<3,double> sph(p[i],0.3); + + Box<3,size_t> bx; + + for (int i = 0 ; i < 3 ; i++) + { + bx.setLow(i,(size_t)((sph.center(i) - 0.31)/grid.spacing(i))); + bx.setHigh(i,(size_t)((sph.center(i) + 0.31)/grid.spacing(i))); + } + + 
grid.addPoints(bx.getKP1(),bx.getKP2(),[spacing_x,spacing_y,spacing_z,sph] __device__ (int i, int j, int k) + { + Point<3,double> pc({i*spacing_x,j*spacing_y,k*spacing_z}); + + // Check if the point is in the domain + if (sph.isInside(pc) ) + {return true;} + + return false; + }, + [] __device__ (InsertBlockT & data, int i, int j, int k) + { + data.template get<U>() = 1.0; + data.template get<V>() = 0.0; + } + ); + + grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE); + grid.removeUnusedBuffers(); + } + + //channels + + Box<3,double> b({0.25,0.25,0.25},{2.1,2.1,2.1}); + + for (int k = 0 ; k < 3 ; k++) + { + for (int s = 0 ; s < 2 ; s++) + { + for (int i = 0 ; i < 2 ; i++) + { + Point<3,double> u({1.0*(((s+i)%2) == 0 && k != 2),1.0*(((s+i+1)%2) == 0 && k != 2),(k == 2)*1.0}); + Point<3,double> c({(i == 0)?0.35:2.0,(s == 0)?0.35:2.0,(k == 0)?0.35:2.0}); + + Box<3,size_t> bx; + + for (int i = 0 ; i < 3 ; i++) + { + if (c[i] == 2.0) + { + if (u[i] == 1.0) + { + bx.setLow(i,(size_t)(0.34/grid.spacing(i))); + bx.setHigh(i,(size_t)(2.01/grid.spacing(i))); + } + else + { + bx.setLow(i,(size_t)((c[i] - 0.11)/grid.spacing(i))); + bx.setHigh(i,(size_t)((c[i] + 0.11)/grid.spacing(i))); + } + } + else + { + if (u[i] == 1.0) + { + bx.setLow(i,(size_t)(0.34/grid.spacing(i))); + bx.setHigh(i,(size_t)(2.01/grid.spacing(i))); + } + else + { + bx.setLow(i,(size_t)((c[i] - 0.11)/grid.spacing(i))); + bx.setHigh(i,(size_t)((c[i] + 0.11)/grid.spacing(i))); + } + } + } + + grid.addPoints(bx.getKP1(),bx.getKP2(),[spacing_x,spacing_y,spacing_z,u,c,b] __device__ (int i, int j, int k) + { + Point<3,double> pc({i*spacing_x,j*spacing_y,k*spacing_z}); + Point<3,double> pcs({i*spacing_x,j*spacing_y,k*spacing_z}); + Point<3,double> vp; + + // shift + pc -= c; + + // calculate the distance from the diagonal + vp.get(0) = pc.get(1)*u.get(2) - pc.get(2)*u.get(1); + vp.get(1) = pc.get(2)*u.get(0) - pc.get(0)*u.get(2); + vp.get(2) = pc.get(0)*u.get(1) - pc.get(1)*u.get(0); + + double 
distance = vp.norm(); + + // Check if the point is in the domain + if (distance < 0.1 && b.isInside(pcs) == true ) + {return true;} + + return false; + }, + [] __device__ (InsertBlockT & data, int i, int j, int k) + { + data.template get<U>() = 1.0; + data.template get<V>() = 0.0; + } + ); + + grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE); + grid.removeUnusedBuffers(); + } + } + } + + // cross channel + + int s = 0; + for (int s = 0 ; s < 2 ; s++) + { + for (int i = 0 ; i < 2 ; i++) + { + Point<3,double> c({(i == 0)?0.35:2.0,(s == 0)?0.35:2.0,0.35}); + Point<3,double> u({(i == 0)?1.0:-1.0,(s == 0)?1.0:-1.0,1.0}); + + Box<3,size_t> bx; + + for (int k = 0 ; k < 16; k++) + { + for (int s = 0 ; s < 3 ; s++) + { + if (u[s] > 0.0) + { + bx.setLow(s,(c[s] + k*(u[s]/9.0))/grid.spacing(s) ); + bx.setHigh(s,(c[s] + (k+3)*(u[s]/9.0) )/ grid.spacing(s) ); + } + else + { + bx.setLow(s,(c[s] + (k+3)*(u[s]/9.0) )/grid.spacing(s) ); + bx.setHigh(s,(c[s] + k*(u[s]/9.0))/ grid.spacing(s) ); + } + } + + grid.addPoints(bx.getKP1(),bx.getKP2(),[spacing_x,spacing_y,spacing_z,u,c,b] __device__ (int i, int j, int k) + { + Point<3,double> pc({i*spacing_x,j*spacing_y,k*spacing_z}); + Point<3,double> pcs({i*spacing_x,j*spacing_y,k*spacing_z}); + Point<3,double> vp; + + // shift + pc -= c; + + // calculate the distance from the diagonal + vp.get(0) = pc.get(1)*u.get(2) - pc.get(2)*u.get(1); + vp.get(1) = pc.get(2)*u.get(0) - pc.get(0)*u.get(2); + vp.get(2) = pc.get(0)*u.get(1) - pc.get(1)*u.get(0); + + double distance = vp.norm() / sqrt(3.0); + + // Check if the point is in the domain + if (distance < 0.1 && b.isInside(pcs) == true ) + {return true;} + + return false; + }, + [] __device__ (InsertBlockT & data, int i, int j, int k) + { + data.template get<U>() = 1.0; + data.template get<V>() = 0.0; + } + ); + + grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE); + grid.removeUnusedBuffers(); + } + } + + } + + long int x_start = 
grid.size(0)*1.95f/domain.getHigh(0); + long int y_start = grid.size(1)*1.95f/domain.getHigh(1); + long int z_start = grid.size(2)*1.95f/domain.getHigh(2); // z bound must use the z extent (dimension 2) + + long int x_stop = grid.size(0)*2.05f/domain.getHigh(0); + long int y_stop = grid.size(1)*2.05f/domain.getHigh(1); + long int z_stop = grid.size(2)*2.05f/domain.getHigh(2); // z bound must use the z extent (dimension 2) + + grid_key_dx<3> start({x_start,y_start,z_start}); + grid_key_dx<3> stop ({x_stop,y_stop,z_stop}); + + grid.addPoints(start,stop,[] __device__ (int i, int j, int k) + { + return true; + }, + [] __device__ (InsertBlockT & data, int i, int j, int k) + { + data.template get<U>() = 0.5; + data.template get<V>() = 0.24; + } + ); + + grid.template flush<smin_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE); + + grid.removeUnusedBuffers(); +} + + +int main(int argc, char* argv[]) +{ + openfpm_init(&argc,&argv); + + // domain + Box<3,double> domain({0.0,0.0,0.0},{2.5,2.5,2.5}); + + // grid size + size_t sz[3] = {384,384,384}; + + // Define periodicity of the grid + periodicity<3> bc = {NON_PERIODIC,NON_PERIODIC,NON_PERIODIC}; + + // Ghost in grid unit + Ghost<3,long int> g(1); + + // deltaT + double deltaT = 0.2; + + // Diffusion constant for specie U + double du = 2*1e-5; + + // Diffusion constant for specie V + double dv = 1*1e-5; + +#ifdef TEST_RUN + // Number of timesteps + size_t timeSteps = 300; +#else + // Number of timesteps + size_t timeSteps = 50000; +#endif + + // K and F (Physical constant in the equation) + double K = 0.053; + double F = 0.014; + + sgrid_type grid(sz,domain,g,bc); + + grid.template setBackgroundValue<0>(-0.5); + grid.template setBackgroundValue<1>(-0.5); + grid.template setBackgroundValue<2>(-0.5); + grid.template setBackgroundValue<3>(-0.5); + + // spacing of the grid on x and y + double spacing[3] = {grid.spacing(0),grid.spacing(1),grid.spacing(2)}; + + init(grid,domain); + + // sync the ghost + grid.template ghost_get<U,V>(RUN_ON_DEVICE); + + // because we assume that spacing[x] == spacing[y] we use formula 2 + // and 
we calculate the prefactor of Eq 2 + double uFactor = deltaT * du/(spacing[0]*spacing[0]); + double vFactor = deltaT * dv/(spacing[0]*spacing[0]); + + grid.template deviceToHost<U,V>(); + + timer tot_sim; + tot_sim.start(); + + for (size_t i = 0; i < timeSteps; ++i) + { + //! \cond [stencil get and use] \endcond + + typedef typename GetCpBlockType<decltype(grid),0,1>::type CpBlockType; + + auto func = [uFactor,vFactor,deltaT,F,K] __device__ (double & u_out, double & v_out, + CpBlockType & u, CpBlockType & v, + int i, int j, int k){ + + double uc = u(i,j,k); + double vc = v(i,j,k); + + double u_px = u(i+1,j,k); + double u_mx = u(i-1,j,k); + + double u_py = u(i,j+1,k); + double u_my = u(i,j-1,k); + + double u_pz = u(i,j,k+1); + double u_mz = u(i,j,k-1); + + double v_px = v(i+1,j,k); + double v_mx = v(i-1,j,k); + + double v_py = v(i,j+1,k); + double v_my = v(i,j-1,k); + + double v_pz = v(i,j,k+1); + double v_mz = v(i,j,k-1); + + // U fix: a value below -0.1 marks a missing neighbour (background is -0.5); mirror it, or use the centre uc when both sides are missing + + if (u_mx < -0.1 && u_px < -0.1) + { + u_mx = uc; + u_px = uc; + } + + if (u_mx < -0.1) + {u_mx = u_px;} + + if (u_px < -0.1) + {u_px = u_mx;} + + if (u_my < -0.1 && u_py < -0.1) + { + u_my = uc; + u_py = uc; + } + + if (u_my < -0.1) + {u_my = u_py;} + + if (u_py < -0.1) + {u_py = u_my;} + + if (u_mz < -0.1 && u_pz < -0.1) + { + u_mz = uc; + u_pz = uc; + } + + if (u_mz < -0.1) + {u_mz = u_pz;} + + if (u_pz < -0.1) + {u_pz = u_mz;} + + // V fix: same mirroring for V; when both sides are missing use the centre vc so the second derivative of V is zero + + if (v_mx < -0.1 && v_px < -0.1) + { + v_mx = vc; + v_px = vc; + } + + if (v_mx < -0.1) + {v_mx = v_px;} + + if (v_px < -0.1) + {v_px = v_mx;} + + if (v_my < -0.1 && v_py < -0.1) + { + v_my = vc; + v_py = vc; + } + + if (v_my < -0.1) + {v_my = v_py;} + + if (v_py < -0.1) + {v_py = v_my;} + + if (v_mz < -0.1 && v_pz < -0.1) + { + v_mz = vc; + v_pz = vc; + } + + if (v_mz < -0.1) + {v_mz = v_pz;} + + if (v_pz < -0.1) + {v_pz = v_mz;} + + u_out = uc + uFactor *(u_mx + u_px + + u_my + u_py + + u_mz + u_pz - 6.0*uc) - deltaT * uc*vc*vc + - deltaT * F * (uc - 1.0); + + + v_out = vc + vFactor 
*(v_mx + v_px + + v_py + v_my + + v_mz + v_pz - 6.0*vc) + deltaT * uc*vc*vc + - deltaT * (F+K) * vc; + + }; + + if (i % 2 == 0) + { + grid.conv2<U,V,U_next,V_next,1>({0,0,0},{(long int)sz[0]-1,(long int)sz[1]-1,(long int)sz[2]-1},func); + + cudaDeviceSynchronize(); + + // After copy we synchronize again the ghost part U and V + + grid.ghost_get<U_next,V_next>(RUN_ON_DEVICE | SKIP_LABELLING); + } + else + { + grid.conv2<U_next,V_next,U,V,1>({0,0,0},{(long int)sz[0]-1,(long int)sz[1]-1,(long int)sz[2]-1},func); + + cudaDeviceSynchronize(); + + // After copy we synchronize again the ghost part U and V + grid.ghost_get<U,V>(RUN_ON_DEVICE | SKIP_LABELLING); + } + + //! \cond [stencil get and use] \endcond + + // After copy we synchronize again the ghost part U and V + + // Every 500 time step we output the configuration for + // visualization +/* if (i % 500 == 0) + { + grid.save("output_" + std::to_string(count)); + count++; + }*/ + + std::cout << "STEP: " << i << std::endl; +/* if (i % 300 == 0) + { + grid.template deviceToHost<U,V>(); + grid.write_frame("out",i); + }*/ + } + + tot_sim.stop(); + std::cout << "Total simulation: " << tot_sim.getwct() << std::endl; + + grid.print_stats(); + + create_vcluster().print_stats(); + + grid.template deviceToHost<U,V>(); + grid.write("Final"); + + //! \cond [time stepping] \endcond + + /*! + * \page Grid_3_gs_3D_sparse_gpu_cs_opt Gray Scott in 3D using sparse grids on gpu in complex geometry + * + * ## Finalize ## + * + * Deinitialize the library + * + * \snippet SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/main.cu finalize + * + */ + + //! \cond [finalize] \endcond + + openfpm_finalize(); + + //! \cond [finalize] \endcond + + /*! 
+ * \page Grid_3_gs_3D_sparse_gpu_cs_opt Gray Scott in 3D using sparse grids on gpu in complex geometry + * + * # Full code # {#code} + * + * \include SparseGrid/3_gray_scott_3d_sparse_gpu_cs_opt/main.cu + * + */ +} + +#else + +int main(int argc, char* argv[]) +{ + return 0; +} + +#endif + diff --git a/example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/Makefile b/example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..6239057a8dd94a2061672be96d85a91cb426ea7d --- /dev/null +++ b/example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/Makefile @@ -0,0 +1,26 @@ +include ../../example.mk + +CC=mpic++ + +LDIR = + +OBJ = main.o +gray_scott_sparse_cs_surface: OPT += -DTEST_RUN +gray_scott_sparse_cs_surface: gray_scott_sparse_surface_cs + +%.o: %.cpp + $(CC) -O3 -g $(OPT) -c --std=c++14 -o $@ $< $(INCLUDE_PATH) + +gray_scott_sparse_surface_cs: $(OBJ) + $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) + +all: gray_scott_sparse_surface_cs + +run: gray_scott_sparse_cs_surface + mpirun --oversubscribe -np 4 ./gray_scott_sparse_surface_cs + +.PHONY: clean all run + +clean: + rm -f *.o *~ core gray_scott_sparse_surface_cs + diff --git a/example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/config.cfg b/example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/config.cfg new file mode 100644 index 0000000000000000000000000000000000000000..1eecbac3577c765edca7f90cf5f61cfb6b9f4880 --- /dev/null +++ b/example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/config.cfg @@ -0,0 +1,2 @@ +[pack] +files = main.cpp Makefile diff --git a/example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/main.cpp b/example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ab932e019c638d5203b0804982e50a8b7084ad87 --- /dev/null +++ b/example/SparseGrid/4_gray_scott_3d_sparse_surface_cs/main.cpp @@ -0,0 +1,426 @@ +#include "Grid/grid_dist_id.hpp" +#include "data_type/aggregate.hpp" 
+#include "timer.hpp" + +/*! + * + * + */ + +constexpr int U = 0; +constexpr int V = 1; +constexpr int phi = 2; +constexpr int normal = 3; +constexpr int tgrad_u = 4; +constexpr int tgrad_v = 5; +constexpr int U_next = 6; +constexpr int V_next = 7; + +constexpr int x = 0; +constexpr int y = 1; +constexpr int z = 2; + +typedef sgrid_dist_id<3,double,aggregate<double,double,double,double[3],double[3],double[3],double,double> > sgrid_type; + +void init(sgrid_type & grid, Box<3,double> & domain) +{ + //! \cond [init sphere channel] \endcond + + auto it = grid.getGridIterator(); + Point<3,double> p1({0.5,0.5,0.5}); + + double sx = grid.spacing(0); + + Sphere<3,double> sph1(p1,0.3); + Sphere<3,double> sph2(p1,0.3 - sx*10); + Sphere<3,double> sph_zero(p1,0.3 - sx*5); + + while (it.isNext()) + { + // Get the local grid key + auto key = it.get_dist(); + auto keyg = it.get(); + + Point<3,double> pc; + Point<3,double> vp; + + for (int i = 0 ; i < 3 ; i++) + {pc.get(i) = keyg.get(i) * it.getSpacing(i);} + + // Check if the point is in the first sphere + if (sph1.isInside(pc) == true && sph2.isInside(pc) == false) + { + Point<3,double> pn = pc - p1; + pn /= pn.norm(); + double theta = acos(pn * Point<3,double>({0.0,0.0,1.0})); + Point<3,double> pn_ = pn; + pn_[2] = 0.0; + pn_ /= pn_.norm(); + double aphi = acos(pn_ * Point<3,double>({1.0,0.0,0.0})); + + // Create a perturbation in the solid angle + if (theta > 0.6 && theta < 0.8 && aphi > 0.0 && aphi < 0.2) + { + grid.template insert<U>(key) = 0.5; + grid.template insert<V>(key) = 0.25; + } + else + { + grid.template insert<U>(key) = 1.0; + grid.template insert<V>(key) = 0.0; + } + grid.template insert<phi>(key) = sph_zero.distance(pc); + grid.template insert<normal>(key)[0] = pn[0]; + grid.template insert<normal>(key)[1] = pn[1]; + grid.template insert<normal>(key)[2] = pn[2]; + + // Old values U and V + grid.template insert<U_next>(key) = 0.0; + grid.template insert<V_next>(key) = 0.0; + } + + ++it; + } + + //! 
\cond [init sphere channel] \endcond +} + +template<unsigned int U_src,unsigned int V_src,unsigned int U_dst, unsigned int V_dst> +void extend(sgrid_type & grid) +{ + double delta = 1e-10; + double max = 0.0; + auto it = grid.getDomainIterator(); + + while (it.isNext()) + { + // center point + auto Cp = it.get(); + + // plus,minus X,Y,Z + auto mx = Cp.move(0,-1); + auto px = Cp.move(0,+1); + auto my = Cp.move(1,-1); + auto py = Cp.move(1,1); + auto mz = Cp.move(2,-1); + auto pz = Cp.move(2,1); + + double s = grid.get<phi>(Cp) / sqrt(fabs(grid.get<phi>(Cp)) + delta); + + double Uext = 0.0; + double Vext = 0.0; + + double dir = s*grid.get<normal>(Cp)[x]; + + if (dir > 0) + { + Uext += dir * (grid.get<U_src>(Cp) - grid.get<U_src>(mx)); + Vext += dir * (grid.get<V_src>(Cp) - grid.get<V_src>(mx)); + } + else if (dir < 0) + { + Uext += dir * (grid.get<U_src>(px) - grid.get<U_src>(Cp)); + Vext += dir * (grid.get<V_src>(px) - grid.get<V_src>(Cp)); + } + + + dir = s*grid.get<normal>(Cp)[y]; + if (dir > 0) + { + Uext += dir * (grid.get<U_src>(Cp) - grid.get<U_src>(my)); + Vext += dir * (grid.get<V_src>(Cp) - grid.get<V_src>(my)); + } + else if (dir < 0) + { + Uext += dir * (grid.get<U_src>(py) - grid.get<U_src>(Cp)); + Vext += dir * (grid.get<V_src>(py) - grid.get<V_src>(Cp)); + } + + dir = s*grid.get<normal>(Cp)[z]; + if (dir > 0) + { + Uext += dir * (grid.get<U_src>(Cp) - grid.get<U_src>(mz)); + Vext += dir * (grid.get<V_src>(Cp) - grid.get<V_src>(mz)); + } + else if (dir < 0) + { + Uext += dir * (grid.get<U_src>(pz) - grid.get<U_src>(Cp)); + Vext += dir * (grid.get<V_src>(pz) - grid.get<V_src>(Cp)); + } + + if (Uext >= max) + { + max = Uext; + } + + grid.insert<U_dst>(Cp) = grid.get<U_src>(Cp) - 1.0*Uext; + grid.insert<V_dst>(Cp) = grid.get<V_src>(Cp) - 1.0*Vext; + + // Next point in the grid + ++it; + } + + std::cout << "UEX max: " << max << std::endl; +} + +int main(int argc, char* argv[]) +{ + openfpm_init(&argc,&argv); + + // domain + Box<3,double> 
domain({0.0,0.0,0.0},{2.5,2.5,2.5}); + + // grid size + size_t sz[3] = {512,512,512}; + + // Define periodicity of the grid + periodicity<3> bc = {NON_PERIODIC,NON_PERIODIC,NON_PERIODIC}; + + // Ghost in grid unit + Ghost<3,long int> g(1); + + // deltaT + double deltaT = 0.3; + + // Diffusion constant for specie U + double du = 1*1e-5; + + // Diffusion constant for specie V + double dv = 0.5*1e-5; + +#ifdef TEST_RUN + // Number of timesteps + size_t timeSteps = 200; +#else + // Number of timesteps + size_t timeSteps = 150000; +#endif + + // K and F (Physical constant in the equation) + double K = 0.053; + double F = 0.014; + + sgrid_type grid(sz,domain,g,bc); + + + // spacing of the grid on x and y + double spacing[3] = {grid.spacing(0),grid.spacing(1),grid.spacing(2)}; + + init(grid,domain); + + // sync the ghost + size_t count = 0; + grid.template ghost_get<U,V>(); + + // because we assume that spacing[x] == spacing[y] we use formula 2 + // and we calculate the prefactor of Eq 2 + double uFactor = deltaT * du; + double vFactor = deltaT * dv; + + auto & v_cl = create_vcluster(); + + timer tot_sim; + tot_sim.start(); + + for (size_t i = 0; i < timeSteps ; ++i) + { + { + auto it = grid.getDomainIterator(); + + while (it.isNext()) + { + // center point + auto Cp = it.get(); + + // plus,minus X,Y,Z + auto mx = Cp.move(0,-1); + auto px = Cp.move(0,+1); + auto my = Cp.move(1,-1); + auto py = Cp.move(1,1); + auto mz = Cp.move(2,-1); + auto pz = Cp.move(2,1); + + grid.insert<tgrad_u>(Cp)[0] = 0.0; + grid.insert<tgrad_u>(Cp)[1] = 0.0; + grid.insert<tgrad_u>(Cp)[2] = 0.0; + grid.insert<tgrad_v>(Cp)[0] = 0.0; + grid.insert<tgrad_v>(Cp)[1] = 0.0; + grid.insert<tgrad_v>(Cp)[2] = 0.0; + + //! 
\cond [boundary condition] \endcond + + if (grid.existPoint(mz) == true && grid.existPoint(pz) == true && + grid.existPoint(my) == true && grid.existPoint(py) == true && + grid.existPoint(mx) == true && grid.existPoint(px) == true ) + { + Point<3,double> gradU; + gradU[x] = (grid.get<U>(Cp) - grid.get<U>(mx)) / grid.spacing(0); + gradU[y] = (grid.get<U>(Cp) - grid.get<U>(my)) / grid.spacing(1); + gradU[z] = (grid.get<U>(Cp) - grid.get<U>(mz)) / grid.spacing(2); + + Point<3,double> gradV; + gradV[x] = (grid.get<V>(Cp) - grid.get<V>(mx)) / grid.spacing(0); + gradV[y] = (grid.get<V>(Cp) - grid.get<V>(my)) / grid.spacing(1); + gradV[z] = (grid.get<V>(Cp) - grid.get<V>(mz)) / grid.spacing(2); + + Point<3,double> PgradU; + Point<3,double> PgradV; + + PgradU.zero(); + PgradV.zero(); + + for (int i = 0 ; i < 3 ; i++) + { + for (int j = 0 ; j < 3 ; j++) + { + grid.insert<tgrad_u>(Cp)[i] += (((i == j)?1.0:0.0) - grid.get<normal>(Cp)[i]*grid.get<normal>(Cp)[j])*gradU[j]; + grid.insert<tgrad_v>(Cp)[i] += (((i == j)?1.0:0.0) - grid.get<normal>(Cp)[i]*grid.get<normal>(Cp)[j])*gradV[j]; + } + } + } + ++it; + } + } + +// Old.write_frame("Init_condition",i); + + { + auto it = grid.getDomainIterator(); + + while (it.isNext()) + { + // center point + auto Cp = it.get(); + + // plus,minus X,Y,Z + auto mx = Cp.move(0,-1); + auto px = Cp.move(0,+1); + auto my = Cp.move(1,-1); + auto py = Cp.move(1,1); + auto mz = Cp.move(2,-1); + auto pz = Cp.move(2,1); + + //! 
\cond [boundary condition] \endcond + + // Mirror z + + if (grid.existPoint(mz) == true && grid.existPoint(pz) == true && + grid.existPoint(my) == true && grid.existPoint(py) == true && + grid.existPoint(mx) == true && grid.existPoint(px) == true ) + { + double lapU = 0; + double lapV = 0; + + //Div + lapU += (grid.get<tgrad_u>(px)[0] - grid.get<tgrad_u>(Cp)[0]) / grid.spacing(0); + lapV += (grid.get<tgrad_v>(px)[0] - grid.get<tgrad_v>(Cp)[0]) / grid.spacing(0); + lapU += (grid.get<tgrad_u>(py)[1] - grid.get<tgrad_u>(Cp)[1]) / grid.spacing(1); + lapV += (grid.get<tgrad_v>(py)[1] - grid.get<tgrad_v>(Cp)[1]) / grid.spacing(1); + lapU += (grid.get<tgrad_u>(pz)[2] - grid.get<tgrad_u>(Cp)[2]) / grid.spacing(2); + lapV += (grid.get<tgrad_v>(pz)[2] - grid.get<tgrad_v>(Cp)[2]) / grid.spacing(2); + + // update based on Eq 2 + grid.insert<U_next>(Cp) = grid.get<U>(Cp) + uFactor * lapU + + - deltaT * grid.get<U>(Cp) * grid.get<V>(Cp) * grid.get<V>(Cp) + + - deltaT * F * (grid.get<U>(Cp) - 1.0); + + + // update based on Eq 2 + grid.insert<V_next>(Cp) = grid.get<V>(Cp) + vFactor * lapV + + deltaT * grid.get<U>(Cp) * grid.get<V>(Cp) * grid.get<V>(Cp) + + - deltaT * (F+K) * grid.get<V>(Cp); + } + + // Next point in the grid + ++it; + } + } + +// New.write_frame("update",i); + + // Extend + + if (i % 5 == 0) + { + for (int j = 0 ; j < 2 ; j++) + { + if (j % 2 == 0) + {extend<U_next,V_next,U,V>(grid);} + else + {extend<U,V,U_next,V_next>(grid);} + + // Here we copy New into the old grid in preparation of the new step + // It would be better to alternate, but using this we can show the usage + // of the function copy. To note that copy work only on two grid of the same + // decomposition. 
If you want to copy also the decomposition, or force to be + // exactly the same, use Old = New + //New.copy_sparse(Old); + } + } + +/* auto it = grid.getDomainIterator(); + + while (it.isNext()) + { + // center point + auto Cp = it.get(); + + // update based on Eq 2 + grid.insert<U>(Cp) = grid.get<U_next>(Cp); + grid.insert<V>(Cp) = grid.get<V_next>(Cp); + + ++it; + }*/ + + //! \cond [stencil get and use] \endcond + + // After copy we synchronize again the ghost part U and V + grid.ghost_get<U,V>(); + + // Every 500 time step we output the configuration for + // visualization + if (i % 500 == 0) + { + grid.save("output_" + std::to_string(count)); + count++; + } + + if (v_cl.rank() == 0) + {std::cout << "STEP: " << i << " " << std::endl;} + if (i % 100 == 0) + { + grid.write_frame("out",i); + } + } + + tot_sim.stop(); + std::cout << "Total simulation: " << tot_sim.getwct() << std::endl; + + //! \cond [time stepping] \endcond + + /*! + * \page Grid_3_gs_3D_sparse Gray Scott in 3D + * + * ## Finalize ## + * + * Deinitialize the library + * + * \snippet Grid/3_gray_scott/main.cpp finalize + * + */ + + //! \cond [finalize] \endcond + + openfpm_finalize(); + + //! \cond [finalize] \endcond + + /*! 
+ * \page Grid_3_gs_3D_sparse Gray Scott in 3D + * + * # Full code # {#code} + * + * \include Grid/3_gray_scott_3d/main.cpp + * + */ +} diff --git a/example/SparseGrid/5_gray_scott_3d_surface_cs_opt/Makefile b/example/SparseGrid/5_gray_scott_3d_surface_cs_opt/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..42f2b8bf960d023c4ba81edb7a4dec7d8fe0e3aa --- /dev/null +++ b/example/SparseGrid/5_gray_scott_3d_surface_cs_opt/Makefile @@ -0,0 +1,26 @@ +include ../../example.mk + +CC=mpic++ + +LDIR = + +OBJ = main.o +gray_scott_sparse_surface_cs_test: OPT += -DTEST_RUN +gray_scott_sparse_surface_cs_test: gray_scott_sparse_surface_cs + +%.o: %.cpp + $(CC) -O3 -g $(OPT) -c --std=c++14 -o $@ $< $(INCLUDE_PATH) + +gray_scott_sparse_surface_cs: $(OBJ) + $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) + +all: gray_scott_sparse_surface_cs + +run: gray_scott_sparse_surface_cs_test + mpirun --oversubscribe -np 4 ./gray_scott_sparse_surface_cs + +.PHONY: clean all run + +clean: + rm -f *.o *~ core gray_scott_sparse_surface_cs + diff --git a/example/SparseGrid/5_gray_scott_3d_surface_cs_opt/config.cfg b/example/SparseGrid/5_gray_scott_3d_surface_cs_opt/config.cfg new file mode 100644 index 0000000000000000000000000000000000000000..1eecbac3577c765edca7f90cf5f61cfb6b9f4880 --- /dev/null +++ b/example/SparseGrid/5_gray_scott_3d_surface_cs_opt/config.cfg @@ -0,0 +1,2 @@ +[pack] +files = main.cpp Makefile diff --git a/example/SparseGrid/5_gray_scott_3d_surface_cs_opt/main.cpp b/example/SparseGrid/5_gray_scott_3d_surface_cs_opt/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7c77eb62e416fb9bbf362ec0c3dffdfa5f1bdfb7 --- /dev/null +++ b/example/SparseGrid/5_gray_scott_3d_surface_cs_opt/main.cpp @@ -0,0 +1,472 @@ +#include "Grid/grid_dist_id.hpp" +#include "data_type/aggregate.hpp" +#include "timer.hpp" + +/*! 
+ * + */ + +constexpr int U = 0; +constexpr int V = 1; +constexpr int phi = 2; +constexpr int normal = 3; +constexpr int tgrad_u = 4; +constexpr int tgrad_v = 5; +constexpr int U_next = 6; +constexpr int V_next = 7; + +constexpr int x = 0; +constexpr int y = 1; +constexpr int z = 2; + +typedef sgrid_dist_soa<3,double,aggregate<double,double,double,double[3],double[3],double[3],double,double> > sgrid_type; + +void init(sgrid_type & grid, Box<3,double> & domain) +{ + //! \cond [init sphere channel] \endcond + + auto it = grid.getGridIterator(); + Point<3,double> p1({0.5,0.5,0.5}); + + double sx = grid.spacing(0); + + Sphere<3,double> sph1(p1,0.3); + Sphere<3,double> sph2(p1,0.3 - sx*10); + Sphere<3,double> sph_zero(p1,0.3 - sx*5); + + while (it.isNext()) + { + // Get the local grid key + auto key = it.get_dist(); + auto keyg = it.get(); + + Point<3,double> pc; + Point<3,double> vp; + + for (int i = 0 ; i < 3 ; i++) + {pc.get(i) = keyg.get(i) * it.getSpacing(i);} + + // Check if the point is in the first sphere + if (sph1.isInside(pc) == true && sph2.isInside(pc) == false) + { + Point<3,double> pn = pc - p1; + pn /= pn.norm(); + double theta = acos(pn * Point<3,double>({0.0,0.0,1.0})); + Point<3,double> pn_ = pn; + pn_[2] = 0.0; + pn_ /= pn_.norm(); + double aphi = acos(pn_ * Point<3,double>({1.0,0.0,0.0})); + + // Create a perturbation in the solid angle + if (theta > 0.6 && theta < 0.8 && aphi > 0.0 && aphi < 0.2) + { + grid.template insert<U>(key) = 0.5; + grid.template insert<V>(key) = 0.25; + } + else + { + grid.template insert<U>(key) = 1.0; + grid.template insert<V>(key) = 0.0; + } + grid.template insert<phi>(key) = sph_zero.distance(pc); + grid.template insert<normal>(key)[0] = pn[0]; + grid.template insert<normal>(key)[1] = pn[1]; + grid.template insert<normal>(key)[2] = pn[2]; + + // Old values U and V + grid.template insert<U_next>(key) = 0.0; + grid.template insert<V_next>(key) = 0.0; + } + + ++it; + } + + //! 
\cond [init sphere channel] \endcond +} + +template<unsigned int U_src,unsigned int V_src,unsigned int U_dst, unsigned int V_dst> +void extend(sgrid_type & grid, size_t (& sz)[3],double (& spacing)[3]) +{ + double delta = 1e-10; + double max = 0.0; + + auto func_extend = [delta,&spacing](auto & grid, auto & ids, + unsigned char * mask_sum) + { + Vc::double_v phi_c; + Vc::double_v s; + + Vc::double_v Uext = 0.0; + Vc::double_v Vext = 0.0; + + Vc::double_v n[3]; + Vc::double_v dir; + + Vc::double_v Uc; + Vc::double_v Vc; + Vc::double_v Uc_xm; + Vc::double_v Vc_xm; + Vc::double_v Uc_ym; + Vc::double_v Vc_ym; + Vc::double_v Uc_zm; + Vc::double_v Vc_zm; + + Vc::double_v Uc_xp; + Vc::double_v Vc_xp; + Vc::double_v Uc_yp; + Vc::double_v Vc_yp; + Vc::double_v Uc_zp; + Vc::double_v Vc_zp; + + load_crs<x,0,phi>(phi_c,grid,ids); + load_crs_v<x,0,x,normal>(n[x],grid,ids); + load_crs_v<x,0,y,normal>(n[y],grid,ids); + load_crs_v<x,0,z,normal>(n[z],grid,ids); + + load_crs<x,0,U_src>(Uc,grid,ids); + load_crs<x,0,V_src>(Vc,grid,ids); + load_crs<x,-1,U_src>(Uc_xm,grid,ids); + load_crs<x,-1,V_src>(Vc_xm,grid,ids); + load_crs<y,-1,U_src>(Uc_ym,grid,ids); + load_crs<y,-1,V_src>(Vc_ym,grid,ids); + load_crs<z,-1,U_src>(Uc_zm,grid,ids); + load_crs<z,-1,V_src>(Vc_zm,grid,ids); + load_crs<x,1,U_src>(Uc_xp,grid,ids); + load_crs<x,1,V_src>(Vc_xp,grid,ids); + load_crs<y,1,U_src>(Uc_yp,grid,ids); + load_crs<y,1,V_src>(Vc_yp,grid,ids); + load_crs<z,1,U_src>(Uc_zp,grid,ids); + load_crs<z,1,V_src>(Vc_zp,grid,ids); + + s = phi_c / sqrt(phi_c*phi_c + delta*delta); + + dir = s*n[0]; + auto dir_pos = dir > 0; + auto dir_neg = dir < 0; + + Uext += Vc::iif(dir_pos,dir * (Uc - Uc_xm)/spacing[0],Vc::double_v(0.0)); + Vext += Vc::iif(dir_pos,dir * (Vc - Vc_xm)/spacing[0],Vc::double_v(0.0)); + Uext += Vc::iif(dir_neg,dir * (Uc_xp - Uc)/spacing[0],Vc::double_v(0.0)); + Vext += Vc::iif(dir_neg,dir * (Vc_xp - Vc)/spacing[0],Vc::double_v(0.0)); + + dir = s*n[1]; + dir_pos = dir > 0; + dir_neg = dir < 0; + + 
Uext += Vc::iif(dir_pos,dir * (Uc - Uc_ym)/spacing[1],Vc::double_v(0.0)); + Vext += Vc::iif(dir_pos,dir * (Vc - Vc_ym)/spacing[1],Vc::double_v(0.0)); + Uext += Vc::iif(dir_neg,dir * (Uc_yp - Uc)/spacing[1],Vc::double_v(0.0)); + Vext += Vc::iif(dir_neg,dir * (Vc_yp - Vc)/spacing[1],Vc::double_v(0.0)); + + dir = s*n[2]; + dir_pos = dir > 0; + dir_neg = dir < 0; + + Uext += Vc::iif(dir_pos,dir * (Uc - Uc_zm)/spacing[2],Vc::double_v(0.0)); + Vext += Vc::iif(dir_pos,dir * (Vc - Vc_zm)/spacing[2],Vc::double_v(0.0)); + Uext += Vc::iif(dir_neg,dir * (Uc_zp - Uc)/spacing[2],Vc::double_v(0.0)); + Vext += Vc::iif(dir_neg,dir * (Vc_zp - Vc)/spacing[2],Vc::double_v(0.0)); + + Uext = Uc - 0.0003*Uext; + Vext = Vc - 0.0003*Vext; + + store_crs<U_dst>(grid,Uext,ids); + store_crs<V_dst>(grid,Vext,ids); + }; + + grid.template conv_cross_ids<1,double>({0,0,0},{sz[0] - 1, sz[1] - 1, sz[2] - 1},func_extend); +} + +int main(int argc, char* argv[]) +{ + openfpm_init(&argc,&argv); + + // domain + Box<3,double> domain({0.0,0.0,0.0},{2.5,2.5,2.5}); + + // grid size + size_t sz[3] = {512,512,512}; + + // Define periodicity of the grid + periodicity<3> bc = {NON_PERIODIC,NON_PERIODIC,NON_PERIODIC}; + + // Ghost in grid unit + Ghost<3,long int> g(1); + + // deltaT + double deltaT = 0.3; + + // Diffusion constant for specie U + double du = 1*1e-5; + + // Diffusion constant for specie V + double dv = 0.5*1e-5; + +#ifdef TEST_RUN + // Number of timesteps + size_t timeSteps = 200; +#else + // Number of timesteps + size_t timeSteps = 100000; +#endif + + // K and F (Physical constant in the equation) + double K = 0.053; + double F = 0.014; + + sgrid_type grid(sz,domain,g,bc); + + + // spacing of the grid on x and y + double spacing[3] = {grid.spacing(0),grid.spacing(1),grid.spacing(2)}; + + init(grid,domain); + + // sync the ghost + size_t count = 0; + grid.template ghost_get<U,V>(); + + // because we assume that spacing[x] == spacing[y] we use formula 2 + // and we calculate the prefactor of Eq 2 + 
double uFactor = deltaT * du; + double vFactor = deltaT * dv; + + auto & v_cl = create_vcluster(); + + timer tot_sim; + tot_sim.start(); + + for (size_t i = 0; i < timeSteps ; ++i) + { + auto func_grad = [&spacing](auto & grid, auto & ids, + unsigned char * mask_sum){ + + Vc::double_v n[3]; + + Vc::double_v Uc; + Vc::double_v xmU; + Vc::double_v ymU; + Vc::double_v zmU; + + Vc::double_v Vc; + Vc::double_v xmV; + Vc::double_v ymV; + Vc::double_v zmV; + + Vc::double_v u_out[3]; + Vc::double_v v_out[3]; + + load_crs<x,-1,U>(xmU,grid,ids); + load_crs<y,-1,U>(ymU,grid,ids); + load_crs<z,-1,U>(zmU,grid,ids); + load_crs<x,0,U>(Uc,grid,ids); + + load_crs<x,-1,V>(xmV,grid,ids); + load_crs<y,-1,V>(ymV,grid,ids); + load_crs<z,-1,V>(zmV,grid,ids); + load_crs<x,0,V>(Vc,grid,ids); + + load_crs_v<x,0,x,normal>(n[x],grid,ids); + load_crs_v<x,0,y,normal>(n[y],grid,ids); + load_crs_v<x,0,z,normal>(n[z],grid,ids); + + u_out[0] = (1.0-n[0]*n[0])*(Uc - xmU)/spacing[0] + (-n[1]*n[1])*(Uc - ymU)/spacing[1] + (-n[2]*n[2])*(Uc - zmU)/spacing[2]; + u_out[1] = (-n[0]*n[0])*(Uc - xmU)/spacing[0] + (1.0-n[1]*n[1])*(Uc - ymU)/spacing[1] + (-n[2]*n[2])*(Uc - zmU)/spacing[2]; + u_out[2] = (-n[0]*n[0])*(Uc - xmU)/spacing[0] + (-n[1]*n[1])*(Uc - ymU)/spacing[1] + (1.0-n[2]*n[2])*(Uc - zmU)/spacing[2]; + + v_out[0] = (1.0-n[0]*n[0])*(Vc - xmV)/spacing[0] + (-n[1]*n[1])*(Vc - ymV)/spacing[1] + (-n[2]*n[2])*(Vc - zmV)/spacing[2]; + v_out[1] = (-n[0]*n[0])*(Vc - xmV)/spacing[0] + (1.0-n[1]*n[1])*(Vc - ymV)/spacing[1] + (-n[2]*n[2])*(Vc - zmV)/spacing[2]; + v_out[2] = (-n[0]*n[0])*(Vc - xmV)/spacing[0] + (-n[1]*n[1])*(Vc - ymV)/spacing[1] + (1.0-n[2]*n[2])*(Vc - zmV)/spacing[2]; + + Vc::Mask<double> surround; + + for (int i = 0 ; i < Vc::double_v::Size ; i++) + {surround[i] = (mask_sum[i] == 6);} + + u_out[0] = Vc::iif(surround,u_out[0],Vc::double_v(0.0)); + u_out[1] = Vc::iif(surround,u_out[1],Vc::double_v(0.0)); + u_out[2] = Vc::iif(surround,u_out[2],Vc::double_v(0.0)); + + v_out[0] = 
Vc::iif(surround,v_out[0],Vc::double_v(0.0)); + v_out[1] = Vc::iif(surround,v_out[1],Vc::double_v(0.0)); + v_out[2] = Vc::iif(surround,v_out[2],Vc::double_v(0.0)); + + store_crs_v<tgrad_u,x>(grid,u_out[0],ids); + store_crs_v<tgrad_u,y>(grid,u_out[1],ids); + store_crs_v<tgrad_u,z>(grid,u_out[2],ids); + + store_crs_v<tgrad_v,x>(grid,v_out[0],ids); + store_crs_v<tgrad_v,y>(grid,v_out[1],ids); + store_crs_v<tgrad_v,z>(grid,v_out[2],ids); + }; + + grid.template conv_cross_ids<1,double>({0,0,0},{sz[0]-1,sz[1] - 1,sz[2] - 1},func_grad); + + auto func_lap = [&spacing,uFactor,vFactor,deltaT,K,F](auto & grid, auto & ids, + unsigned char * mask_sum){ + + Vc::double_v gradU_px; + Vc::double_v gradU_py; + Vc::double_v gradU_pz; + + Vc::double_v gradU_x; + Vc::double_v gradU_y; + Vc::double_v gradU_z; + + Vc::double_v gradV_px; + Vc::double_v gradV_py; + Vc::double_v gradV_pz; + + Vc::double_v gradV_x; + Vc::double_v gradV_y; + Vc::double_v gradV_z; + + Vc::double_v lapU; + Vc::double_v lapV; + + Vc::double_v Uc; + Vc::double_v Vc; + + Vc::double_v outU; + Vc::double_v outV; + + load_crs_v<x,1,x,tgrad_u>(gradU_px,grid,ids); + load_crs_v<y,1,y,tgrad_u>(gradU_py,grid,ids); + load_crs_v<z,1,z,tgrad_u>(gradU_pz,grid,ids); + + load_crs_v<x,0,x,tgrad_u>(gradU_x,grid,ids); + load_crs_v<x,0,y,tgrad_u>(gradU_y,grid,ids); + load_crs_v<x,0,z,tgrad_u>(gradU_z,grid,ids); + + load_crs_v<x,1,x,tgrad_v>(gradV_px,grid,ids); + load_crs_v<y,1,y,tgrad_v>(gradV_py,grid,ids); + load_crs_v<z,1,z,tgrad_v>(gradV_pz,grid,ids); + + load_crs_v<x,0,x,tgrad_v>(gradV_x,grid,ids); + load_crs_v<x,0,y,tgrad_v>(gradV_y,grid,ids); + load_crs_v<x,0,z,tgrad_v>(gradV_z,grid,ids); + + load_crs<x,0,U>(Uc,grid,ids); + load_crs<x,0,V>(Vc,grid,ids); + + lapU += (gradU_px - gradU_x) / spacing[0]; + lapV += (gradV_px - gradV_x) / spacing[0]; + lapU += (gradU_py - gradU_y) / spacing[1]; + lapV += (gradV_py - gradV_y) / spacing[1]; + lapU += (gradU_pz - gradU_z) / spacing[2]; + lapV += (gradV_pz - gradV_z) / spacing[2]; + + 
// update based on Eq 2 + outU = Uc + uFactor * lapU + + - deltaT * Uc * Vc * Vc + + - deltaT * F * (Uc - 1.0); + + + // update based on Eq 2 + outV = Vc + vFactor * lapV + + deltaT * Uc * Vc * Vc + + - deltaT * (F+K) * Vc; + + Vc::Mask<double> surround; + + for (int i = 0 ; i < Vc::double_v::Size ; i++) + {surround[i] = (mask_sum[i] == 6);} + + + outU = Vc::iif(surround,outU,Uc); + outV = Vc::iif(surround,outV,Vc); + + store_crs<U_next>(grid,outU,ids); + store_crs<V_next>(grid,outV,ids); + }; + + grid.template conv_cross_ids<1,double>({0,0,0},{sz[0]-1,sz[1] - 1,sz[2] - 1},func_lap); + +// New.write_frame("update",i); + + // Extend + + if (i % 5 == 0) + { + for (int j = 0 ; j < 2 ; j++) + { + if (j % 2 == 0) + {extend<U_next,V_next,U,V>(grid,sz,spacing);} + else + {extend<U,V,U_next,V_next>(grid,sz,spacing);} + + // Here we copy New into the old grid in preparation of the new step + // It would be better to alternate, but using this we can show the usage + // of the function copy. To note that copy work only on two grid of the same + // decomposition. If you want to copy also the decomposition, or force to be + // exactly the same, use Old = New + //New.copy_sparse(Old); + } + } + +/* auto it = grid.getDomainIterator(); + + while (it.isNext()) + { + // center point + auto Cp = it.get(); + + // update based on Eq 2 + grid.insert<U>(Cp) = grid.get<U_next>(Cp); + grid.insert<V>(Cp) = grid.get<V_next>(Cp); + + ++it; + }*/ + + //! \cond [stencil get and use] \endcond + + // After copy we synchronize again the ghost part U and V + grid.ghost_get<U,V>(); + + // Every 500 time step we output the configuration for + // visualization + if (i % 500 == 0) + { +// grid.save("output_" + std::to_string(count)); + count++; + } + + if (v_cl.rank() == 0) + {std::cout << "STEP: " << i << " " << std::endl;} + if (i % 1000 == 0) + { + grid.write_frame("out",i); + } + } + + tot_sim.stop(); + std::cout << "Total simulation: " << tot_sim.getwct() << std::endl; + + //! 
\cond [time stepping] \endcond + + /*! + * \page Grid_3_gs_3D_sparse Gray Scott in 3D + * + * ## Finalize ## + * + * Deinitialize the library + * + * \snippet Grid/3_gray_scott/main.cpp finalize + * + */ + + //! \cond [finalize] \endcond + + openfpm_finalize(); + + //! \cond [finalize] \endcond + + /*! + * \page Grid_3_gs_3D_sparse Gray Scott in 3D + * + * # Full code # {#code} + * + * \include Grid/3_gray_scott_3d/main.cpp + * + */ +} diff --git a/example/SparseGrid/6_gray_scott_3d_sparse_gpu_opt_weak_scal/Makefile b/example/SparseGrid/6_gray_scott_3d_sparse_gpu_opt_weak_scal/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..25e2b3240d812248d2fb5edb4856844f86f296b8 --- /dev/null +++ b/example/SparseGrid/6_gray_scott_3d_sparse_gpu_opt_weak_scal/Makefile @@ -0,0 +1,45 @@ +include ../../example.mk + +### internally the example disable with the preprocessor its code if not compiled with nvcc +CUDA_CC= +CUDA_CC_LINK= +ifdef CUDA_ON_CPU + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) + INCLUDE_PATH_NVCC= + CUDA_CC_LINK=mpic++ +else + ifeq (, $(shell which nvcc)) + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) + INCLUDE_PATH_NVCC= + CUDA_CC_LINK=mpic++ + else + CUDA_CC=nvcc -ccbin=mpic++ + CUDA_CC_LINK=nvcc -ccbin=mpic++ + endif +endif + +gray_scott_sparse_gpu_test: OPT += -DTEST_RUN +gray_scott_sparse_gpu_test: gray_scott_sparse_gpu + +CC=mpic++ + +LDIR = + +OBJ = main.o + +%.o: %.cu + $(CUDA_CC) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC) + +gray_scott_sparse_gpu: $(OBJ) + $(CUDA_CC_LINK) -o $@ $^ $(LIBS_PATH) $(LIBS) + +all: gray_scott_sparse_gpu + +run: gray_scott_sparse_gpu_test + mpirun --oversubscribe -np 4 ./gray_scott_sparse_gpu + +.PHONY: clean all run + +clean: + rm -f *.o *~ core gray_scott_sparse_gpu + diff --git a/example/SparseGrid/6_gray_scott_3d_sparse_gpu_opt_weak_scal/config.cfg b/example/SparseGrid/6_gray_scott_3d_sparse_gpu_opt_weak_scal/config.cfg new file mode 100644 index 
0000000000000000000000000000000000000000..699be429e147cd40187be6ce345ef2f060f59fbc --- /dev/null +++ b/example/SparseGrid/6_gray_scott_3d_sparse_gpu_opt_weak_scal/config.cfg @@ -0,0 +1,2 @@ +[pack] +files = main.cu Makefile diff --git a/example/SparseGrid/6_gray_scott_3d_sparse_gpu_opt_weak_scal/main.cu b/example/SparseGrid/6_gray_scott_3d_sparse_gpu_opt_weak_scal/main.cu new file mode 100644 index 0000000000000000000000000000000000000000..b0c2804c4b0db46c709a2e3bfcb780f63199e6b4 --- /dev/null +++ b/example/SparseGrid/6_gray_scott_3d_sparse_gpu_opt_weak_scal/main.cu @@ -0,0 +1,504 @@ +#include "Decomposition/Distribution/BoxDistribution.hpp" +#include "Grid/grid_dist_id.hpp" +#include "data_type/aggregate.hpp" +#include "timer.hpp" + +/*! + * + * \page Grid_3_gs_3D_sparse_gpu Gray Scott in 3D using sparse grids on GPU + * + * [TOC] + * + * # Solving a gray scott-system in 3D using Sparse grids on gpu # {#e3_gs_gray_scott_gpu} + * + * This example show how to solve a Gray-Scott system in 3D using sparse grids on gpu + * + * In figure is the final solution of the problem + * + * \htmlonly + * <img src="http://ppmcore.mpi-cbg.de/web/images/examples/gray_scott_3d/gs_alpha.png"/> + * \endhtmlonly + * + * More or less this example is the adaptation of the dense example in 3D + * + * \see \ref Grid_3_gs_3D + * + * # Initializetion + * + * On gpu we can add points using the function addPoints this function take 2 lamda functions the first take 3 arguments (in 3D) + * i,j,k these are the global coordinates for a point. We can return either true either false. In case of true the point is + * created in case of false the point is not inserted. The second lamda is instead used to initialize the point inserted. + * The arguments of the second lambda are the data argument we use to initialize the point and the global coordinates i,j,k + * + * After we add the points we have to flush the added points. 
This us achieved using the function flush the template parameters + * indicate how we have to act on the points. Consider infact we are adding points already exist ... do we have to add it using the max + * or the min. **FLUSH_ON_DEVICE** say instead that the operation is performed using the GPU + * + * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu create points + * + * The function can also called with a specified range + * + * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu create points sub + * + * # Update + * + * to calculate the right-hand-side we use the function **conv2** this function can be used to do a convolution that involve + * two properties + * + * The function accept a lambda function where the first 2 arguments are the output of the same type of the two property choosen. + * + * The arguments 3 and 4 contain the properties of two selected properties. while i,j,k are the coordinates we have to calculate the + * convolution. The call **conv2** also accept template parameters the first two indicate the source porperties, the other two are the destination properties. While the + * last is the extension of the stencil. In this case we use 1. + * + * The lambda function is defined as + * + * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu lambda + * + * and used in the body loop + * + * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu body + * + */ + +#ifdef __NVCC__ + +constexpr int U = 0; +constexpr int V = 1; + +constexpr int U_next = 2; +constexpr int V_next = 3; + +constexpr int x = 0; +constexpr int y = 1; +constexpr int z = 2; + +typedef CartDecomposition<3,float, CudaMemory, memory_traits_inte, BoxDistribution<3,float> > Dec; + +typedef sgrid_dist_id_gpu<3,float,aggregate<float,float,float,float>,CudaMemory, Dec> SparseGridType; + +void init(SparseGridType & grid, Box<3,float> & domain, size_t (& div)[3]) +{ + //! 
\cond [create points] \endcond + + double spacing_x = grid.spacing(0); + double spacing_y = grid.spacing(1); + double spacing_z = grid.spacing(2); + + typedef typename GetAddBlockType<SparseGridType>::type InsertBlockT; + + // Get the processor domain in continuos + + for (int i = 0 ; i < div[0] ; i++) + { + for (int j = 0 ; j < div[1] ; j++) + { + for (int k = 0 ; k < div[2] ; k++) + { + Point<3,double> p({0.5+i*1.0,0.5+j*1.0,0.5+k*1.0}); + Sphere<3,double> sph(p,0.3); + + Box<3,size_t> bx; + + for (int s = 0 ; s < 3 ; s++) + { + bx.setLow(s,(size_t)((sph.center(s) - 0.31)/grid.spacing(s))); + bx.setHigh(s,(size_t)((sph.center(s) + 0.31)/grid.spacing(s))); + } + + grid.addPoints([spacing_x,spacing_y,spacing_z,sph] __device__ (int i, int j, int k) + { + Point<3,double> pc({i*spacing_x,j*spacing_y,k*spacing_z}); + + // Check if the point is in the domain + if (sph.isInside(pc) ) + {return true;} + return false; + }, + [] __device__ (InsertBlockT & data, int i, int j, int k) + { + data.template get<U>() = 1.0; + data.template get<V>() = 0.0; + } + ); + + grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE); + grid.removeUnusedBuffers(); + } + } + } + + for (int i = 0 ; i < div[0] ; i++) + { + for (int j = 0 ; j < div[1] ; j++) + { + Point<3,double> u({0.0,0.0,1.0}); + Point<3,double> c({0.5+i,0.5+j,0.0}); + + Box<3,size_t> bx; + + bx.setLow(0,(0.4+i)/spacing_x); + bx.setHigh(0,(0.6+i)/spacing_x); + + bx.setLow(1,(0.4+j)/spacing_y); + bx.setHigh(1,(0.6+j)/spacing_y); + + bx.setLow(2,0); + bx.setHigh(2,(size_t)grid.size(2)); + + grid.addPoints(bx.getKP1(),bx.getKP2(),[spacing_x,spacing_y,spacing_z,u,c] __device__ (int i, int j, int k) + { + Point<3,double> pc({i*spacing_x,j*spacing_y,k*spacing_z}); + Point<3,double> pcs({i*spacing_x,j*spacing_y,k*spacing_z}); + Point<3,double> vp; + + // shift + pc -= c; + + // calculate the distance from the diagonal + vp.get(0) = pc.get(1)*u.get(2) - pc.get(2)*u.get(1); + vp.get(1) = pc.get(2)*u.get(0) - 
pc.get(0)*u.get(2); + vp.get(2) = pc.get(0)*u.get(1) - pc.get(1)*u.get(0); + + double distance = vp.norm(); + + // Check if the point is in the domain + if (distance < 0.1 ) + {return true;} + + return false; + }, + [] __device__ (InsertBlockT & data, int i, int j, int k) + { + data.template get<U>() = 1.0; + data.template get<V>() = 0.0; + } + ); + + grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE); + grid.removeUnusedBuffers(); + } + } + + for (int i = 0 ; i < div[0] ; i++) + { + for (int k = 0 ; k < div[2] ; k++) + { + Point<3,double> u({0.0,1.0,0.0}); + Point<3,double> c({0.5+i,0.0,0.5+k}); + + Box<3,size_t> bx; + + bx.setLow(0,(0.4+i)/spacing_x); + bx.setHigh(0,(0.6+i)/spacing_x); + + bx.setLow(2,(0.4+k)/spacing_z); + bx.setHigh(2,(0.6+k)/spacing_z); + + bx.setLow(1,0); + bx.setHigh(1,(size_t)grid.size(1)); + + grid.addPoints(bx.getKP1(),bx.getKP2(),[spacing_x,spacing_y,spacing_z,u,c] __device__ (int i, int j, int k) + { + Point<3,double> pc({i*spacing_x,j*spacing_y,k*spacing_z}); + Point<3,double> pcs({i*spacing_x,j*spacing_y,k*spacing_z}); + Point<3,double> vp; + + // shift + pc -= c; + + // calculate the distance from the diagonal + vp.get(0) = pc.get(1)*u.get(2) - pc.get(2)*u.get(1); + vp.get(1) = pc.get(2)*u.get(0) - pc.get(0)*u.get(2); + vp.get(2) = pc.get(0)*u.get(1) - pc.get(1)*u.get(0); + + double distance = vp.norm(); + + // Check if the point is in the domain + if (distance < 0.1 ) + {return true;} + + return false; + }, + [] __device__ (InsertBlockT & data, int i, int j, int k) + { + data.template get<U>() = 1.0; + data.template get<V>() = 0.0; + } + ); + + grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE); + grid.removeUnusedBuffers(); + } + } + + for (int j = 0 ; j < div[1] ; j++) + { + for (int k = 0 ; k < div[2] ; k++) + { + Point<3,double> u({1.0,0.0,0.0}); + Point<3,double> c({0.0,0.5+j,0.5+k}); + + Box<3,size_t> bx; + + bx.setLow(1,(0.4+j)/spacing_y); + bx.setHigh(1,(0.6+j)/spacing_y); + + 
bx.setLow(2,(0.4+k)/spacing_z); + bx.setHigh(2,(0.6+k)/spacing_z); + + bx.setLow(0,0); + bx.setHigh(0,(size_t)grid.size(0)); + + grid.addPoints(bx.getKP1(),bx.getKP2(),[spacing_x,spacing_y,spacing_z,u,c] __device__ (int i, int j, int k) + { + Point<3,double> pc({i*spacing_x,j*spacing_y,k*spacing_z}); + Point<3,double> pcs({i*spacing_x,j*spacing_y,k*spacing_z}); + Point<3,double> vp; + + // shift + pc -= c; + + // calculate the distance from the diagonal + vp.get(0) = pc.get(1)*u.get(2) - pc.get(2)*u.get(1); + vp.get(1) = pc.get(2)*u.get(0) - pc.get(0)*u.get(2); + vp.get(2) = pc.get(0)*u.get(1) - pc.get(1)*u.get(0); + + double distance = vp.norm(); + + // Check if the point is in the domain + if (distance < 0.1 ) + {return true;} + + return false; + }, + [] __device__ (InsertBlockT & data, int i, int j, int k) + { + data.template get<U>() = 1.0; + data.template get<V>() = 0.0; + } + ); + + grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE); + grid.removeUnusedBuffers(); + } + } + + //! \cond [create points] \endcond + + long int x_start = grid.size(0)*0.4f/domain.getHigh(0); + long int y_start = grid.size(1)*0.4f/domain.getHigh(1); + long int z_start = grid.size(1)*0.4f/domain.getHigh(2); + + long int x_stop = grid.size(0)*0.6f/domain.getHigh(0); + long int y_stop = grid.size(1)*0.6f/domain.getHigh(1); + long int z_stop = grid.size(1)*0.6f/domain.getHigh(2); + + //! \cond [create points sub] \endcond + + grid_key_dx<3> start({x_start,y_start,z_start}); + grid_key_dx<3> stop ({x_stop,y_stop,z_stop}); + + grid.addPoints(start,stop,[] __device__ (int i, int j, int k) + { + return true; + }, + [] __device__ (InsertBlockT & data, int i, int j, int k) + { + data.template get<U>() = 0.5; + data.template get<V>() = 0.24; + } + ); + + grid.template flush<smin_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE); + + //! 
\cond [create points sub] \endcond +} + + +int main(int argc, char* argv[]) +{ + openfpm_init(&argc,&argv); + + // First we check which type of decomposition BoxDistritubion prodice + auto & v_cl = create_vcluster(); + + openfpm::vector<int> facts; + getPrimeFactors(v_cl.size(),facts); + + size_t div[3]; + + for (int i = 0 ; i < 3 ; i++) + {div[i] = 1;} + + for (int i = 0 ; i < facts.size() ; i++) + {div[i % 3] *= facts.get(i);} + + grid_sm<3,void> gdist(div); + + // domain + Box<3,float> domain({0.0,0.0,0.0},{div[0]*1.0f,div[1]*1.0f,div[2]*1.0f}); + + // grid size + size_t sz[3] = {64*div[0],64*div[1],64*div[2]}; + + // Define periodicity of the grid + periodicity<3> bc = {PERIODIC,PERIODIC,PERIODIC}; + + // Ghost in grid unit + Ghost<3,long int> g(1); + + // deltaT + float deltaT = 0.25; + + // Diffusion constant for specie U + float du = 2*1e-5; + + // Diffusion constant for specie V + float dv = 1*1e-5; + + // Number of timesteps +#ifdef TEST_RUN + size_t timeSteps = 300; +#else + size_t timeSteps = 15000; +#endif + + // K and F (Physical constant in the equation) + float K = 0.053; + float F = 0.014; + + SparseGridType grid(sz,domain,g,bc,0,gdist); + + // spacing of the grid on x and y + float spacing[3] = {grid.spacing(0),grid.spacing(1),grid.spacing(2)}; + + init(grid,domain,div); + + grid.deviceToHost<U,V,U_next,V_next>(); + grid.write("final"); + + openfpm_finalize(); + return 0; + + // sync the ghost + grid.template ghost_get<U,V>(RUN_ON_DEVICE); + + // because we assume that spacing[x] == spacing[y] we use formula 2 + // and we calculate the prefactor of Eq 2 + float uFactor = deltaT * du/(spacing[x]*spacing[x]); + float vFactor = deltaT * dv/(spacing[x]*spacing[x]); + + timer tot_sim; + tot_sim.start(); + + for (size_t i = 0; i < timeSteps ; ++i) + { + if (v_cl.rank() == 0) + {std::cout << "STEP: " << i << std::endl;} +/* if (i % 300 == 0) + { + std::cout << "STEP: " << i << std::endl; + grid.write_frame("out",i,VTK_WRITER); + }*/ + + //! 
\cond [stencil get and use] \endcond + + typedef typename GetCpBlockType<decltype(grid),0,1>::type CpBlockType; + + //! \cond [lambda] \endcond + + auto func = [uFactor,vFactor,deltaT,F,K] __device__ (float & u_out, float & v_out, + CpBlockType & u, CpBlockType & v, + int i, int j, int k){ + + float uc = u(i,j,k); + float vc = v(i,j,k); + + u_out = uc + uFactor *(u(i-1,j,k) + u(i+1,j,k) + + u(i,j-1,k) + u(i,j+1,k) + + u(i,j,k-1) + u(i,j,k+1) - 6.0f*uc) - deltaT * uc*vc*vc + - deltaT * F * (uc - 1.0f); + + + v_out = vc + vFactor *(v(i-1,j,k) + v(i+1,j,k) + + v(i,j+1,k) + v(i,j-1,k) + + v(i,j,k-1) + v(i,j,k+1) - 6.0f*vc) + deltaT * uc*vc*vc + - deltaT * (F+K) * vc; + }; + + //! \cond [lambda] \endcond + + //! \cond [body] \endcond + + if (i % 2 == 0) + { + grid.conv2<U,V,U_next,V_next,1>({0,0,0},{(long int)sz[0]-1,(long int)sz[1]-1,(long int)sz[2]-1},func); + + // After copy we synchronize again the ghost part U and V + + grid.ghost_get<U_next,V_next>(RUN_ON_DEVICE | SKIP_LABELLING); + } + else + { + grid.conv2<U_next,V_next,U,V,1>({0,0,0},{(long int)sz[0]-1,(long int)sz[1]-1,(long int)sz[2]-1},func); + + // After copy we synchronize again the ghost part U and V + grid.ghost_get<U,V>(RUN_ON_DEVICE | SKIP_LABELLING); + } + + //! \cond [body] \endcond + + // Every 500 time step we output the configuration for + // visualization +// if (i % 500 == 0) +// { +// grid.save("output_" + std::to_string(count)); +// count++; +// } + } + + tot_sim.stop(); + std::cout << "Total simulation: " << tot_sim.getwct() << std::endl; + + grid.deviceToHost<U,V,U_next,V_next>(); + grid.write("final"); + + //! \cond [time stepping] \endcond + + /*! + * \page Grid_3_gs_3D_sparse Gray Scott in 3D + * + * ## Finalize ## + * + * Deinitialize the library + * + * \snippet Grid/3_gray_scott/main.cpp finalize + * + */ + + //! \cond [finalize] \endcond + + openfpm_finalize(); + + //! \cond [finalize] \endcond + + /*! 
+ * \page Grid_3_gs_3D_sparse Gray Scott in 3D + * + * # Full code # {#code} + * + * \include Grid/3_gray_scott_3d/main.cpp + * + */ +} + +#else + +int main(int argc, char* argv[]) +{ + return 0; +} + +#endif + diff --git a/example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/Makefile b/example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..15133370060405ef2c77e59de99de4d9ecd6eac5 --- /dev/null +++ b/example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/Makefile @@ -0,0 +1,52 @@ +include ../../example.mk + +### internally the example disable with the preprocessor its code if not compiled with nvcc + +CUDA_CC= +CUDA_CC_LINK= +ifdef CUDA_ON_CPU + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) + INCLUDE_PATH_NVCC= + CUDA_CC_LINK=mpic++ + CUDA_OPTIONS=-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000 + LIBS_SELECT=$(LIBS_CUDA_ON_CPU) +else + ifeq (, $(shell which nvcc)) + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) + INCLUDE_PATH_NVCC= + CUDA_CC_LINK=mpic++ + CUDA_OPTIONS= + else + CUDA_CC=nvcc -ccbin=mpic++ + CUDA_CC_LINK=nvcc -ccbin=mpic++ + CUDA_OPTIONS=-use_fast_math -arch=sm_61 -lineinfo + endif + LIBS_SELECT=$(LIBS) +endif + + +gray_scott_sparse_gpu_test: OPT += -DTEST_RUN +gray_scott_sparse_gpu_test: gray_scott_sparse_gpu + +CC=mpic++ + +LDIR = + +OBJ = main.o + +%.o: %.cu + $(CUDA_CC) $(OPT) -O3 -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC) + +gray_scott_sparse_gpu: $(OBJ) + $(CUDA_CC_LINK) -o $@ $^ $(LIBS_PATH) $(LIBS_SELECT) + +all: gray_scott_sparse_gpu + +run: gray_scott_sparse_gpu_test + mpirun --oversubscribe -np 4 ./gray_scott_sparse_gpu + +.PHONY: clean all run + +clean: + rm -f *.o *~ core gray_scott_sparse_gpu + diff --git a/example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/config.cfg b/example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/config.cfg new file mode 100644 index 
0000000000000000000000000000000000000000..699be429e147cd40187be6ce345ef2f060f59fbc --- /dev/null +++ b/example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/config.cfg @@ -0,0 +1,2 @@ +[pack] +files = main.cu Makefile diff --git a/example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/main.cu b/example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/main.cu new file mode 100644 index 0000000000000000000000000000000000000000..04904f3933a23d4ad7372cc480a98b8759934c26 --- /dev/null +++ b/example/SparseGrid/7_gray_scott_3d_sparse_gpu_sphere_expanding/main.cu @@ -0,0 +1,284 @@ +#define SYNC_BEFORE_TAKE_TIME +#include "Decomposition/Distribution/BoxDistribution.hpp" +#include "Grid/grid_dist_id.hpp" +#include "data_type/aggregate.hpp" +#include "timer.hpp" + +/*! + * + * \page Grid_3_gs_3D_sparse_gpu_cs Gray Scott in 3D using sparse grids on gpu in complex geometry + * + * [TOC] + * + * # Solving a gray scott-system in 3D using Sparse grids# {#e3_gs_gray_scott} + * + * This example show how to solve a Gray-Scott system in 3D using sparse grids on gpu with complex geometry + * + * In figure is the final solution of the problem + * + * \htmlonly +<table border="1" bgcolor="black"> + <tr> + <td> + <img src="http://ppmcore.mpi-cbg.de/web/images/examples/1_gray_scott_3d_sparse_cs/gs_3d_sparse_cs_section.png" style="width: 500px;" /> + </td> + <td> + <img src="http://ppmcore.mpi-cbg.de/web/images/examples/1_gray_scott_3d_sparse_cs/gs_3d_sparse_cs.png" style="width: 500px;" /> + </td> + </tr> +</table> +\endhtmlonly + * + * More or less this example is the same of \ref e3_gs_gray_scott_cs on gpu using what we learned in \ref e3_gs_gray_scott_gpu + * + * + */ + +#ifdef __NVCC__ + +constexpr int U = 0; +constexpr int V = 1; +constexpr int U_next = 2; +constexpr int V_next = 3; + +typedef CartDecomposition<3,double, CudaMemory, memory_traits_inte, BoxDistribution<3,double> > Dec; + +typedef sgrid_dist_id_gpu<3,double,aggregate<double,double,double,double>, 
CudaMemory,Dec > sgrid_type; + +void draw_oscillation_shock(sgrid_type & grid, Box<3,double> & domain) +{ + auto it = grid.getGridIterator(); + Point<3,double> p({1.25,1.25,1.25}); + + +// Point<3,double> u({1.0,0.0,0.0}); +// Box<3,double> channel_box(p3,p1); + + double spacing_x = grid.spacing(0); + double spacing_y = grid.spacing(1); + double spacing_z = grid.spacing(2); + + typedef typename GetAddBlockType<sgrid_type>::type InsertBlockT; + + // Draw a shock expanding from 0.4 to 0.8 and than contracting from 0.8 to 0.4 + for (int i = 0 ; i < 100 ; i++) + { + Sphere<3,double> sph(p,0.2 + (double)i/160.0); + Sphere<3,double> sph2(p,0.4 + (double)i/160.0); + + Box<3,size_t> bx; + + for (int j = 0 ; j < 3 ; j++) + { + bx.setLow(j,(size_t)((sph.center(j) - 0.4 - (double)i/160.0)/grid.spacing(j))); + bx.setHigh(j,(size_t)((sph.center(j) + 0.4 + (double)i/160.0)/grid.spacing(j))); + } + + timer t_add; + t_add.start(); + + grid.addPoints(bx.getKP1(),bx.getKP2(),[spacing_x,spacing_y,spacing_z,sph,sph2] __device__ (int i, int j, int k) + { + Point<3,double> pc({i*spacing_x,j*spacing_y,k*spacing_z}); + + // Check if the point is in the domain + if (sph2.isInside(pc) ) + { + if (sph.isInside(pc) == false) + {return true;} + } + + return false; + }, + [] __device__ (InsertBlockT & data, int i, int j, int k) + { + data.template get<U>() = 1.0; + data.template get<V>() = 0.0; + } + ); + + t_add.stop(); + + timer t_flush; + t_flush.start(); + grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE); + t_flush.stop(); + + timer t_ghost; + t_ghost.start(); + grid.template ghost_get<U,V>(RUN_ON_DEVICE); + t_ghost.stop(); + timer t_ghost2; + t_ghost2.start(); + grid.template ghost_get<U,V>(RUN_ON_DEVICE | SKIP_LABELLING); + t_ghost2.stop(); + std::cout << t_ghost.getwct() << std::endl; + + std::cout << "TIME ghost1: " << t_ghost.getwct() << " ghost2: " << t_ghost2.getwct() << " flush: " << t_flush.getwct() << " " << std::endl; + + + grid.removeUnusedBuffers(); + + } + + 
std::cout << "Second Pass" <<std::endl; + + for (int i = 0 ; i < 100 ; i++) + { + Sphere<3,double> sph(p,0.2 + (double)i/160.0); + Sphere<3,double> sph2(p,0.4 + (double)i/160.0); + + Box<3,size_t> bx; + + for (int j = 0 ; j < 3 ; j++) + { + bx.setLow(j,(size_t)((sph.center(j) - 0.4 - (double)i/160.0)/grid.spacing(j))); + bx.setHigh(j,(size_t)((sph.center(j) + 0.4 + (double)i/160.0)/grid.spacing(j))); + } + + timer t_add; + t_add.start(); + + grid.addPoints(bx.getKP1(),bx.getKP2(),[spacing_x,spacing_y,spacing_z,sph,sph2] __device__ (int i, int j, int k) + { + Point<3,double> pc({i*spacing_x,j*spacing_y,k*spacing_z}); + + // Check if the point is in the domain + if (sph2.isInside(pc) ) + { + if (sph.isInside(pc) == false) + {return true;} + } + + return false; + }, + [] __device__ (InsertBlockT & data, int i, int j, int k) + { + data.template get<U>() = 1.0; + data.template get<V>() = 0.0; + } + ); + + t_add.stop(); + + + timer t_flush; + t_flush.start(); + grid.template flush<smax_<U>,smax_<V>>(flush_type::FLUSH_ON_DEVICE); + t_flush.stop(); +// grid.removeUnusedBuffers(); + + + timer t_ghost; + t_ghost.start(); + grid.template ghost_get<U,V>(RUN_ON_DEVICE); + t_ghost.stop(); + timer t_ghost2; + t_ghost2.start(); + grid.template ghost_get<U,V>(RUN_ON_DEVICE | SKIP_LABELLING); + t_ghost2.stop(); + + std::cout << "TIME ghost1: " << t_ghost.getwct() << " ghost2: " << t_ghost2.getwct() << " flush: " << t_flush.getwct() << " " << std::endl; + +// if (i % 10 == 0) +// { +// grid.template deviceToHost<U,V>(); +// grid.write_frame("Final",i); +// } + } + +} + + +int main(int argc, char* argv[]) +{ + openfpm_init(&argc,&argv); + + // domain + Box<3,double> domain({0.0,0.0,0.0},{2.5,2.5,2.5}); + + // grid size + size_t sz[3] = {384,384,384}; + + // Define periodicity of the grid + periodicity<3> bc = {NON_PERIODIC,NON_PERIODIC,NON_PERIODIC}; + + // Ghost in grid unit + Ghost<3,long int> g(1); + + // deltaT + double deltaT = 0.025; + + // Diffusion constant for specie U + 
double du = 2*1e-5; + + // Diffusion constant for specie V + double dv = 1*1e-5; + +#ifdef TEST_RUN + // Number of timesteps + size_t timeSteps = 300; +#else + // Number of timesteps + size_t timeSteps = 50000; +#endif + + // K and F (Physical constant in the equation) + double K = 0.053; + double F = 0.014; + + grid_sm<3,void> gv({3,1,1}); + + sgrid_type grid(sz,domain,g,bc,0,gv); + + grid.template setBackgroundValue<0>(-0.5); + grid.template setBackgroundValue<1>(-0.5); + grid.template setBackgroundValue<2>(-0.5); + grid.template setBackgroundValue<3>(-0.5); + + // spacing of the grid on x and y + double spacing[3] = {grid.spacing(0),grid.spacing(1),grid.spacing(2)}; + + draw_oscillation_shock(grid,domain); + + grid.template deviceToHost<U,V>(); + grid.write("Final"); + + //! \cond [time stepping] \endcond + + /*! + * \page Grid_3_gs_3D_sparse_gpu_cs Gray Scott in 3D using sparse grids on gpu in complex geometry + * + * ## Finalize ## + * + * Deinitialize the library + * + * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu_cs/main.cu finalize + * + */ + + //! \cond [finalize] \endcond + + openfpm_finalize(); + + //! \cond [finalize] \endcond + + /*! 
+ * \page Grid_3_gs_3D_sparse_gpu_cs Gray Scott in 3D using sparse grids on gpu in complex geometry + * + * # Full code # {#code} + * + * \include SparseGrid/1_gray_scott_3d_sparse_gpu_cs/main.cu + * + */ +} + +#else + +int main(int argc, char* argv[]) +{ + return 0; +} + +#endif + diff --git a/example/SparseGrid/1_gray_scott_3d_sparse_opt/Makefile b/example/SparseGrid/8_filling_benchmark/Makefile similarity index 89% rename from example/SparseGrid/1_gray_scott_3d_sparse_opt/Makefile rename to example/SparseGrid/8_filling_benchmark/Makefile index 5bcf5675ed96275648c74c8e616194275dd3b968..94629ac16d26f46dee525a56e3efa9159628326c 100644 --- a/example/SparseGrid/1_gray_scott_3d_sparse_opt/Makefile +++ b/example/SparseGrid/8_filling_benchmark/Makefile @@ -17,7 +17,7 @@ gray_scott_sparse_opt: $(OBJ) all: gray_scott_sparse_opt run: gray_scott_sparse_opt_test - mpirun -np 4 ./gray_scott_sparse_opt + mpirun --oversubscribe -np 4 ./gray_scott_sparse_opt .PHONY: clean all run diff --git a/example/SparseGrid/8_filling_benchmark/config.cfg b/example/SparseGrid/8_filling_benchmark/config.cfg new file mode 100644 index 0000000000000000000000000000000000000000..1eecbac3577c765edca7f90cf5f61cfb6b9f4880 --- /dev/null +++ b/example/SparseGrid/8_filling_benchmark/config.cfg @@ -0,0 +1,2 @@ +[pack] +files = main.cpp Makefile diff --git a/example/SparseGrid/8_filling_benchmark/main.cpp b/example/SparseGrid/8_filling_benchmark/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..97e732590e893d51caa25fc2f328c5f01b903f91 --- /dev/null +++ b/example/SparseGrid/8_filling_benchmark/main.cpp @@ -0,0 +1,226 @@ + +#include "util/cuda_launch.hpp" +#include "Grid/grid_dist_id.hpp" +#include "data_type/aggregate.hpp" +#include "timer.hpp" + +/*! 
+ * + * \page Grid_3_gs_3D_sparse_opt Gray Scott in 3D using sparse grids optimized on CPU + * + * [TOC] + * + * # Solving a gray scott-system in 3D using sparse grids optimized on CPU # {#e3_gs_gray_scott_opt} + * + * This example shows how to solve a Gray-Scott system in 3D using sparse grids in an optimized way + * + * The figure shows the final solution of the problem + * + * \htmlonly + * <img src="http://ppmcore.mpi-cbg.de/web/images/examples/gray_scott_3d/gs_alpha.png"/> + * \endhtmlonly + * + * More or less this example is the adaptation of the dense example in 3D + * + * \see \ref Grid_3_gs_3D + * + * This example is the same as \ref e3_gs_gray_scott_sparse but optimized for speed. + * + * Two optimizations have been made. The first is to change the layout to struct of arrays defining the grid with + * + * \snippet SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp grid definition + * + * The second is using the function **conv_cross2** to calculate the right-hand-side. + * This function can be used to do a convolution that involves points in a cross stencil, like in the figure below, and that involves + * two properties + * +\verbatim + + * + * + * * x * * + * + * + +\endverbatim + * + * The function accepts a lambda function where the first 2 arguments are the outputs in the form of Vc::double_v. If we use float + * we have to use Vc::float_v, or Vc::int_v in case the property is an integer. Vc variables come from the Vc library that is + * now integrated in openfpm. + * + *\htmlonly + * <a href="https://github.com/VcDevel/Vc" >Vc Library</a> + *\endhtmlonly + * + * Vc::double_v in general packs 1, 2 or 4 doubles depending on whether we choose to activate no-SSE, SSE or AVX at compiler level. + * Arguments 3 and 4 contain the values of the two selected properties in the cross pattern given by xm xp ym yp zm zp. + * The last argument is instead the mask. The mask can be accessed to check the number of existing points. For example if
For example if + * we have a cross stencil in 3D with stencil size = 1 than we expect 6 points. Note that the mask is an array because if Vc::double_v + * contain 4 doubles than the mask has 4 elements accessed with the array operator []. The call **cross_conv2** also accept + * template parameters the first two indicate the source porperties, the other two are the destination properties. While the + * last is the extension of the stencil. In this case we use 1. + * + * The lambda function is defined as + * + * \snippet SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp lambda + * + * and used in the body loop + * + * \snippet SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp body + * + * To note that instead of copy we split the properties where we are acting at every iteration + * + */ + +constexpr int U = 0; +constexpr int V = 1; + +constexpr int U_next = 2; +constexpr int V_next = 3; + +constexpr int x = 0; +constexpr int y = 1; +constexpr int z = 2; + +void init(sgrid_dist_soa<3,double,aggregate<double,double,double,double> > & grid, Box<3,double> & domain) +{ + for (int i = 0 ; i < 10 ; i++) + { + timer t; + t.start(); + + auto it = grid.getGridIterator(); + + while (it.isNext()) + { + // Get the local grid key + auto key = it.get_dist(); + + // Old values U and V + grid.template insert<U>(key) = 1.0; + + ++it; + } + + t.stop(); + std::cout << "Time populate: " << t.getwct() << std::endl; + + grid.clear(); + + timer t2; + t2.start(); + + auto it2 = grid.getGridIterator(); + + while (it2.isNext()) + { + // Get the local grid key + auto key = it2.get_dist(); + + // Old values U and V + grid.template insert<U>(key) = 5.0; + + ++it2; + } + + t2.stop(); + std::cout << "Time populate: " << t2.getwct() << std::endl; + + } +} + + +int main(int argc, char* argv[]) +{ + openfpm_init(&argc,&argv); + + // domain + Box<3,double> domain({0.0,0.0,0.0},{2.5,2.5,2.5}); + + // grid size + size_t sz[3] = {256,256,256}; + + // Define periodicity of the grid + periodicity<3> bc = 
{PERIODIC,PERIODIC,PERIODIC}; + + // Ghost in grid unit + Ghost<3,long int> g(1); + + // deltaT + double deltaT = 0.25; + + // Diffusion constant for specie U + double du = 2*1e-5; + + // Diffusion constant for specie V + double dv = 1*1e-5; + + // Number of timesteps +#ifdef TEST_RUN + size_t timeSteps = 200; +#else + size_t timeSteps = 5000; +#endif + + // K and F (Physical constant in the equation) + double K = 0.053; + double F = 0.014; + + //! \cond [grid definition] \endcond + + sgrid_dist_soa<3, double, aggregate<double,double,double,double>> grid(sz,domain,g,bc); + + //! \cond [grid definition] \endcond + + // spacing of the grid on x and y + double spacing[3] = {grid.spacing(0),grid.spacing(1),grid.spacing(2)}; + + init(grid,domain); + + // sync the ghost + size_t count = 0; + grid.template ghost_get<U,V>(); + + // because we assume that spacing[x] == spacing[y] we use formula 2 + // and we calculate the prefactor of Eq 2 + double uFactor = deltaT * du/(spacing[x]*spacing[x]); + double vFactor = deltaT * dv/(spacing[x]*spacing[x]); + + timer tot_sim; + tot_sim.start(); + + auto & v_cl = create_vcluster(); + + tot_sim.stop(); + std::cout << "Total simulation: " << tot_sim.getwct() << std::endl; + + grid.write("final"); + + //! \cond [time stepping] \endcond + + /*! + * \page Grid_3_gs_3D_sparse_opt Gray Scott in 3D using sparse grids optimized on CPU + * + * ## Finalize ## + * + * Deinitialize the library + * + * \snippet SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp finalize + * + */ + + //! \cond [finalize] \endcond + + openfpm_finalize(); + + //! \cond [finalize] \endcond + + /*! 
+ * \page Grid_3_gs_3D_sparse_opt Gray Scott in 3D using sparse grids optimized on CPU + * + * # Full code # {#code} + * + * \include SparseGrid/1_gray_scott_3d_sparse_opt/main.cpp + * + */ +} diff --git a/example/SparseGrid/8_filling_benchmark_gpu/Makefile b/example/SparseGrid/8_filling_benchmark_gpu/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..c6a4b43bbd6b2846da27e6f3b042aa403d726697 --- /dev/null +++ b/example/SparseGrid/8_filling_benchmark_gpu/Makefile @@ -0,0 +1,51 @@ +include ../../example.mk + +### internally the example disable with the preprocessor its code if not compiled with nvcc +CUDA_CC= +CUDA_CC_LINK= +ifdef CUDA_ON_CPU + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) + INCLUDE_PATH_NVCC= + CUDA_CC_LINK=mpic++ + CUDA_OPTIONS=-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000 + LIBS_SELECT=$(LIBS_CUDA_ON_CPU) +else + ifeq (, $(shell which nvcc)) + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) + INCLUDE_PATH_NVCC= + CUDA_CC_LINK=mpic++ + CUDA_OPTIONS= + else + CUDA_CC=nvcc -ccbin=mpic++ + CUDA_CC_LINK=nvcc -ccbin=mpic++ + CUDA_OPTIONS=-use_fast_math -arch=sm_61 -lineinfo + endif + LIBS_SELECT=$(LIBS) +endif + + +gray_scott_sparse_gpu_test: OPT += -DTEST_RUN +gray_scott_sparse_gpu_test: gray_scott_sparse_gpu + +CC=mpic++ + +LDIR = + +OBJ = main.o + +%.o: %.cu + $(CUDA_CC) -use_fast_math -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC) + +gray_scott_sparse_gpu: $(OBJ) + $(CUDA_CC_LINK) -o $@ $^ $(LIBS_PATH) $(LIBS_SELECT) + +all: gray_scott_sparse_gpu + +run: gray_scott_sparse_gpu_test + mpirun --oversubscribe -np 4 ./gray_scott_sparse_gpu + +.PHONY: clean all run + +clean: + rm -f *.o *~ core gray_scott_sparse_gpu + diff --git a/example/SparseGrid/8_filling_benchmark_gpu/config.cfg b/example/SparseGrid/8_filling_benchmark_gpu/config.cfg new file mode 100644 index 0000000000000000000000000000000000000000..699be429e147cd40187be6ce345ef2f060f59fbc --- /dev/null +++ b/example/SparseGrid/8_filling_benchmark_gpu/config.cfg @@ -0,0 +1,2 
@@ +[pack] +files = main.cu Makefile diff --git a/example/SparseGrid/8_filling_benchmark_gpu/main.cu b/example/SparseGrid/8_filling_benchmark_gpu/main.cu new file mode 100644 index 0000000000000000000000000000000000000000..644a4cc1a39c7429e99220915d66091f0e848118 --- /dev/null +++ b/example/SparseGrid/8_filling_benchmark_gpu/main.cu @@ -0,0 +1,221 @@ +#define VCLUSTER_PERF_REPORT +#define SYNC_BEFORE_TAKE_TIME +#define ENABLE_GRID_DIST_ID_PERF_STATS +#include "Decomposition/Distribution/BoxDistribution.hpp" +#include "util/cuda_launch.hpp" +#include "Grid/grid_dist_id.hpp" +#include "data_type/aggregate.hpp" +#include "timer.hpp" + +/*! + * + * \page Grid_3_gs_3D_sparse_gpu Gray Scott in 3D using sparse grids on GPU + * + * [TOC] + * + * # Solving a gray scott-system in 3D using Sparse grids on gpu # {#e3_gs_gray_scott_gpu} + * + * This example shows how to solve a Gray-Scott system in 3D using sparse grids on gpu + * + * The figure shows the final solution of the problem + * + * \htmlonly + * <img src="http://ppmcore.mpi-cbg.de/web/images/examples/gray_scott_3d/gs_alpha.png"/> + * \endhtmlonly + * + * More or less this example is the adaptation of the dense example in 3D + * + * \see \ref Grid_3_gs_3D + * + * # Initialization + * + * On gpu we can add points using the function addPoints. This function takes 2 lambda functions; the first takes 3 arguments (in 3D) + * i,j,k which are the global coordinates for a point. We can return either true or false. In case of true the point is + * created, in case of false the point is not inserted. The second lambda is instead used to initialize the point inserted. + * The arguments of the second lambda are the data argument we use to initialize the point and the global coordinates i,j,k + * + * After we add the points we have to flush the added points. This is achieved using the function flush; the template parameters + * indicate how we have to act on the points. Consider in fact that we are adding points that already exist ...
do we have to add it using the max + * or the min? **FLUSH_ON_DEVICE** says instead that the operation is performed using the GPU + * + * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu create points + * + * The function can also be called with a specified range + * + * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu create points sub + * + * # Update + * + * To calculate the right-hand-side we use the function **conv2**. This function can be used to do a convolution that involves + * two properties + * + * The function accepts a lambda function where the first 2 arguments are the outputs, of the same type as the two properties chosen. + * + * Arguments 3 and 4 contain the values of the two selected properties, while i,j,k are the coordinates where we have to calculate the + * convolution. The call **conv2** also accepts template parameters: the first two indicate the source properties, the other two are the destination properties, while the + * last is the extension of the stencil. In this case we use 1. + * + * The lambda function is defined as + * + * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu lambda + * + * and used in the body loop + * + * \snippet SparseGrid/1_gray_scott_3d_sparse_gpu/main.cu body + * + */ + +#ifdef __NVCC__ + +constexpr int U = 0; +constexpr int V = 1; + +constexpr int U_next = 2; +constexpr int V_next = 3; + +constexpr int x = 0; +constexpr int y = 1; +constexpr int z = 2; + +typedef CartDecomposition<3,float, CudaMemory, memory_traits_inte, BoxDistribution<3,float> > Dec; + +typedef sgrid_dist_id_gpu<3,float,aggregate<float>,CudaMemory, Dec> SparseGridType; + +void init(SparseGridType & grid, Box<3,float> & domain) +{ + //!
\cond [create points] \endcond + + typedef typename GetAddBlockType<SparseGridType>::type InsertBlockT; + + for (int i = 0 ; i < 10 ; i++) + { + timer t; + t.start(); + + grid.addPoints([] __device__ (int i, int j, int k) + { + return true; + }, + [] __device__ (InsertBlockT & data, int i, int j, int k) + { + data.template get<U>() = 1.0; + } + ); + + + grid.template flush<smax_<U>>(flush_type::FLUSH_ON_DEVICE); + + t.stop(); + + std::cout << "Time populate: " << t.getwct() << std::endl; + + timer t2; + cudaDeviceSynchronize(); + t2.start(); + + grid.addPoints([] __device__ (int i, int j, int k) + { + return true; + }, + [] __device__ (InsertBlockT & data, int i, int j, int k) + { + data.template get<U>() = 5.0; + } + ); + + + grid.template flush<sRight_<U>>(flush_type::FLUSH_ON_DEVICE); + + t2.stop(); + + std::cout << "Time populate: " << t2.getwct() << std::endl; + } +} + + +int main(int argc, char* argv[]) +{ + openfpm_init(&argc,&argv); + + // domain + Box<3,float> domain({0.0,0.0,0.0},{2.5,2.5,2.5}); + + // grid size + size_t sz[3] = {512,512,512}; + + // Define periodicity of the grid + periodicity<3> bc = {PERIODIC,PERIODIC,PERIODIC}; + + // Ghost in grid unit + Ghost<3,long int> g(1); + + // deltaT + float deltaT = 0.25; + + // Diffusion constant for specie U + float du = 2*1e-5; + + // Diffusion constant for specie V + float dv = 1*1e-5; + + // Number of timesteps +#ifdef TEST_RUN + size_t timeSteps = 300; +#else + size_t timeSteps = 15000; +#endif + + // K and F (Physical constant in the equation) + float K = 0.053; + float F = 0.014; + + SparseGridType grid(sz,domain,g,bc); + + // spacing of the grid on x and y + float spacing[3] = {grid.spacing(0),grid.spacing(1),grid.spacing(2)}; + + init(grid,domain); + + // sync the ghost + grid.deviceToHost<U>(); + grid.write("final"); + grid.print_stats(); + + //! \cond [time stepping] \endcond + + /*! 
+ * \page Grid_3_gs_3D_sparse Gray Scott in 3D + * + * ## Finalize ## + * + * Deinitialize the library + * + * \snippet Grid/3_gray_scott/main.cpp finalize + * + */ + + //! \cond [finalize] \endcond + + openfpm_finalize(); + + //! \cond [finalize] \endcond + + /*! + * \page Grid_3_gs_3D_sparse Gray Scott in 3D + * + * # Full code # {#code} + * + * \include Grid/3_gray_scott_3d/main.cpp + * + */ +} + +#else + +int main(int argc, char* argv[]) +{ + return 0; +} + +#endif + diff --git a/example/VCluster/0_simple/Makefile b/example/VCluster/0_simple/Makefile index 721c8d9f7e9f12f7260d5883ef719e5f8a02305c..66fa2b881f2f1fbc6f76897c138b1414dcaed675 100644 --- a/example/VCluster/0_simple/Makefile +++ b/example/VCluster/0_simple/Makefile @@ -7,7 +7,7 @@ LDIR = OBJ = main.o %.o: %.cpp - $(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH) vcluster: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) @@ -15,7 +15,7 @@ vcluster: $(OBJ) all: vcluster run: all - mpirun -np 3 ./vcluster + mpirun --oversubscribe -np 3 ./vcluster .PHONY: clean all run diff --git a/example/VCluster/1_semantic/Makefile b/example/VCluster/1_semantic/Makefile index 125663e271cdeff1e9819dcaa3b9bb16b157d289..3a21e758b02baf30977fd2af27d232834ac8a5e0 100644 --- a/example/VCluster/1_semantic/Makefile +++ b/example/VCluster/1_semantic/Makefile @@ -7,7 +7,7 @@ LDIR = OBJ = main.o %.o: %.cpp - $(CC) -O3 -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH) vcluster: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) @@ -15,7 +15,7 @@ vcluster: $(OBJ) all: vcluster run: all - mpirun -np 3 ./vcluster + mpirun --oversubscribe -np 3 ./vcluster .PHONY: clean all run diff --git a/example/VCluster/2_serial_and_parallel/Makefile b/example/VCluster/2_serial_and_parallel/Makefile index 455dced4b3fc5888f5ac1aff2e0389d047b69f83..a5cdc3509433cff895e22e7d051b4f516d1e21cc 100644 --- 
a/example/VCluster/2_serial_and_parallel/Makefile +++ b/example/VCluster/2_serial_and_parallel/Makefile @@ -7,7 +7,7 @@ LDIR = OBJ = main.o %.o: %.cpp - $(CC) -O3 -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH) serial_parallel: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) @@ -15,7 +15,7 @@ serial_parallel: $(OBJ) all: serial_parallel run: all - mpirun -np 3 ./serial_parallel + mpirun --oversubscribe -np 3 ./serial_parallel .PHONY: clean all run diff --git a/example/Vector/0_simple/CMakeLists.txt b/example/Vector/0_simple/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..05aa33049234ff83ffe0ed8255bfc81c83787a27 --- /dev/null +++ b/example/Vector/0_simple/CMakeLists.txt @@ -0,0 +1,22 @@ +cmake_minimum_required(VERSION 3.8 FATAL_ERROR) +project(openfpm_pdata LANGUAGES C CXX) + +set(CMAKE_SKIP_BUILD_RPATH TRUE) ###### <--------- This is absolutely necessary if you use linear algebra + +find_package(openfpm 3.2.0 REQUIRED) +find_package(Threads) +find_package(MPI) + +if (openfpm_FOUND) + message("OpenFPM found") + + add_executable(vect main.cpp) + target_link_libraries(vect PUBLIC openfpm::binary_config) + + # or + + #target_include_directories(vect PUBLIC ${OPENFPM_INCLUDES}) + #target_compile_definitions(vect PUBLIC ${OPENFPM_DEFINITION}) + #target_link_libraries(vect PUBLIC ${OPENFPM_LIBS}) +endif() + diff --git a/example/Vector/0_simple/Makefile b/example/Vector/0_simple/Makefile index 4c6ce566bc1c60d24c3052a03da29571462db213..f5b5d7c2ba73e2f78428594820275e484e23779e 100644 --- a/example/Vector/0_simple/Makefile +++ b/example/Vector/0_simple/Makefile @@ -7,7 +7,7 @@ LDIR = OBJ = main.o %.o: %.cpp - $(CC) -O0 -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O0 -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH) vect: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) @@ -15,7 +15,10 @@ vect: $(OBJ) all: vect run: all - mpirun -np 2 ./vect + mpirun --oversubscribe -np 2 ./vect + 
+debug: + ../../../gdbgui/launch_mpi_debugger 2 ./vect .PHONY: clean all run diff --git a/example/Vector/0_simple/config.cfg b/example/Vector/0_simple/config.cfg index 1eecbac3577c765edca7f90cf5f61cfb6b9f4880..db64a949051852e886e7d45036bd501eb5334e00 100644 --- a/example/Vector/0_simple/config.cfg +++ b/example/Vector/0_simple/config.cfg @@ -1,2 +1,2 @@ [pack] -files = main.cpp Makefile +files = main.cpp Makefile CMakeLists.txt diff --git a/example/Vector/1_HDF5_save_load/Makefile b/example/Vector/1_HDF5_save_load/Makefile index 703912dfbb69414b088bbaa7fa532977f2ba03c5..d6be8ec7a0cd7f75e4fc16d2d27171653c077bb0 100644 --- a/example/Vector/1_HDF5_save_load/Makefile +++ b/example/Vector/1_HDF5_save_load/Makefile @@ -11,14 +11,14 @@ all: hdf5 %.o: %.cpp - $(CC) -O3 $(OPT) -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH) hdf5: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) run: hdf5 - mpirun -np 2 ./hdf5 + mpirun --oversubscribe -np 2 ./hdf5 .PHONY: clean all run diff --git a/example/Vector/1_celllist/Makefile b/example/Vector/1_celllist/Makefile index c9a76a23b45b50e073b571f48fa3653a60cdbc13..4a314c419846bb9e89c935162497be42abbd179b 100644 --- a/example/Vector/1_celllist/Makefile +++ b/example/Vector/1_celllist/Makefile @@ -7,7 +7,7 @@ LDIR = OBJ = main.o %.o: %.cpp - $(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH) cell: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) @@ -15,7 +15,7 @@ cell: $(OBJ) all: cell run: all - mpirun -np 2 ./cell + mpirun --oversubscribe -np 2 ./cell .PHONY: clean all run diff --git a/example/Vector/1_ghost_get_put/Makefile b/example/Vector/1_ghost_get_put/Makefile index d0683987f533b794fc86798104339b2ee6c58802..b3e3619da3c4c69aad0a845b8ef970ddea69414d 100644 --- a/example/Vector/1_ghost_get_put/Makefile +++ b/example/Vector/1_ghost_get_put/Makefile @@ -15,7 +15,7 @@ ghost: $(OBJ) all: ghost run: all - mpirun -np 2 ./ghost + 
mpirun --oversubscribe -np 2 ./ghost .PHONY: clean all run diff --git a/example/Vector/1_gpu_first_step/Makefile b/example/Vector/1_gpu_first_step/Makefile index a2560310b0853a99448ab9231748e5cf6ba61abf..5234dece565c283ef1a09a3cf15d659bfb5ae508 100644 --- a/example/Vector/1_gpu_first_step/Makefile +++ b/example/Vector/1_gpu_first_step/Makefile @@ -1,33 +1,48 @@ include ../../example.mk +LIBS_CUDA_ON_CPU=$(LIBS) CUDA_CC= -ifeq (, $(shell which nvcc)) - CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) - INCLUDE_PATH_NVCC= +CC=mpic++ +ifdef HIP + CUDA_CC=hipcc + CUDA_OPTIONS=-D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0 + LIBS_SELECT=$(LIBS) + CC=hipcc else - CUDA_CC=nvcc -ccbin=mpic++ + ifdef CUDA_ON_CPU + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) + INCLUDE_PATH_NVCC= + CUDA_OPTIONS=-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000 + LIBS_SELECT=$(LIBS_CUDA_ON_CPU) + else + ifeq (, $(shell which nvcc)) + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) + INCLUDE_PATH_NVCC= + else + CUDA_CC=nvcc -ccbin=mpic++ + endif + LIBS_SELECT=$(LIBS) + endif endif -CC=mpic++ - OBJ = main.o gpu_fstep: %.o: %.cu - $(CUDA_CC) -O3 -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH_NVCC) + $(CUDA_CC) -O3 -g $(CUDA_OPTIONS) -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC) %.o: %.cpp $(CC) -O3 $(OPT) -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH) gpu_fstep: $(OBJ) - $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) + $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT) all: gpu_fstep run: gpu_fstep - mpirun -np 2 ./gpu_fstep + mpirun --oversubscribe -np 2 ./gpu_fstep .PHONY: clean all run diff --git a/example/Vector/1_gpu_first_step/main.cu b/example/Vector/1_gpu_first_step/main.cu index 0bb965e8022f7bed4fedc5985b2a84603af0e8dd..30a69fcde617fddf607945c0c0fcce5e786baec9 100644 --- a/example/Vector/1_gpu_first_step/main.cu +++ b/example/Vector/1_gpu_first_step/main.cu @@ -99,6 +99,21 @@ * \snippet Vector/1_gpu_first_step/main.cu using_openmpi * * * 
MPI must be compiled with CUDA support (in general installing OpenFPM with -g should attempt to install OpenMPI with CUDA support) + * + * ## Macro CUDA_LAUNCH + * + * When we want to launch a kernel "my_kernel" on CUDA we in general use the Nvidia CUDA syntax + * + * my_kernel<<<wthr,thr>>>(arguments ... ) + * + * Where wthr is the number of workgroups, thr is the number of threads in a workgroup and arguments... are the arguments to pass to the kernel. + * Equivalently we can launch a kernel with the macro CUDA_LAUNCH_DIM3(my_kernel,wthr,thr,arguments...) or CUDA_LAUNCH(my_kernel,ite,arguments) where + * ite has been taken using getDomainIteratorGPU. There are several advantages in using CUDA_LAUNCH. The first advantage of using the macro is that, when enabling SE_CLASS1, + * all kernel launches become synchronous and an error check is performed before continuing to the next kernel, making debugging easier. Another feature is the possibility + * to run CUDA code on CPU without a GPU, compiling with "CUDA_ON_CPU=1 make" (Note: openfpm must be compiled with GPU support (-g) or with CUDA_ON_CPU support + * (-c "... --enable_cuda_on_cpu")). You can compile this example on CPU. You do not have to change a single line of code for this example. (Check the video to see this + * feature in action). All the openfpm GPU examples and CUDA examples can run on CPU if they use the CUDA_LAUNCH macro. We are planning to support + * AMD GPUs as well using this system. * * ## Full code ## {#code_e0_sim} * @@ -211,7 +226,8 @@ int main(int argc, char* argv[]) //! \cond [launch_domain_it] \endcond auto ite = vd.getDomainIteratorGPU(); - translate_fill_prop<<<ite.wthr,ite.thr>>>(vd.toKernel()); + // translate_fill_prop<<<ite.wthr,ite.thr>>>(vd.toKernel()); + CUDA_LAUNCH(translate_fill_prop,ite,vd.toKernel()); //!
\cond [launch_domain_it] \endcond @@ -230,7 +246,8 @@ int main(int argc, char* argv[]) for (int j = 0 ; j < 100 ; j++) { auto ite = vd.getDomainIteratorGPU(); - translate_fill_prop<<<ite.wthr,ite.thr>>>(vd.toKernel()); + // translate_fill_prop<<<ite.wthr,ite.thr>>>(vd.toKernel()); + CUDA_LAUNCH(translate_fill_prop,ite,vd.toKernel()); vd.map(RUN_ON_DEVICE); vd.template ghost_get<0,1,2>(RUN_ON_DEVICE); diff --git a/example/Vector/2_expressions/Makefile b/example/Vector/2_expressions/Makefile index 7ab3093eedd5282dcf9269bd222005e21e2bf357..bffde476e912b72ad60e9472817b2489c9af5c8e 100644 --- a/example/Vector/2_expressions/Makefile +++ b/example/Vector/2_expressions/Makefile @@ -7,7 +7,7 @@ LDIR = OBJ = main.o %.o: %.cpp - $(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH) expr: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) @@ -15,7 +15,7 @@ expr: $(OBJ) all: expr run: all - mpirun -np 3 ./expr + mpirun --oversubscribe -np 3 ./expr .PHONY: clean all run diff --git a/example/Vector/3_molecular_dynamic/Makefile b/example/Vector/3_molecular_dynamic/Makefile index fab9a79d7e8c2b60a6ff5772b6318d2a9b37da0b..e4baa10c28a3bf9b83eef852ee833b6605cdd918 100644 --- a/example/Vector/3_molecular_dynamic/Makefile +++ b/example/Vector/3_molecular_dynamic/Makefile @@ -12,7 +12,7 @@ OBJ_EXPR_PAP = main_expr_paper.o all: md_dyn md_dyn_expr md_dyn_vl md_dyn_pap %.o: %.cpp - $(CC) -O3 -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH) md_dyn: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) @@ -27,7 +27,7 @@ md_dyn_pap: $(OBJ_EXPR_PAP) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) run: all - mpirun -np 3 ./md_dyn && mpirun -np 3 ./md_dyn_expr && mpirun -np 3 ./md_dyn_vl; + mpirun --oversubscribe -np 3 ./md_dyn && mpirun --oversubscribe -np 3 ./md_dyn_expr && mpirun --oversubscribe -np 3 ./md_dyn_vl; .PHONY: clean all run diff --git a/example/Vector/3_molecular_dynamic_gpu/Makefile 
b/example/Vector/3_molecular_dynamic_gpu/Makefile index 614460f3072c9d4f7797f3645029337d2a4078f5..43a60ef146d87adb9aa889aa82aca5bf64105c18 100644 --- a/example/Vector/3_molecular_dynamic_gpu/Makefile +++ b/example/Vector/3_molecular_dynamic_gpu/Makefile @@ -4,13 +4,21 @@ include ../../example.mk ### internally the example disable with the preprocessor its code if not compiled with nvcc CUDA_CC= CUDA_CC_LINK= -ifeq (, $(shell which nvcc)) - CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) +ifdef CUDA_ON_CPU + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) INCLUDE_PATH_NVCC= CUDA_CC_LINK=mpic++ + LIBS_SELECT=$(LIBS_CUDA_ON_CPU) else - CUDA_CC=nvcc -ccbin=mpic++ - CUDA_CC_LINK=nvcc -ccbin=mpic++ + ifeq (, $(shell which nvcc)) + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) + INCLUDE_PATH_NVCC= + CUDA_CC_LINK=mpic++ + else + CUDA_CC=nvcc -ccbin=mpic++ + CUDA_CC_LINK=nvcc -ccbin=mpic++ + endif + LIBS_SELECT=$(LIBS) endif CC=mpic++ @@ -29,11 +37,11 @@ md_dyn_test: md_dyn $(CUDA_CC) $(OPT) -O3 -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC) md_dyn: $(OBJ) - $(CUDA_CC_LINK) -o $@ $^ $(LIBS_PATH) $(LIBS) + $(CUDA_CC_LINK) -o $@ $^ $(LIBS_PATH) $(LIBS_SELECT) run: md_dyn_test - mpirun -np 3 ./md_dyn; + mpirun --oversubscribe -np 3 ./md_dyn; .PHONY: clean all run diff --git a/example/Vector/3_molecular_dynamic_gpu_opt/Makefile b/example/Vector/3_molecular_dynamic_gpu_opt/Makefile index b34bd0f1fc9f8980106878c106dd6dfdb07be105..9bece43f338db1de8faa0d1bbf058883c6ade48c 100644 --- a/example/Vector/3_molecular_dynamic_gpu_opt/Makefile +++ b/example/Vector/3_molecular_dynamic_gpu_opt/Makefile @@ -1,13 +1,23 @@ include ../../example.mk CUDA_CC= -ifeq (, $(shell which nvcc)) + +ifdef CUDA_ON_CPU CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) INCLUDE_PATH_NVCC= + CUDA_CC_LINK=mpic++ else - CUDA_CC=nvcc -ccbin=mpic++ + ifeq (, $(shell which nvcc)) + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) + INCLUDE_PATH_NVCC= + CUDA_CC_LINK=mpic++ + else + CUDA_CC=nvcc -ccbin=mpic++ + CUDA_CC_LINK=nvcc -ccbin=mpic++ + endif endif + ifeq 
($(PROFILE),ON) CC=scorep --nocompiler --cuda --mpp=mpi nvcc CC_MPI=mpic++ @@ -29,10 +39,10 @@ md_dyn_gpu_test: OPT += -DTEST_RUN md_dyn_gpu_test: all %.o: %.cu - $(CC) $(OPT) -O3 -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH_NVCC) + $(CC) $(OPT) -O3 -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC) %.o: %.cpp - $(CC_MPI) $(OPT) -O3 -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC_MPI) $(OPT) -O3 -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH) md_dyn_gpu: $(OBJ_GPU) $(CC_MPI) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) @@ -44,7 +54,7 @@ md_dyn_cpu_best: $(OBJ_CPU_BEST) $(CC_MPI) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) run: md_dyn_gpu_test - mpirun -np 3 ./md_dyn_gpu && mpirun -np 3 ./md_dyn_cpu && mpirun -np 3 ./md_dyn_cpu_best; + mpirun --oversubscribe -np 3 ./md_dyn_gpu && mpirun --oversubscribe -np 3 ./md_dyn_cpu && mpirun --oversubscribe -np 3 ./md_dyn_cpu_best; .PHONY: clean all run diff --git a/example/Vector/4_complex_prop/Makefile b/example/Vector/4_complex_prop/Makefile index 2de12f95f6b02c804e5fa5a0b835d5c78fa9337d..3b8ea179ce3584e6fd2dd710f7e709335864cba2 100644 --- a/example/Vector/4_complex_prop/Makefile +++ b/example/Vector/4_complex_prop/Makefile @@ -10,7 +10,7 @@ OBJ_SER = main_ser.o all: vect_cp vect_ser %.o: %.cpp - $(CC) -O3 -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -c --std=c++14 -o $@ $< $(INCLUDE_PATH) vect_cp: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) @@ -19,7 +19,7 @@ vect_ser: $(OBJ_SER) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) run: all - mpirun -np 2 ./vect_cp && mpirun -np 2 ./vect_ser + mpirun --oversubscribe -np 2 ./vect_cp && mpirun --oversubscribe -np 2 ./vect_ser .PHONY: clean all run diff --git a/example/Vector/4_multiphase_celllist_verlet/Makefile b/example/Vector/4_multiphase_celllist_verlet/Makefile index e14eedbb41b3cc4cd79451b34eebcc209de98f2a..502002fc388e046c14050d584cf8810cde53b38c 100644 --- a/example/Vector/4_multiphase_celllist_verlet/Makefile +++ b/example/Vector/4_multiphase_celllist_verlet/Makefile @@ -7,7 
+7,7 @@ LDIR = OBJ = main.o %.o: %.cpp - $(CC) -O3 -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH) multip: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) @@ -15,7 +15,7 @@ multip: $(OBJ) all: multip run: all - mpirun -np 2 ./multip + mpirun --oversubscribe -np 2 ./multip .PHONY: clean all run diff --git a/example/Vector/4_reorder/Makefile b/example/Vector/4_reorder/Makefile index 2569b0b64d49609a916adb2b92b36a3d9c566461..e58f415bedc65a21c776e85b4e0365fbb8ed87dd 100644 --- a/example/Vector/4_reorder/Makefile +++ b/example/Vector/4_reorder/Makefile @@ -13,7 +13,7 @@ all_test: OPT += -DTEST_RUN all_test: md_data_ord_test md_comp_ord_test %.o: %.cpp - $(CC) -O3 -g -c --std=c++11 $(OPT) -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -g -c --std=c++14 $(OPT) -o $@ $< $(INCLUDE_PATH) md_data_ord: $(OBJ_DORD) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) @@ -28,7 +28,7 @@ md_comp_ord_test: $(OBJ_CORD) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) run: all_test - mpirun -np 4 ./md_data_ord_test && mpirun -np 4 ./md_comp_ord_test + mpirun --oversubscribe -np 4 ./md_data_ord_test && mpirun --oversubscribe -np 4 ./md_comp_ord_test .PHONY: clean all run all_test on_test diff --git a/example/Vector/5_molecular_dynamic_sym/Makefile b/example/Vector/5_molecular_dynamic_sym/Makefile index cf0391fab74d27ace055d5904d8df2c180b2266d..80bef9b502d62d355b77f7d0850aab280949aae8 100644 --- a/example/Vector/5_molecular_dynamic_sym/Makefile +++ b/example/Vector/5_molecular_dynamic_sym/Makefile @@ -9,13 +9,13 @@ OBJ_DORD = main.o all: md_sym %.o: %.cpp - $(CC) -O3 -g -c --std=c++11 $(OPT) -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -g -c --std=c++14 $(OPT) -o $@ $< $(INCLUDE_PATH) md_sym: $(OBJ_DORD) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) run: md_sym - mpirun -np 3 ./md_sym + mpirun --oversubscribe -np 3 ./md_sym .PHONY: clean all run diff --git a/example/Vector/5_molecular_dynamic_sym_crs/Makefile 
b/example/Vector/5_molecular_dynamic_sym_crs/Makefile index cf0391fab74d27ace055d5904d8df2c180b2266d..80bef9b502d62d355b77f7d0850aab280949aae8 100644 --- a/example/Vector/5_molecular_dynamic_sym_crs/Makefile +++ b/example/Vector/5_molecular_dynamic_sym_crs/Makefile @@ -9,13 +9,13 @@ OBJ_DORD = main.o all: md_sym %.o: %.cpp - $(CC) -O3 -g -c --std=c++11 $(OPT) -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -g -c --std=c++14 $(OPT) -o $@ $< $(INCLUDE_PATH) md_sym: $(OBJ_DORD) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) run: md_sym - mpirun -np 3 ./md_sym + mpirun --oversubscribe -np 3 ./md_sym .PHONY: clean all run diff --git a/example/Vector/6_complex_usage/Makefile b/example/Vector/6_complex_usage/Makefile index 31385b0c5b98849b22cab8969db58365498defc6..a1fb90fd199e06caff755ef62c8184e9bc25d551 100644 --- a/example/Vector/6_complex_usage/Makefile +++ b/example/Vector/6_complex_usage/Makefile @@ -9,13 +9,13 @@ OBJ_DORD = main.o all: complex_use %.o: %.cpp - $(CC) -O3 -g -c --std=c++11 $(OPT) -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -g -c --std=c++14 $(OPT) -o $@ $< $(INCLUDE_PATH) complex_use: $(OBJ_DORD) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) run: all - mpirun -np 3 ./complex_use + mpirun --oversubscribe -np 3 ./complex_use .PHONY: clean all run all_test on_test diff --git a/example/Vector/7_SPH_dlb/CMakeLists.txt b/example/Vector/7_SPH_dlb/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..286edddbc4f5e2df68949c96dc6bb6c4a360ba3c --- /dev/null +++ b/example/Vector/7_SPH_dlb/CMakeLists.txt @@ -0,0 +1,23 @@ +cmake_minimum_required(VERSION 3.8 FATAL_ERROR) +project(openfpm_pdata LANGUAGES C CXX) + +set(CMAKE_SKIP_BUILD_RPATH TRUE) ###### <--------- This is absolutely necessary if you use linear algebra + +find_package(openfpm 3.2.0 REQUIRED) +find_package(Threads) +find_package(MPI) + +if (openfpm_FOUND) + message("OpenFPM found") + + add_executable(sph_dlb main.cpp) + target_link_libraries(sph_dlb PUBLIC openfpm::binary_config) + + # or + 
+ #target_include_directories(sph_dlb PUBLIC ${OPENFPM_INCLUDES}) + #target_compile_definitions(sph_dlb PUBLIC ${OPENFPM_DEFINITION}) + #target_link_libraries(sph_dlb PUBLIC ${OPENFPM_LIBS}) + #target_compile_options(sph_dlb PUBLIC ${OPENFPM_COMPILE_OPTIONS}) +endif() + diff --git a/example/Vector/7_SPH_dlb/Makefile b/example/Vector/7_SPH_dlb/Makefile index 0fc85bdd4ed108063c48e4fe7ccb8bd12039be95..a039eef590d8a60363ec39ef2b7663f5d61579de 100644 --- a/example/Vector/7_SPH_dlb/Makefile +++ b/example/Vector/7_SPH_dlb/Makefile @@ -12,7 +12,7 @@ sph_dlb_test: OPT += -DTEST_RUN sph_dlb_test: sph_dlb %.o: %.cpp - $(CC) -O3 $(OPT) -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 -g $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH) sph_dlb: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) @@ -20,7 +20,7 @@ sph_dlb: $(OBJ) all: sph_dlb run: sph_dlb_test - mpirun -np 2 ./sph_dlb + mpirun --oversubscribe -np 2 ./sph_dlb .PHONY: clean all run diff --git a/example/Vector/7_SPH_dlb/config.cfg b/example/Vector/7_SPH_dlb/config.cfg index 1eecbac3577c765edca7f90cf5f61cfb6b9f4880..db64a949051852e886e7d45036bd501eb5334e00 100644 --- a/example/Vector/7_SPH_dlb/config.cfg +++ b/example/Vector/7_SPH_dlb/config.cfg @@ -1,2 +1,2 @@ [pack] -files = main.cpp Makefile +files = main.cpp Makefile CMakeLists.txt diff --git a/example/Vector/7_SPH_dlb_gpu/CMakeLists.txt b/example/Vector/7_SPH_dlb_gpu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..343f24fd0612e8cafdbb80d4acfc35d718d1e33e --- /dev/null +++ b/example/Vector/7_SPH_dlb_gpu/CMakeLists.txt @@ -0,0 +1,24 @@ +cmake_minimum_required(VERSION 3.8 FATAL_ERROR) +project(openfpm_pdata LANGUAGES C CXX) + +set(CMAKE_SKIP_BUILD_RPATH TRUE) ###### <--------- This is absolutely necessary if you use linear algebra + +find_package(openfpm 3.2.0 REQUIRED) +find_package(Threads) + +enable_language(CUDA) + +if (openfpm_FOUND) + message("OpenFPM found") + + add_executable(sph_dlb main.cu) + 
target_link_libraries(sph_dlb PUBLIC openfpm::binary_config) + + #or + + #target_include_directories(sph_dlb PUBLIC ${OPENFPM_INCLUDES}) + #target_compile_definitions(sph_dlb PUBLIC ${OPENFPM_DEFINITION}) + #target_link_libraries(sph_dlb PUBLIC ${OPENFPM_LIBS}) + #target_compile_options(sph_dlb PUBLIC ${OPENFPM_COMPILE_OPTIONS}) +endif() + diff --git a/example/Vector/7_SPH_dlb_gpu/Makefile b/example/Vector/7_SPH_dlb_gpu/Makefile index 059eb412ccfa0246aea7c770c73e582916f332ad..8813b2db0ecd3b2b8764579fea94295107af65ea 100644 --- a/example/Vector/7_SPH_dlb_gpu/Makefile +++ b/example/Vector/7_SPH_dlb_gpu/Makefile @@ -4,13 +4,32 @@ include ../../example.mk ### internally the example disable with the preprocessor its code if not compiled with nvcc CUDA_CC= CUDA_CC_LINK= -ifeq (, $(shell which nvcc)) - CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) - INCLUDE_PATH_NVCC= - CUDA_CC_LINK=mpic++ + +CC=mpic++ +ifdef HIP + CUDA_CC=hipcc + CUDA_OPTIONS=-D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0 + LIBS_SELECT=$(LIBS) + CC=hipcc + CUDA_CC_LINK=hipcc else - CUDA_CC=nvcc -ccbin=mpic++ - CUDA_CC_LINK=nvcc -ccbin=mpic++ + ifdef CUDA_ON_CPU + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) + INCLUDE_PATH_NVCC= + CUDA_CC_LINK=mpic++ + CUDA_OPTIONS=-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000 + LIBS_SELECT=$(LIBS_CUDA_ON_CPU) + else + ifeq (, $(shell which nvcc)) + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) + INCLUDE_PATH_NVCC= + CUDA_CC_LINK=mpic++ + else + CUDA_CC=nvcc -ccbin=mpic++ + CUDA_CC_LINK=nvcc -ccbin=mpic++ + endif + LIBS_SELECT=$(LIBS) + endif endif CC=mpic++ @@ -25,18 +44,18 @@ sph_dlb_test: OPT += -DTEST_RUN sph_dlb_test: sph_dlb %.o: %.cu - $(CUDA_CC) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC) + $(CUDA_CC) $(CUDA_OPTIONS) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC) %.o: %.cpp $(CC) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH) sph_dlb: $(OBJ) - $(CUDA_CC_LINK) -o $@ 
$^ $(CFLAGS) $(LIBS_PATH) $(LIBS) + $(CUDA_CC_LINK) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT) all: sph_dlb run: sph_dlb_test - mpirun -np 2 ./sph_dlb + mpirun --oversubscribe -np 2 ./sph_dlb .PHONY: clean all run diff --git a/example/Vector/7_SPH_dlb_gpu/config.cfg b/example/Vector/7_SPH_dlb_gpu/config.cfg index 699be429e147cd40187be6ce345ef2f060f59fbc..be08dcf5eba6ca028a142535d42b254cba7a353b 100644 --- a/example/Vector/7_SPH_dlb_gpu/config.cfg +++ b/example/Vector/7_SPH_dlb_gpu/config.cfg @@ -1,2 +1,2 @@ [pack] -files = main.cu Makefile +files = main.cu Makefile CMakeLists.txt diff --git a/example/Vector/7_SPH_dlb_gpu/main.cu b/example/Vector/7_SPH_dlb_gpu/main.cu index 08d91fbe2e6f5e0492b96b04a7f6a7efb875d3a0..701a0623c3b601afcb77a7bdf5e5987367b8d9b8 100644 --- a/example/Vector/7_SPH_dlb_gpu/main.cu +++ b/example/Vector/7_SPH_dlb_gpu/main.cu @@ -39,6 +39,21 @@ * * \snippet Vector/7_SPH_dlb_gpu/main.cu mark_to_remove_kernel * + * ## Macro CUDA_LAUNCH + * + * When we want to launch a kernel "my_kernel" on CUDA we generally use the Nvidia CUDA syntax + * + * my_kernel<<<wthr,thr>>>(arguments ... ) + * + * Where wthr is the number of workgroups, thr is the number of threads in a workgroup and arguments... are the arguments to pass to the kernel. + * Equivalently we can launch a kernel with the macro CUDA_LAUNCH_DIM3(my_kernel,wthr,thr,arguments...) or CUDA_LAUNCH(my_kernel,ite,arguments) where + * ite has been taken using getDomainIteratorGPU. There are several advantages in using CUDA_LAUNCH. The first advantage is that, when SE_CLASS1 is enabled, + * all kernel launches become synchronous and an error check is performed before continuing to the next kernel, making debugging easier. Another feature is the possibility + * to run CUDA code on CPU without a GPU by compiling with "CUDA_ON_CPU=1 make" (Note: openfpm must be compiled with GPU support (-g) or with CUDA_ON_CPU support + * (-c "... --enable_cuda_on_cpu")). You can compile this example on CPU. 
You do not have to change a single line of code for this example. (Check the video to see this + * feature in action). All the openfpm GPU examples and CUDA examples can run on CPU if they use the CUDA_LAUNCH macro. We are planning to support + * AMD GPUs as well using this system. + * * \include Vector/7_SPH_dlb_gpu_opt/main.cu * */ @@ -195,7 +210,10 @@ inline void EqState(particles & vd) { auto it = vd.getDomainIteratorGPU(); - EqState_gpu<<<it.wthr,it.thr>>>(vd.toKernel(),B); + // You can use standard CUDA kernel launch or the macro CUDA_LAUNCH + + //EqState_gpu<<<it.wthr,it.thr>>>(vd.toKernel(),B); + CUDA_LAUNCH(EqState_gpu,it,vd.toKernel(),B) } @@ -301,16 +319,16 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap Point<3,real_number> xa = vd.getPos(a); // Take the mass of the particle dependently if it is FLUID or BOUNDARY - real_number massa = (vd.getProp<type>(a) == FLUID)?MassFluid:MassBound; + real_number massa = (vd.template getProp<type>(a) == FLUID)?MassFluid:MassBound; // Get the density of the of the particle a - real_number rhoa = vd.getProp<rho>(a); + real_number rhoa = vd.template getProp<rho>(a); // Get the pressure of the particle a - real_number Pa = vd.getProp<Pressure>(a); + real_number Pa = vd.template getProp<Pressure>(a); // Get the Velocity of the particle a - Point<3,real_number> va = vd.getProp<velocity>(a); + Point<3,real_number> va = vd.template getProp<velocity>(a); // Reset the force counter (- gravity on zeta direction) vd.template getProp<force>(a)[0] = 0.0; @@ -319,7 +337,7 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap vd.template getProp<drho>(a) = 0.0; // We threat FLUID particle differently from BOUNDARY PARTICLES ... 
- if (vd.getProp<type>(a) != FLUID) + if (vd.template getProp<type>(a) != FLUID) { // If it is a boundary particle calculate the delta rho based on equation 2 @@ -339,14 +357,14 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap if (a == b) {++Np; continue;}; // get the mass of the particle - real_number massb = (vd.getProp<type>(b) == FLUID)?MassFluid:MassBound; + real_number massb = (vd.template getProp<type>(b) == FLUID)?MassFluid:MassBound; // Get the velocity of the particle b - Point<3,real_number> vb = vd.getProp<velocity>(b); + Point<3,real_number> vb = vd.template getProp<velocity>(b); // Get the pressure and density of particle b - real_number Pb = vd.getProp<Pressure>(b); - real_number rhob = vd.getProp<rho>(b); + real_number Pb = vd.template getProp<Pressure>(b); + real_number rhob = vd.template getProp<rho>(b); // Get the distance between p and q Point<3,real_number> dr = xa - xb; @@ -374,7 +392,7 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap ++Np; } - vd.getProp<red>(a) = max_visc; + vd.template getProp<red>(a) = max_visc; } else { @@ -395,10 +413,10 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap // if (p == q) skip this particle if (a == b) {++Np; continue;}; - real_number massb = (vd.getProp<type>(b) == FLUID)?MassFluid:MassBound; - Point<3,real_number> vb = vd.getProp<velocity>(b); - real_number Pb = vd.getProp<Pressure>(b); - real_number rhob = vd.getProp<rho>(b); + real_number massb = (vd.template getProp<type>(b) == FLUID)?MassFluid:MassBound; + Point<3,real_number> vb = vd.template getProp<velocity>(b); + real_number Pb = vd.template getProp<Pressure>(b); + real_number rhob = vd.template getProp<rho>(b); // Get the distance between p and q Point<3,real_number> dr = xa - xb; @@ -415,7 +433,7 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap Point<3,real_number> DW; DWab(dr,DW,r,false); - real_number factor 
= - massb*((vd.getProp<Pressure>(a) + vd.getProp<Pressure>(b)) / (rhoa * rhob) + Tensile(r,rhoa,rhob,Pa,Pb,W_dap) + Pi(dr,r2,v_rel,rhoa,rhob,massb,cbar,max_visc)); + real_number factor = - massb*((vd.template getProp<Pressure>(a) + vd.template getProp<Pressure>(b)) / (rhoa * rhob) + Tensile(r,rhoa,rhob,Pa,Pb,W_dap) + Pi(dr,r2,v_rel,rhoa,rhob,massb,cbar,max_visc)); vd.template getProp<force>(a)[0] += factor * DW.get(0); vd.template getProp<force>(a)[1] += factor * DW.get(1); @@ -427,7 +445,7 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap ++Np; } - vd.getProp<red>(a) = max_visc; + vd.template getProp<red>(a) = max_visc; } } @@ -438,7 +456,8 @@ template<typename CellList> inline void calc_forces(particles & vd, CellList & N // Update the cell-list vd.updateCellList(NN); - calc_forces_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),NN.toKernel(),W_dap,cbar); + //calc_forces_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),NN.toKernel(),W_dap,cbar); + CUDA_LAUNCH(calc_forces_gpu,part,vd.toKernel(),NN.toKernel(),W_dap,cbar) max_visc = reduce_local<red,_max_>(vd); } @@ -448,11 +467,11 @@ __global__ void max_acceleration_and_velocity_gpu(vector_type vd) { auto a = GET_PARTICLE(vd); - Point<3,real_number> acc(vd.getProp<force>(a)); - vd.getProp<red>(a) = norm(acc); + Point<3,real_number> acc(vd.template getProp<force>(a)); + vd.template getProp<red>(a) = norm(acc); - Point<3,real_number> vel(vd.getProp<velocity>(a)); - vd.getProp<red2>(a) = norm(vel); + Point<3,real_number> vel(vd.template getProp<velocity>(a)); + vd.template getProp<red2>(a) = norm(vel); } void max_acceleration_and_velocity(particles & vd, real_number & max_acc, real_number & max_vel) @@ -460,7 +479,8 @@ void max_acceleration_and_velocity(particles & vd, real_number & max_acc, real_n // Calculate the maximum acceleration auto part = vd.getDomainIteratorGPU(); - max_acceleration_and_velocity_gpu<<<part.wthr,part.thr>>>(vd.toKernel()); + // 
max_acceleration_and_velocity_gpu<<<part.wthr,part.thr>>>(vd.toKernel()); + CUDA_LAUNCH(max_acceleration_and_velocity_gpu,part,vd.toKernel()); max_acc = reduce_local<red,_max_>(vd); max_vel = reduce_local<red2,_max_>(vd); @@ -566,7 +586,8 @@ void verlet_int(particles & vd, real_number dt) real_number dt205 = dt*dt*0.5; real_number dt2 = dt*2.0; - verlet_int_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),dt,dt2,dt205); + // verlet_int_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),dt,dt2,dt205); + CUDA_LAUNCH(verlet_int_gpu,part,vd.toKernel(),dt,dt2,dt205); //! \cond [remove_marked_part] \endcond @@ -646,7 +667,8 @@ void euler_int(particles & vd, real_number dt) real_number dt205 = dt*dt*0.5; - euler_int_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),dt,dt205); + // euler_int_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),dt,dt205); + CUDA_LAUNCH(euler_int_gpu,part,vd.toKernel(),dt,dt205); // remove the particles remove_marked<red>(vd); @@ -722,7 +744,8 @@ inline void sensor_pressure(Vector & vd, // if the probe is inside the processor domain if (vd.getDecomposition().isLocal(probes.get(i)) == true) { - sensor_pressure_gpu<<<1,1>>>(vd.toKernel(),NN.toKernel(),probes.get(i),(real_number *)press_tmp_.toKernel()); + // sensor_pressure_gpu<<<1,1>>>(vd.toKernel(),NN.toKernel(),probes.get(i),(real_number *)press_tmp_.toKernel()); + CUDA_LAUNCH_DIM3(sensor_pressure_gpu,1,1,vd.toKernel(),NN.toKernel(),probes.get(i),(real_number *)press_tmp_.toKernel()); // move calculated pressure on press_tmp_.deviceToHost(); @@ -771,7 +794,7 @@ int main(int argc, char* argv[]) // You can ignore all these dp/2.0 is a trick to reach the same initialization // of Dual-SPH that use a different criteria to draw particles - Box<3,real_number> fluid_box({dp/2.0,dp/2.0,dp/2.0},{0.4+dp/2.0,0.67-dp/2.0,0.3+dp/2.0}); + Box<3,real_number> fluid_box({dp/2.0f,dp/2.0f,dp/2.0f},{0.4f+dp/2.0f,0.67f-dp/2.0f,0.3f+dp/2.0f}); // return an iterator to the fluid particles to add to vd auto fluid_it = 
DrawParticles::DrawBox(vd,sz,domain,fluid_box); @@ -820,12 +843,12 @@ int main(int argc, char* argv[]) } // Recipient - Box<3,real_number> recipient1({0.0,0.0,0.0},{1.6+dp/2.0,0.67+dp/2.0,0.4+dp/2.0}); - Box<3,real_number> recipient2({dp,dp,dp},{1.6-dp/2.0,0.67-dp/2.0,0.4+dp/2.0}); + Box<3,real_number> recipient1({0.0f,0.0f,0.0f},{1.6f+dp/2.0f,0.67f+dp/2.0f,0.4f+dp/2.0f}); + Box<3,real_number> recipient2({dp,dp,dp},{1.6f-dp/2.0f,0.67f-dp/2.0f,0.4f+dp/2.0f}); - Box<3,real_number> obstacle1({0.9,0.24-dp/2.0,0.0},{1.02+dp/2.0,0.36,0.45+dp/2.0}); - Box<3,real_number> obstacle2({0.9+dp,0.24+dp/2.0,0.0},{1.02-dp/2.0,0.36-dp,0.45-dp/2.0}); - Box<3,real_number> obstacle3({0.9+dp,0.24,0.0},{1.02,0.36,0.45}); + Box<3,real_number> obstacle1({0.9f,0.24f-dp/2.0f,0.0f},{1.02f+dp/2.0f,0.36f,0.45f+dp/2.0f}); + Box<3,real_number> obstacle2({0.9f+dp,0.24f+dp/2.0f,0.0f},{1.02f-dp/2.0f,0.36f-dp,0.45f-dp/2.0f}); + Box<3,real_number> obstacle3({0.9f+dp,0.24f,0.0f},{1.02f,0.36f,0.45f}); openfpm::vector<Box<3,real_number>> holes; holes.add(recipient2); @@ -877,7 +900,7 @@ int main(int argc, char* argv[]) ++obstacle_box; } - + vd.map(); // Now that we fill the vector with particles @@ -886,13 +909,13 @@ int main(int argc, char* argv[]) vd.addComputationCosts(md); vd.getDecomposition().decompose(); vd.map(); - + /////////////////////////// // Ok the initialization is done on CPU on GPU we are doing the main loop, so first we offload all properties on GPU vd.hostToDevicePos(); - vd.template hostToDeviceProp<type,rho,rho_prev,Pressure,velocity>(); + vd.template hostToDeviceProp<type,rho,rho_prev,Pressure,velocity,velocity_prev>(); vd.ghost_get<type,rho,Pressure,velocity>(RUN_ON_DEVICE); diff --git a/example/Vector/7_SPH_dlb_gpu_more_opt/Makefile b/example/Vector/7_SPH_dlb_gpu_more_opt/Makefile index ef2f64a0929810b20de0696874c7bb0a34bc9c7f..e91b646cd03080625765b240d615eb8eb022403b 100644 --- a/example/Vector/7_SPH_dlb_gpu_more_opt/Makefile +++ b/example/Vector/7_SPH_dlb_gpu_more_opt/Makefile 
@@ -4,15 +4,22 @@ include ../../example.mk ### internally the example disable with the preprocessor its code if not compiled with nvcc CUDA_CC= CUDA_CC_LINK= -ifeq (, $(shell which nvcc)) - CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) +ifdef CUDA_ON_CPU + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) INCLUDE_PATH_NVCC= CUDA_CC_LINK=mpic++ - CUDA_OPTIONS= + CUDA_OPTIONS= else - CUDA_CC=nvcc -ccbin=mpic++ - CUDA_CC_LINK=nvcc -ccbin=mpic++ - CUDA_OPTIONS=-use_fast_math -arch=sm_61 -lineinfo + ifeq (, $(shell which nvcc)) + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) + INCLUDE_PATH_NVCC= + CUDA_CC_LINK=mpic++ + CUDA_OPTIONS= + else + CUDA_CC=nvcc -ccbin=mpic++ + CUDA_CC_LINK=nvcc -ccbin=mpic++ + CUDA_OPTIONS=-use_fast_math -arch=sm_61 -lineinfo + endif endif ifeq ($(PROFILE),ON) @@ -47,7 +54,7 @@ sph_dlb2: $(OBJ) all: sph_dlb run: sph_dlb_test - mpirun -np 2 ./sph_dlb + mpirun --oversubscribe -np 2 ./sph_dlb .PHONY: clean all run diff --git a/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu b/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu index 2d823d7f4178b32fdf5aa3e3e74896ea73074fb3..4535301905776407eb225b9398c7240a1a1b4b0c 100644 --- a/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu +++ b/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu @@ -53,6 +53,8 @@ #define PRINT_STACKTRACE #define STOP_ON_ERROR #define OPENMPI +#define SCAN_WITH_CUB +#define SORT_WITH_CUB //#define SE_CLASS1 //#define USE_LOW_REGISTER_ITERATOR @@ -277,7 +279,7 @@ inline __device__ __host__ real_number Tensile(real_number r, real_number rhoa, real_number wqq2=qq*qq; real_number wqq3=wqq2*qq; - wab+=a2*(1.0f-1.5f*wqq2+0.75f*wqq3); + wab=a2*(1.0f-1.5f*wqq2+0.75f*wqq3); } //-Tensile correction. 
diff --git a/example/Vector/7_SPH_dlb_gpu_opt/Makefile b/example/Vector/7_SPH_dlb_gpu_opt/Makefile index ef2f64a0929810b20de0696874c7bb0a34bc9c7f..e5b464ed3b5d5bbd477deb304f7cd3d689f06d10 100644 --- a/example/Vector/7_SPH_dlb_gpu_opt/Makefile +++ b/example/Vector/7_SPH_dlb_gpu_opt/Makefile @@ -4,17 +4,35 @@ include ../../example.mk ### internally the example disable with the preprocessor its code if not compiled with nvcc CUDA_CC= CUDA_CC_LINK= -ifeq (, $(shell which nvcc)) - CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) - INCLUDE_PATH_NVCC= - CUDA_CC_LINK=mpic++ - CUDA_OPTIONS= +ifdef HIP + CUDA_CC=hipcc + CUDA_OPTIONS=-D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0 + LIBS_SELECT=$(LIBS) + CC=hipcc + CUDA_CC_LINK=hipcc else - CUDA_CC=nvcc -ccbin=mpic++ - CUDA_CC_LINK=nvcc -ccbin=mpic++ - CUDA_OPTIONS=-use_fast_math -arch=sm_61 -lineinfo + ifdef CUDA_ON_CPU + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) + INCLUDE_PATH_NVCC= + CUDA_CC_LINK=mpic++ + CUDA_OPTIONS=-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000 + LIBS_SELECT=$(LIBS_CUDA_ON_CPU) + else + ifeq (, $(shell which nvcc)) + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) + INCLUDE_PATH_NVCC= + CUDA_CC_LINK=mpic++ + CUDA_OPTIONS= + else + CUDA_CC=nvcc -ccbin=mpic++ + CUDA_CC_LINK=nvcc -ccbin=mpic++ + CUDA_OPTIONS=-use_fast_math -arch=sm_61 -lineinfo + endif + LIBS_SELECT=$(LIBS) + endif endif + ifeq ($(PROFILE),ON) CUDA_CC=scorep --nocompiler --cuda --mpp=mpi nvcc -ccbin=mpic++ CUDA_CC_LINK=scorep --nocompiler --cuda --mpp=mpi nvcc -ccbin=mpic++ @@ -39,15 +57,15 @@ sph_dlb_test: sph_dlb $(CC) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH) sph_dlb: $(OBJ) - $(CUDA_CC_LINK) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) + $(CUDA_CC_LINK) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT) sph_dlb2: $(OBJ) - $(CUDA_CC_LINK) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) + $(CUDA_CC_LINK) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT) all: sph_dlb run: sph_dlb_test - 
mpirun -np 2 ./sph_dlb + mpirun --oversubscribe -np 2 ./sph_dlb .PHONY: clean all run diff --git a/example/Vector/7_SPH_dlb_gpu_opt/main.cu b/example/Vector/7_SPH_dlb_gpu_opt/main.cu index 17f6f7f3d70a6113a315b34f6e333085798fe91c..8dc98d604da3e2dad8e08321b164a8e93fa10f44 100644 --- a/example/Vector/7_SPH_dlb_gpu_opt/main.cu +++ b/example/Vector/7_SPH_dlb_gpu_opt/main.cu @@ -45,13 +45,13 @@ //#define SE_CLASS1 //#define USE_LOW_REGISTER_ITERATOR -//#define SCAN_WITH_CUB <------ In case you want to use CUB for scan operations +#define SCAN_WITH_CUB //<------ In case you want to use CUB for scan operations +#define SORT_WITH_CUB //#define EXTERNAL_SET_GPU <----- In case you want to distribute the GPUs differently from the default #include "Vector/vector_dist.hpp" #include <math.h> #include "Draw/DrawParticles.hpp" -#include <cuda_profiler_api.h> @@ -250,6 +250,7 @@ inline __device__ __host__ void DWab(Point<3,real_number> & dx, Point<3,real_num DW.get(2) = factor * dx.get(2); } + // Tensile correction inline __device__ __host__ real_number Tensile(real_number r, real_number rhoa, real_number rhob, real_number prs1, real_number prs2, real_number W_dap) { @@ -268,7 +269,7 @@ inline __device__ __host__ real_number Tensile(real_number r, real_number rhoa, real_number wqq2=qq*qq; real_number wqq3=wqq2*qq; - wab+=a2*(1.0f-1.5f*wqq2+0.75f*wqq3); + wab=a2*(1.0f-1.5f*wqq2+0.75f*wqq3); } //-Tensile correction. 
@@ -313,19 +314,19 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap Point<3,real_number> xa = vd.getPos(a); // Type of the particle - unsigned int typea = vd.getProp<type>(a); + unsigned int typea = vd.template getProp<type>(a); // Take the mass of the particle dependently if it is FLUID or BOUNDARY //real_number massa = (typea == FLUID)?MassFluid:MassBound; // Get the density of the of the particle a - real_number rhoa = vd.getProp<rho>(a); + real_number rhoa = vd.template getProp<rho>(a); // Get the pressure of the particle a - real_number Pa = vd.getProp<Pressure>(a); + real_number Pa = vd.template getProp<Pressure>(a); // Get the Velocity of the particle a - Point<3,real_number> va = vd.getProp<velocity>(a); + Point<3,real_number> va = vd.template getProp<velocity>(a); Point<3,real_number> force_; force_.get(0) = 0.0f; @@ -348,12 +349,12 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap // if (p == q) skip this particle this condition should be done in the r^2 = 0 if (a == b) {++Np; continue;}; - unsigned int typeb = vd.getProp<type>(b); + unsigned int typeb = vd.template getProp<type>(b); - real_number massb = (typeb == FLUID)?MassFluid:MassBound; - Point<3,real_number> vb = vd.getProp<velocity>(b); - real_number Pb = vd.getProp<Pressure>(b); - real_number rhob = vd.getProp<rho>(b); + real_number massb = (typeb == FLUID)?MassFluid:MassBound; + Point<3,real_number> vb = vd.template getProp<velocity>(b); + real_number Pb = vd.template getProp<Pressure>(b); + real_number rhob = vd.template getProp<rho>(b); // Get the distance between p and q Point<3,real_number> dr = xa - xb; @@ -387,7 +388,7 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap ++Np; } - vd.getProp<red>(a) = max_visc; + vd.template getProp<red>(a) = max_visc; vd.template getProp<force>(a)[0] = force_.get(0); vd.template getProp<force>(a)[1] = force_.get(1); @@ -414,11 +415,11 @@ __global__ void 
max_acceleration_and_velocity_gpu(vector_type vd) { auto a = GET_PARTICLE(vd); - Point<3,real_number> acc(vd.getProp<force>(a)); - vd.getProp<red>(a) = norm(acc); + Point<3,real_number> acc(vd.template getProp<force>(a)); + vd.template getProp<red>(a) = norm(acc); - Point<3,real_number> vel(vd.getProp<velocity>(a)); - vd.getProp<red2>(a) = norm(vel); + Point<3,real_number> vel(vd.template getProp<velocity>(a)); + vd.template getProp<red2>(a) = norm(vel); } void max_acceleration_and_velocity(particles & vd, real_number & max_acc, real_number & max_vel) @@ -426,7 +427,7 @@ void max_acceleration_and_velocity(particles & vd, real_number & max_acc, real_n // Calculate the maximum acceleration auto part = vd.getDomainIteratorGPU(); - max_acceleration_and_velocity_gpu<<<part.wthr,part.thr>>>(vd.toKernel()); + CUDA_LAUNCH(max_acceleration_and_velocity_gpu,part,vd.toKernel()); max_acc = reduce_local<red,_max_>(vd); max_vel = reduce_local<red2,_max_>(vd); @@ -529,7 +530,7 @@ void verlet_int(particles & vd, real_number dt) real_number dt205 = dt*dt*0.5; real_number dt2 = dt*2.0; - verlet_int_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),dt,dt2,dt205); + CUDA_LAUNCH(verlet_int_gpu,part,vd.toKernel(),dt,dt2,dt205); // remove the particles marked remove_marked<red>(vd); @@ -605,7 +606,7 @@ void euler_int(particles & vd, real_number dt) real_number dt205 = dt*dt*0.5; - euler_int_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),dt,dt205); + CUDA_LAUNCH(euler_int_gpu,part,vd.toKernel(),dt,dt205); // remove the particles remove_marked<red>(vd); @@ -681,7 +682,7 @@ inline void sensor_pressure(Vector & vd, // if the probe is inside the processor domain if (vd.getDecomposition().isLocal(probes.get(i)) == true) { - sensor_pressure_gpu<<<1,1>>>(vd.toKernel_sorted(),NN.toKernel(),probes.get(i),(real_number *)press_tmp_.toKernel()); + CUDA_LAUNCH_DIM3(sensor_pressure_gpu,1,1,vd.toKernel_sorted(),NN.toKernel(),probes.get(i),(real_number *)press_tmp_.toKernel()); vd.merge<Pressure>(NN); @@ -720,17 
+721,19 @@ int main(int argc, char* argv[]) // initialize the library openfpm_init(&argc,&argv); +#if !defined(CUDA_ON_CPU) && !defined(__HIP__) cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); +#endif // It contain for each time-step the value detected by the probes openfpm::vector<openfpm::vector<real_number>> press_t; openfpm::vector<Point<3,real_number>> probes; - probes.add({0.8779,0.3,0.02}); - probes.add({0.754,0.31,0.02}); + probes.add({0.8779f,0.3f,0.02f}); + probes.add({0.754f,0.31f,0.02f}); // Here we define our domain a 2D box with internals from 0 to 1.0 for x and y - Box<3,real_number> domain({-0.05,-0.05,-0.05},{1.7010,0.7065,0.511}); + Box<3,real_number> domain({-0.05f,-0.05f,-0.05f},{1.7010f,0.7065f,0.511f}); size_t sz[3] = {413,179,133}; // Fill W_dap @@ -748,7 +751,7 @@ int main(int argc, char* argv[]) // You can ignore all these dp/2.0 is a trick to reach the same initialization // of Dual-SPH that use a different criteria to draw particles - Box<3,real_number> fluid_box({dp/2.0,dp/2.0,dp/2.0},{0.4+dp/2.0,0.67-dp/2.0,0.3+dp/2.0}); + Box<3,real_number> fluid_box({dp/2.0f,dp/2.0f,dp/2.0f},{0.4f+dp/2.0f,0.67f-dp/2.0f,0.3f+dp/2.0f}); // return an iterator to the fluid particles to add to vd auto fluid_it = DrawParticles::DrawBox(vd,sz,domain,fluid_box); @@ -797,12 +800,12 @@ int main(int argc, char* argv[]) } // Recipient - Box<3,real_number> recipient1({0.0,0.0,0.0},{1.6+dp/2.0,0.67+dp/2.0,0.4+dp/2.0}); - Box<3,real_number> recipient2({dp,dp,dp},{1.6-dp/2.0,0.67-dp/2.0,0.4+dp/2.0}); + Box<3,real_number> recipient1({0.0f,0.0f,0.0f},{1.6f+dp/2.0f,0.67f+dp/2.0f,0.4f+dp/2.0f}); + Box<3,real_number> recipient2({dp,dp,dp},{1.6f-dp/2.0f,0.67f-dp/2.0f,0.4f+dp/2.0f}); - Box<3,real_number> obstacle1({0.9,0.24-dp/2.0,0.0},{1.02+dp/2.0,0.36,0.45+dp/2.0}); - Box<3,real_number> obstacle2({0.9+dp,0.24+dp/2.0,0.0},{1.02-dp/2.0,0.36-dp,0.45-dp/2.0}); - Box<3,real_number> obstacle3({0.9+dp,0.24,0.0},{1.02,0.36,0.45}); + Box<3,real_number> 
obstacle1({0.9f,0.24f-dp/2.0f,0.0f},{1.02f+dp/2.0f,0.36f,0.45f+dp/2.0f}); + Box<3,real_number> obstacle2({0.9f+dp,0.24f+dp/2.0f,0.0f},{1.02f-dp/2.0f,0.36f-dp,0.45f-dp/2.0f}); + Box<3,real_number> obstacle3({0.9f+dp,0.24f,0.0f},{1.02f,0.36f,0.45f}); openfpm::vector<Box<3,real_number>> holes; holes.add(recipient2); @@ -869,8 +872,7 @@ int main(int argc, char* argv[]) // Ok the initialization is done on CPU on GPU we are doing the main loop, so first we offload all properties on GPU vd.hostToDevicePos(); - vd.template hostToDeviceProp<type,rho,rho_prev,Pressure,velocity>(); - + vd.template hostToDeviceProp<type,rho,rho_prev,Pressure,velocity,velocity_prev>(); vd.ghost_get<type,rho,Pressure,velocity>(RUN_ON_DEVICE); @@ -889,7 +891,6 @@ int main(int argc, char* argv[]) Vcluster<> & v_cl = create_vcluster(); timer it_time; - ////// Do rebalancing every 200 timesteps it_reb++; if (it_reb == 300) @@ -913,7 +914,7 @@ int main(int argc, char* argv[]) // it sort the vector (doesn not seem to produce some advantage) // note force calculation is anyway sorted calculation - vd.make_sort(NN); + //vd.make_sort(NN); // Calculate pressure from the density EqState(vd); @@ -922,10 +923,10 @@ int main(int argc, char* argv[]) vd.ghost_get<type,rho,Pressure,velocity>(RUN_ON_DEVICE); - // Calc forces calc_forces(vd,NN,max_visc,cnt); + // Get the maximum viscosity term across processors v_cl.max(max_visc); v_cl.execute(); diff --git a/example/Vector/7_SPH_dlb_opt/Makefile b/example/Vector/7_SPH_dlb_opt/Makefile index 4fe69bac6969ee9b0ac435a3a7c2d1874ae77fe4..f11bc9dfcdfef0570796ec719e7aed00afff356c 100644 --- a/example/Vector/7_SPH_dlb_opt/Makefile +++ b/example/Vector/7_SPH_dlb_opt/Makefile @@ -13,14 +13,14 @@ sph_dlb_test: OPT += -DTEST_RUN sph_dlb_test: sph_dlb %.o: %.cpp - $(CC) -O3 $(OPT) -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH) sph_dlb: $(OBJ) $(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) run: sph_dlb_test - mpirun -np 2 
./sph_dlb + mpirun --oversubscribe -np 2 ./sph_dlb .PHONY: clean all run diff --git a/example/Vector/8_DEM/Makefile b/example/Vector/8_DEM/Makefile index c72e331b7126b8344d8b0856a6154bfd0fa04613..180412c2047b05c85a3b2cfc23c9cd50ada472c9 100644 --- a/example/Vector/8_DEM/Makefile +++ b/example/Vector/8_DEM/Makefile @@ -14,14 +14,14 @@ dem_test: OPT += -DTEST_RUN dem_test: all %.o: %.cpp - $(CC) -O3 $(OPT) -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CC) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH) dem: $(OBJ) $(CC) -o $@ $^ $(OPT) $(CFLAGS) $(LIBS_PATH) $(LIBS) run: dem_test - mpirun -np 2 ./dem + mpirun --oversubscribe -np 2 ./dem .PHONY: clean all run diff --git a/example/Vector/9_gpu_cuda_interop/Makefile b/example/Vector/9_gpu_cuda_interop/Makefile index 696f22c3ce1ffb6410279337197559632cbb1610..0f90d34d705ce4698615cc1b528762eb1163cd13 100644 --- a/example/Vector/9_gpu_cuda_interop/Makefile +++ b/example/Vector/9_gpu_cuda_interop/Makefile @@ -1,26 +1,31 @@ include ../../example.mk - -ifeq (, $(shell which nvcc)) - CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) +ifdef CUDA_ON_CPU + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) INCLUDE_PATH_NVCC= CUDA_CC_LINK=mpic++ CUDA_OPTIONS= else - CUDA_CC=nvcc -ccbin=mpic++ - CUDA_CC_LINK=nvcc -ccbin=mpic++ - CUDA_OPTIONS=-use_fast_math -arch=sm_61 -lineinfo + ifeq (, $(shell which nvcc)) + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) + INCLUDE_PATH_NVCC= + CUDA_CC_LINK=mpic++ + CUDA_OPTIONS= + else + CUDA_CC=nvcc -ccbin=mpic++ + CUDA_CC_LINK=nvcc -ccbin=mpic++ + CUDA_OPTIONS=-use_fast_math -arch=sm_61 -lineinfo + endif endif - OBJ = main.o gpu_interop: %.o: %.cu - $(CUDA_CC) -O3 -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH_NVCC) + $(CUDA_CC) -O3 -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC) %.o: %.cpp - $(CUDA_CC) -O3 $(OPT) -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH) + $(CUDA_CC) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH) gpu_interop: $(OBJ) $(CUDA_CC_LINK) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS) @@ -28,7 +33,7 @@ gpu_interop: 
$(OBJ) all: gpu_interop run: gpu_interop - mpirun -np 2 ./gpu_interop + mpirun --oversubscribe -np 2 ./gpu_interop .PHONY: clean all run diff --git a/gdbgui b/gdbgui new file mode 160000 index 0000000000000000000000000000000000000000..d39f0e88c48e33fc3fc5aa9f98171d4eae4914cb --- /dev/null +++ b/gdbgui @@ -0,0 +1 @@ +Subproject commit d39f0e88c48e33fc3fc5aa9f98171d4eae4914cb diff --git a/gpl-3.0.txt b/gpl-3.0.txt new file mode 100644 index 0000000000000000000000000000000000000000..f288702d2fa16d3cdf0035b15a9fcbc552cd88e7 --- /dev/null +++ b/gpl-3.0.txt @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. 
+ + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. 
+States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. 
+ + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. 
However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. 
+ + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. 
+ + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. 
+ + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. 
If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. 
If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. 
+ + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. 
For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<https://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. 
If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<https://www.gnu.org/licenses/why-not-lgpl.html>. diff --git a/install b/install index 89ba02f1301c46a09fc3136d6976557df8f7441c..fff9a319fedad10cbe57393abada4183b8ca4b91 100755 --- a/install +++ b/install @@ -1,4 +1,5 @@ #! /bin/bash + source script/help source script/discover_os source script/show_solutions @@ -8,6 +9,8 @@ source script/set_mpi source script/conf_PETSC source script/update_openfpm source script/solve_python +source script/install_gdbgui + ### switch to the branch #b_switch=`git rev-parse --abbrev-ref HEAD` @@ -23,7 +26,7 @@ fi ## Check that your home is not empty if [ x"$HOME" == x"" ]; then - echo -e "Your $HOME folder is empty \033[91;5;1m FAILED \033[0m" + echo -e "Your \$HOME folder is empty \033[91;5;1m FAILED \033[0m" exit 1 fi @@ -34,8 +37,9 @@ nomake=0 update_openfpm=0 upgrade_openfpm=0 gpu_support=0 +no_lin=0 -while getopts di:smghc:nu FLAG; do +while getopts di:v:smghc:nul FLAG; do case $FLAG in d) echo "Disable depencencies installation" @@ -67,8 +71,16 @@ while getopts di:smghc:nu FLAG; do ;; n) echo "Upgrading openfpm" - upgrade_openfpm=1 + upgrade_openfpm=1 + ;; + l) + echo "No linear algebra" + no_lin=1 ;; + v) + echo "Relocating openfpm_vars file $OPTARG" + VARS_FILE_LOCATION=$OPTARG + ;; h) #show help HELP ;; @@ -216,7 +228,11 @@ if [ ! -d "$i_dir/PETSC" -o ! -f "$i_dir/PETSC/include/petsc.h" -o ! 
-d "$i_dir/ if [ $sq -eq 0 ]; then read inst_lin_alg else - inst_lin_alg="y" + if [ $no_lin -eq 1 ]; then + inst_lin_alg="n" + else + inst_lin_alg="y" + fi fi ### PETSC require Python @@ -376,7 +392,7 @@ else echo "Error the installation of VCDEVEL failed" exit 1 fi - ./script/install_VCDEVEL.sh $i_dir $ncore + ./script/install_VCDEVEL.sh $i_dir $ncore $CC $CXX configure_options=" $configure_options --with-vcdevel=$i_dir/VCDEVEL " VCDEVEL_installed=1 elif [ $conf_err -ne 0 ]; then @@ -400,25 +416,35 @@ fi install_base=$(cat install_dir) openmp_flags="$(cat openmp_flags)" cuda_include_dirs=$(cat cuda_include) -mpi_include_dirs=$(cat mpi_include) -mpi_libs=$(cat mpi_libs) +hip_enabled=$(cat hip_enabled) +if [ x"$hip_enabled" == x"1" ]; then + mpi_include_dirs=$(cat mpi_include) + mpi_libs=$(cat mpi_libs) +fi +cuda_on_cpu=$(cat cuda_on_cpu) +optional_boost=$(cat optional_boost_libs) if [ -d "$i_dir/HDF5/lib" ]; then hdf5_lib=$i_dir/HDF5/lib + hdf5_lib_dir=-L$i_dir/HDF5/lib elif [ -d "$i_dir/HDF5/lib64" ]; then hdf5_lib=$i_dir/HDF5/lib64 + hdf5_lib_dir=-L$i_dir/HDF5/lib64 fi -echo "INCLUDE_PATH=$cuda_include_dirs -Wno-deprecated-declarations $openmp_flags -I. 
-I$install_base/openfpm_numerics/include -I$install_base/openfpm_pdata/include/config -I$install_base/openfpm_pdata/include -I$install_base/openfpm_data/include -I$install_base/openfpm_vcluster/include -I$install_base/openfpm_io/include -I$install_base/openfpm_devices/include -I$i_dir/VCDEVEL/include -I$i_dir/METIS/include -I$i_dir/PARMETIS/include -I$i_dir/BOOST/include -I$i_dir/HDF5/include -I$i_dir/LIBHILBERT/include $lin_alg_inc" > example.mk -echo "LIBS_PATH=$openmp_flags -L$install_base/openfpm_devices/lib -L$install_base/openfpm_pdata/lib -L$install_base/openfpm_vcluster/lib -L$i_dir/VCDEVEL/lib -L$i_dir/METIS/lib -L$i_dir/PARMETIS/lib -L$i_dir/BOOST/lib -L$hdf5_lib -L$i_dir/LIBHILBERT/lib $lin_alg_dir " >> example.mk +echo "INCLUDE_PATH=$mpi_include_dirs $cuda_include_dirs -Wno-deprecated-declarations $openmp_flags -I. -I$install_base/openfpm_numerics/include -I$install_base/openfpm_pdata/include/config -I$install_base/openfpm_pdata/include -I$install_base/openfpm_data/include -I$install_base/openfpm_vcluster/include -I$install_base/openfpm_io/include -I$install_base/openfpm_devices/include -I$i_dir/VCDEVEL/include -I$i_dir/METIS/include -I$i_dir/PARMETIS/include -I$i_dir/BOOST/include -I$i_dir/HDF5/include -I$i_dir/LIBHILBERT/include $lin_alg_inc" > example.mk +echo "LIBS_PATH=$mpi_libs $openmp_flags -L$install_base/openfpm_devices/lib -L$install_base/openfpm_pdata/lib -L$install_base/openfpm_vcluster/lib -L$i_dir/VCDEVEL/lib -L$i_dir/METIS/lib -L$i_dir/PARMETIS/lib -L$i_dir/BOOST/lib $hdf5_lib_dir -L$i_dir/LIBHILBERT/lib $lin_alg_dir " >> example.mk +if [ x"$cuda_on_cpu" == x"YES" ]; then + echo "CUDA_ON_CPU=YES" >> example.mk +fi if [ x"$gpu_support" == x"1" ]; then - echo "LIBS=-lvcluster -lofpm_pdata -lofpmmemory -lparmetis -lmetis -lboost_iostreams -lboost_program_options -lhdf5 -llibhilbert -lVc $(cat cuda_lib) $lin_alg_lib -ldl -lboost_filesystem -lboost_system" >> example.mk - echo "LIBS_SE2=-lvcluster -lofpmmemory_se2 -lparmetis -lmetis 
-lboost_iostreams -lboost_program_options -lhdf5 -llibhilbert -lVc $(cat cuda_lib) $lin_alg_lib -lboost_filesystem -lboost_system" >> example.mk + echo "LIBS=$mpi_libs -lvcluster -lofpm_pdata -lofpmmemory -lparmetis -lmetis -lboost_iostreams -lboost_program_options -lhdf5 -llibhilbert -lVc $(cat cuda_lib) $lin_alg_lib -ldl -lboost_filesystem -lboost_system" >> example.mk + echo "LIBS_CUDA_ON_CPU=-lvcluster_cuda_on_cpu -lofpmmemory_cuda_on_cpu -lparmetis -lmetis -lboost_iostreams -lboost_program_options -lhdf5 -llibhilbert -lVc $(cat cuda_lib) $lin_alg_lib -lboost_filesystem -lboost_system -lboost_context" >> example.mk else - echo "LIBS=-lvcluster -lofpm_pdata -lofpmmemory -lparmetis -lmetis -lboost_iostreams -lboost_program_options -lhdf5 -llibhilbert -lVc $lin_alg_lib -ldl -lboost_filesystem -lboost_system" >> example.mk - echo "LIBS_SE2=-lvcluster -lofpmmemory_se2 -lparmetis -lmetis -lboost_iostreams -lboost_program_options -lhdf5 -llibhilbert -lVc $lin_alg_lib -lboost_filesystem -lboost_system" >> example.mk + echo "LIBS=-lvcluster -lofpm_pdata -lofpmmemory -lparmetis -lmetis -lboost_iostreams -lboost_program_options -lhdf5 -llibhilbert -lVc $lin_alg_lib -ldl -lboost_filesystem -lboost_system $optional_boost" >> example.mk + echo "LIBS_CUDA_ON_CPU=\$(LIBS)" >> example.mk fi -echo "INCLUDE_PATH_NVCC=-Xcompiler="-Wno-deprecated-declarations" $(cat openmp_flags) "$(cat cuda_options)" -I. -I$install_base/openfpm_numerics/include -I$install_base/openfpm_pdata/include/config -I$install_base/openfpm_pdata/include -I$install_base/openfpm_data/include -I$install_base/openfpm_vcluster/include -I$install_base/openfpm_io/include -I$install_base/openfpm_devices/include -I$i_dir/METIS/include -I$i_dir/PARMETIS/include -I$i_dir/BOOST/include -I$i_dir/HDF5/include -I$i_dir/LIBHILBERT/include $lin_alg_inc" >> example.mk +echo "INCLUDE_PATH_NVCC=-Xcompiler="-Wno-deprecated-declarations" $(cat openmp_flags) "$(cat cuda_options)" $mpi_include_dirs -I. 
-I$install_base/openfpm_numerics/include -I$install_base/openfpm_pdata/include/config -I$install_base/openfpm_pdata/include -I$install_base/openfpm_data/include -I$install_base/openfpm_vcluster/include -I$install_base/openfpm_io/include -I$install_base/openfpm_devices/include -I$i_dir/METIS/include -I$i_dir/PARMETIS/include -I$i_dir/BOOST/include -I$i_dir/HDF5/include -I$i_dir/LIBHILBERT/include $lin_alg_inc" >> example.mk cp example.mk src/example.mk cp example.mk example/example.mk @@ -431,6 +457,13 @@ if [ $? -ne 0 ]; then conf_err=1 fi +################ Parallel debugger ####################### + +echo "Installing parallel debugger" +install_gdbgui $i_dir + +########################################################### + echo "Command used to configure" echo "" echo -e "\033[1m ./configure $options $configure_options \033[0m " @@ -444,6 +477,7 @@ fi bash_path="export PATH=\"" + echo -e "\033[1;34;5m --------------------------------------- \033[0m" echo -e "\033[1;34;5m --------- INSTALLATION REPORT --------- \033[0m" echo "" @@ -532,6 +566,7 @@ if [ x"$platform" == x"cygwin" ]; then bash_path="$bash_path:$i_dir/BOOST/bin:$i_dir/HDF5/bin" fi +bash_path="$bash_path:$install_base/gdbgui/bin:" bash_path="$bash_path:\$PATH\"" bash_library="$bash_library\"" @@ -539,11 +574,16 @@ bash_library="$bash_library\"" # in cygwin we have to add to PATH additional directories -echo "$bash_path" > $HOME/openfpm_vars -echo "$bash_library" >> $HOME/openfpm_vars +if [ x"$VARS_FILE_LOCATION" == x"" ]; then + VARS_FILE_LOCATION=$HOME +fi + +echo "$bash_path" > $VARS_FILE_LOCATION/openfpm_vars +echo "$bash_library" >> $VARS_FILE_LOCATION/openfpm_vars +echo "export PURE_PYTHON=1" >> $VARS_FILE_LOCATION/openfpm_vars if [ x"$platform" = x"osx" ]; then - echo "TMPDIR=/tmp/" >> $HOME/openfpm_vars + echo "TMPDIR=/tmp/" >> $VARS_FILE_LOCATION/openfpm_vars fi echo -e "$installation_report" @@ -562,12 +602,12 @@ fi echo "" if [ x"$platform" = x"linux" ]; then - echo "Before run any openfpm 
program you must execute the following command. A simple way would be to append this line at the end of your $HOME/.bashrc" + echo "Before run any openfpm program you must execute the following command. A simple way would be to append this line at the end of your $VARS_FILE_LOCATION/.bashrc" else - echo "Before run any openfpm program you must execute the following command. A simple way would be to append this line at the end of your $HOME/.bash_profile" + echo "Before run any openfpm program you must execute the following command. A simple way would be to append this line at the end of your $VARS_FILE_LOCATION/.bash_profile" fi echo "" -echo -e "\033[1m source $HOME/openfpm_vars \033[0m" +echo -e "\033[1m source $VARS_FILE_LOCATION/openfpm_vars \033[0m" echo "" echo "" echo -e "Remember to do: \033[91;5;1m make install \033[0m" diff --git a/install_MPI_mpich.sh b/install_MPI_mpich.sh index d346eb6fa90ce6970834fd451438b2329cdeb7ff..a8ce42181005dc020be834e220e144a2f34c83fe 100755 --- a/install_MPI_mpich.sh +++ b/install_MPI_mpich.sh @@ -3,9 +3,32 @@ # check if the directory $1/MPI exist if [ -d "$1/MPI" ]; then - echo "MPI already installed" - exit 0 + version=$(cat $1/MPI/version) + if [ x"$version" != x"10" ]; then + echo -e "\033[1;34;5m -------------------------------------------------------------------------------------- \033[0m" + echo -e "\033[1;34;5m MPICH has been updated to version 3.3.0, the component will be updated automatically \033[0m" + echo -e "\033[1;34;5m -------------------------------------------------------------------------------------- \033[0m" + sleep 5 + rm -rf $1/MPI/include + rm -rf $1/MPI/lib + rm -rf $1/MPI/bin + rm -rf $1/MPI/etc + rm -rf $1/MPI/share + rm -rf $1/MPI + rm -rf $1/HDF5 + rm -rf $1/ZLIB + rm -rf $1/PARMETIS + rm -rf $1/PETSC + rm -rf $1/TRILINOS + rm -rf $1/HYPRE + rm -rf $1/MUMPS + rm -rf $1/SUPERLU_DIST + else + echo "MPI already installed" + exit 0 + fi fi + rm -rf mpich-3.3 rm mpich-3.3.tar.gz wget 
http://ppmcore.mpi-cbg.de/upload/mpich-3.3.tar.gz @@ -37,5 +60,5 @@ make -j $2 make install # Mark the installation -echo 6 > $1/MPI/version +echo 10 > $1/MPI/version diff --git a/openfpm_data b/openfpm_data index 42396dea71e4df42c7aa95ca2c394db9317927c1..d78de3919144686d1e88a2c46a88ad3fa2a79043 160000 --- a/openfpm_data +++ b/openfpm_data @@ -1 +1 @@ -Subproject commit 42396dea71e4df42c7aa95ca2c394db9317927c1 +Subproject commit d78de3919144686d1e88a2c46a88ad3fa2a79043 diff --git a/openfpm_devices b/openfpm_devices index aa00b03d64ff91d34060ae4d8b01a5b19896f5a7..a7c955b6720dd482b2d2258b27dd4e6148230792 160000 --- a/openfpm_devices +++ b/openfpm_devices @@ -1 +1 @@ -Subproject commit aa00b03d64ff91d34060ae4d8b01a5b19896f5a7 +Subproject commit a7c955b6720dd482b2d2258b27dd4e6148230792 diff --git a/openfpm_io b/openfpm_io index 5e408377640e8bd56b7d5e1905aca49c34655656..88095d3038172a62770f6320cfce84f82f25fed0 160000 --- a/openfpm_io +++ b/openfpm_io @@ -1 +1 @@ -Subproject commit 5e408377640e8bd56b7d5e1905aca49c34655656 +Subproject commit 88095d3038172a62770f6320cfce84f82f25fed0 diff --git a/openfpm_pdata.doc b/openfpm_pdata.doc index 1ecffa9ccc680a2e2aafe5aa285dbff1a449d816..c29086dba9ff93b6146095ab3273bb9128d9b357 100644 --- a/openfpm_pdata.doc +++ b/openfpm_pdata.doc @@ -38,7 +38,7 @@ PROJECT_NAME = "OpenFPM_pdata" # could be handy for archiving the generated documentation or if some version # control system is used. 
-PROJECT_NUMBER = 3.1.0 +PROJECT_NUMBER = 3.3.0 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/openfpm_vcluster b/openfpm_vcluster index 98c601ed8d8b6f17c349ea479aa98ba22b90a069..974a494a7cac5d47ae04921393d09f57fd3039f8 160000 --- a/openfpm_vcluster +++ b/openfpm_vcluster @@ -1 +1 @@ -Subproject commit 98c601ed8d8b6f17c349ea479aa98ba22b90a069 +Subproject commit 974a494a7cac5d47ae04921393d09f57fd3039f8 diff --git a/script/detect_gcc b/script/detect_gcc index 24a5a5347a15799f827eca81ccbc4c82faefa6b2..0cbcf16b8e7531f342245f6b5388a56992d381ac 100755 --- a/script/detect_gcc +++ b/script/detect_gcc @@ -218,7 +218,14 @@ function detect_compiler() echo "Two different valid compilers has been found please choose one" commands[0]="icpc" commands[1]="g++" - possible_solutions "${commands[@]}" + + choice=$(cat default_choice_compiler) + if [ -f default_choice_compiler ]; then + possible_solutions_command=$(cat default_choice_compiler) + echo "Choosen: $possible_solutions_command" + else + possible_solutions "${commands[@]}" + fi if [ x"$possible_solutions_command" == x"icpc" ]; then CXX=icpc CC=icc diff --git a/script/download_MPI.sh b/script/download_MPI.sh index cd8113605a9055d900406a0d9420b1dd0ae286b9..87c15208e4449bf7ca1a8b0a74bcf82f1c7a880f 100755 --- a/script/download_MPI.sh +++ b/script/download_MPI.sh @@ -2,8 +2,8 @@ # check if the directory $1/MPI exist -rm -rf openmpi-4.0.4 -rm openmpi-4.0.4.tar.gz -wget http://ppmcore.mpi-cbg.de/upload/openmpi-4.0.4.tar.gz -tar -xvf openmpi-4.0.4.tar.gz +rm -rf openmpi-4.1.1 +rm openmpi-4.1.1.tar.gz +wget http://ppmcore.mpi-cbg.de/upload/openmpi-4.1.1.tar.gz +tar -xvf openmpi-4.1.1.tar.gz diff --git a/script/help b/script/help index d6492daf9208d3447c4ab4059a8248f1a61941e9..8ef53c443a0d832020cc14c62d1e0b50b92c0739 100755 --- a/script/help +++ b/script/help @@ -14,6 +14,7 @@ function HELP { echo -e " 
\033[1;34m-c\033[0m foward this options to configure" echo -e " \033[1;34m-h\033[0m Displays this help message"\\n echo -e " \033[1;34m-m\033[0m Skip compilation"\\n + echo -e " \033[1;34m-l\033[0m do not install linear algebra"\\n echo -e " \033[1mExample:\033[0m ./install -i /dependencies/here -s -c\"some_options someother_option\""\\n exit 1 } diff --git a/script/install_BOOST.sh b/script/install_BOOST.sh index 723da2c4458ce197211398b6687db9e0bad50675..f401534545f4d58be883d31bc027b75a976b7b82 100755 --- a/script/install_BOOST.sh +++ b/script/install_BOOST.sh @@ -7,9 +7,10 @@ if [ -d "$1/BOOST" ]; then exit 0 fi -wget http://ppmcore.mpi-cbg.de/upload/boost_1_72_0.tar.bz2 -tar -xvf boost_1_72_0.tar.bz2 -cd boost_1_72_0 +rm boost_1_75_0.tar.bz2 +wget http://ppmcore.mpi-cbg.de/upload/boost_1_75_0.tar.bz2 +tar -xvf boost_1_75_0.tar.bz2 +cd boost_1_75_0 if [ x"$4" != x"" ]; then if [ -f $HOME/user-config.jam ]; then mv $HOME/user-config.jam $HOME/user-config.jam_bck @@ -23,10 +24,10 @@ fi ./bootstrap.sh --with-toolset=$3 mkdir $1/BOOST ./b2 -j $2 install --prefix=$1/BOOST -rm -rf boost_1_72_0 +rm -rf boost_1_75_0 if [ -f $HOME/user-config.jam_bck ]; then mv $HOME/user-config.jam_bck $HOME/user-config.jam fi -rm -rf boost_1_72_0.tar.bz2 +rm -rf boost_1_75_0.tar.bz2 diff --git a/script/install_EIGEN.sh b/script/install_EIGEN.sh index b42a5ce9de72621f09bd991b7cc4384b44cf8841..342ee3426be9eb86afcd1ca99f70669915fb0dfd 100755 --- a/script/install_EIGEN.sh +++ b/script/install_EIGEN.sh @@ -32,4 +32,4 @@ rm -rf eigen-eigen-b3f3d4950030/ touch $1/EIGEN/signature_of_eigen3_matrix_library # Mark the installation -echo 2 > $1/EIGEN/version +echo 2 > $1/EIGEN/version_eigen_lib diff --git a/script/install_HDF5.sh b/script/install_HDF5.sh index f44ac3df67fe6ff48e140fe16cd0577bf2d48fcb..6477e1869f69f6904cff24b7229249e0a5ff2725 100755 --- a/script/install_HDF5.sh +++ b/script/install_HDF5.sh @@ -24,7 +24,7 @@ if [ ! 
-d "$1/ZLIB" -a x"$platform" != x"cygwin" ]; then CC=mpicc ./configure --prefix=$1/ZLIB make -j $2 - + cd .. if [ $? -eq 0 ]; then make check install else @@ -37,9 +37,9 @@ else fi ### 1.8.19 does not compile on CYGWIN -wget http://ppmcore.mpi-cbg.de/upload/hdf5-1.10.6.tar.gz -tar -xf hdf5-1.10.6.tar.gz -cd hdf5-1.10.6 +wget http://ppmcore.mpi-cbg.de/upload/hdf5-1.10.7.tar.gz +tar -xf hdf5-1.10.7.tar.gz +cd hdf5-1.10.7 if [ x"$platform" != x"cygwin" ]; then CC=mpicc ./configure --with-zlib=$1/ZLIB --enable-parallel --prefix=$1/HDF5 @@ -54,4 +54,4 @@ if [ $? -ne 0 ]; then echo "HDF5 error installing" exit 0 fi -echo 2 > $1/HDF5/version +echo 3 > $1/HDF5/version diff --git a/script/install_MPI.sh b/script/install_MPI.sh index 57ae2b2c7ee86c3580273dda3637ee7782c7017e..66fa52740360c63d247543d9664dc41d8006b642 100755 --- a/script/install_MPI.sh +++ b/script/install_MPI.sh @@ -8,7 +8,7 @@ if [ -d "$1/MPI" ]; then fi ./script/download_MPI.sh -cd openmpi-4.0.4 +cd openmpi-4.1.1 if [ -f ../mpi_add_options ]; then mpi_options=$(cat ../mpi_add_options) @@ -21,14 +21,14 @@ if [ x"$3" == x"1" ]; then # Detect where is nvcc cuda_location=$(dirname $(dirname $(which nvcc)) ) - ./configure $mpi_options --with-cuda=$cuda_location --prefix=$1/MPI --enable-mpi-fortran=yes CC=$4 CXX=$5 F77=$6 FC=$7 $8 + ./configure --with-hwloc=internal --with-libevent=internal $mpi_options --with-cuda=$cuda_location --prefix=$1/MPI --enable-mpi-fortran=yes CC=$4 CXX=$5 F77=$6 FC=$7 $8 else echo "Installing MPI without GPU support" - ./configure $mpi_options --prefix=$1/MPI --enable-mpi-fortran=yes CC=$4 CXX=$5 F77=$6 FC=$7 $8 + ./configure --with-hwloc=internal --with-libevent=internal $mpi_options --prefix=$1/MPI --enable-mpi-fortran=yes CC=$4 CXX=$5 F77=$6 FC=$7 $8 fi make -j $2 make install # Mark the installation -echo 6 > $1/MPI/version +echo 10 > $1/MPI/version diff --git a/script/install_OPENBLAS.sh b/script/install_OPENBLAS.sh index 
59ba51656e322e61b7c52098c334e60b206f0c81..0b93190d753dafddaa576e3a94e49690be8a34b1 100755 --- a/script/install_OPENBLAS.sh +++ b/script/install_OPENBLAS.sh @@ -1,5 +1,9 @@ #! /bin/bash +source script/discover_os + +discover_os + # check if the directory $1/OPENBLAS exist if [ -d "$1/OPENBLAS" ]; then @@ -7,10 +11,19 @@ if [ -d "$1/OPENBLAS" ]; then exit 0 fi -rm -rf OpenBLAS-0.3.10.tar.gz -wget http://ppmcore.mpi-cbg.de/upload/OpenBLAS-0.3.10.tar.gz -tar -xf OpenBLAS-0.3.10.tar.gz -cd OpenBLAS-0.3.10 +if [ x"$platform" == x"darwin" ]; then + rm -rf OpenBLAS-0.3.10 + rm -rf OpenBLAS-0.3.10.tar.gz + wget http://ppmcore.mpi-cbg.de/upload/OpenBLAS-0.3.10.tar.gz + tar -xf OpenBLAS-0.3.10.tar.gz + cd OpenBLAS-0.3.10 +else + rm -rf OpenBLAS-0.3.13 + rm -rf OpenBLAS-0.3.13.tar.gz + wget http://ppmcore.mpi-cbg.de/upload/OpenBLAS-0.3.13.tar.gz + tar -xf OpenBLAS-0.3.13.tar.gz + cd OpenBLAS-0.3.13 +fi #wget http://ppmcore.mpi-cbg.de/upload/openblas.diff #patch -p1 < openblas.diff @@ -26,8 +39,9 @@ make install PREFIX=$1/OPENBLAS if [ ! "$(ls -A $1/OPENBLAS)" ]; then rm -rf $1/OPENBLAS else + rm -rf OpenBLAS-0.3.13 rm -rf OpenBLAS-0.3.10 - echo 2 > $1/OPENBLAS/version + echo 3 > $1/OPENBLAS/version exit 0 fi diff --git a/script/install_PETSC.sh b/script/install_PETSC.sh index 6f98250c9714eb419784ec1e8b960d268862584b..12079ff67f90b1f8e8299405ba9316b1d600f8b3 100755 --- a/script/install_PETSC.sh +++ b/script/install_PETSC.sh @@ -19,7 +19,7 @@ source script/solve_python discover_os function test_configure_options() { - cd petsc-3.13.3 + cd petsc-3.14.5 $python_command ./configure COPTFLAGS="-O3 -g" CXXOPTFLAGS="-O3 -g" FOPTFLAGS="-O3 -g" $ldflags_petsc --with-cxx-dialect=C++11 $petsc_openmp --with-mpi-dir=$mpi_dir $configure_options2 --with-debugging=0 error=$? cd .. 
@@ -49,14 +49,14 @@ fi #### Download and uncompress petsc -rm petsc-lite-3.13.3.tar.gz -rm -rf petsc-3.13.3 -wget http://ppmcore.mpi-cbg.de/upload/petsc-lite-3.13.3.tar.gz +rm petsc-lite-3.14.5.tar.gz +rm -rf petsc-3.14.5 +wget http://ppmcore.mpi-cbg.de/upload/petsc-lite-3.14.5.tar.gz if [ $? -ne 0 ]; then echo -e "\033[91;5;1m FAILED! Installation requires an Internet connection \033[0m" exit 1 fi -tar -xf petsc-lite-3.13.3.tar.gz +tar -xf petsc-lite-3.14.5.tar.gz #### @@ -143,15 +143,15 @@ if [ $error -eq 0 ]; then fi -rm petsc-lite-3.13.3.tar.gz -rm -rf petsc-3.13.3 -wget http://ppmcore.mpi-cbg.de/upload/petsc-lite-3.13.3.tar.gz +rm petsc-lite-3.14.5.tar.gz +rm -rf petsc-3.14.5 +wget http://ppmcore.mpi-cbg.de/upload/petsc-lite-3.14.5.tar.gz if [ $? -ne 0 ]; then echo -e "\033[91;5;1m FAILED! Installation requires an Internet connection \033[0m" exit 1 fi -tar -xf petsc-lite-3.13.3.tar.gz -cd petsc-3.13.3 +tar -xf petsc-lite-3.14.5.tar.gz +cd petsc-3.14.5 if [ x"$CXX" != x"icpc" ]; then @@ -176,7 +176,6 @@ else } $python_command ./configure COPTFLAGS="-O3 -g" CXXOPTFLAGS="-O3 -g" FOPTFLAGS="-O3 -g" $ldflags_petsc --with-cxx-dialect=C++11 $petsc_openmp --with-mpi-dir=$mpi_dir $configure_options --prefix=$1/PETSC --with-debugging=0 - fi make all @@ -187,7 +186,7 @@ if [ ! 
"$(ls -A $1/PETSC)" ]; then rm -rf $1/PETSC else #Mark the installation - echo 4 > $1/PETSC/version + echo 6 > $1/PETSC/version exit 0 fi diff --git a/script/install_SUITESPARSE.sh b/script/install_SUITESPARSE.sh index d2ceafc1561aff46c37b7c2cae23cd0171e9c8ca..3aa4bedd8ed917a21ca460ddec73de0e27865fb5 100755 --- a/script/install_SUITESPARSE.sh +++ b/script/install_SUITESPARSE.sh @@ -6,7 +6,7 @@ source script/discover_os discover_os # check if the directory $1/SUITESPARSE exist - +rm -rf SuiteSparse-5.7.2 if [ -d "$1/SUITESPARSE" -a -f "$1/SUITESPARSE/include/umfpack.h" ]; then echo "SUITESPARSE is already installed" exit 0 @@ -35,16 +35,15 @@ if [ x"$platform" == x"cygwin" ]; then fi echo "Compiling SuiteSparse without CUDA (old variable $CUDA)" -LDLIBS="$STS_LIB -lm" LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$1/OPENBLAS/lib" make library -j $2 "CUDA=no" "BLAS=-L$1/OPENBLAS/lib -lopenblas -pthread" "LAPACK=-lopenblas" +LDLIBS="$STS_LIB -lm" LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$1/OPENBLAS/lib" make library -j $2 "CC=$CC" "CXX=$CXX" "CUDA=no" "BLAS=-L$1/OPENBLAS/lib -lopenblas -pthread" "LAPACK=-lopenblas" if [ $? 
!= 0 ]; then echo "Failed to compile SuiteSparse" exit 1 fi echo "Making library" -make library "CUDA=no" "INSTALL=$1/SUITESPARSE" "INSTALL_LIB=$1/SUITESPARSE/lib" "INSTALL_INCLUDE=$1/SUITESPARSE/include" "BLAS=-L$1/OPENBLAS/lib -lopenblas -pthread" "LAPACK=" +make library "CC=$CC" "CXX=$CXX" "CUDA=no" "INSTALL=$1/SUITESPARSE" "INSTALL_LIB=$1/SUITESPARSE/lib" "INSTALL_INCLUDE=$1/SUITESPARSE/include" "BLAS=-L$1/OPENBLAS/lib -lopenblas -pthread" "LAPACK=" echo "Making install" -make install "CUDA=no" "INSTALL=$1/SUITESPARSE" "INSTALL_LIB=$1/SUITESPARSE/lib" "INSTALL_INCLUDE=$1/SUITESPARSE/include" "BLAS=-L$1/OPENBLAS/lib -lopenblas -pthread" "LAPACK=" +make install "CC=$CC" "CXX=$CXX" "CUDA=no" "INSTALL=$1/SUITESPARSE" "INSTALL_LIB=$1/SUITESPARSE/lib" "INSTALL_INCLUDE=$1/SUITESPARSE/include" "BLAS=-L$1/OPENBLAS/lib -lopenblas -pthread" "LAPACK=" # Mark the installation echo 2 > $1/SUITESPARSE/version -rm -rf SuiteSparse rm SuiteSparse-5.7.2.tar.gz diff --git a/script/install_VCDEVEL.sh b/script/install_VCDEVEL.sh index 177e622e095d35a58a5bf93cb840baa80e04d030..28ae094732ee7c9ef27dda04c4290c6f14cae9d3 100755 --- a/script/install_VCDEVEL.sh +++ b/script/install_VCDEVEL.sh @@ -13,7 +13,7 @@ tar -xf Vc-1.4.1.tar.gz cd Vc-1.4.1 mkdir build cd build -cmake -DCMAKE_INSTALL_PREFIX:PATH=$1/VCDEVEL .. +cmake -DCMAKE_INSTALL_PREFIX:PATH=$1/VCDEVEL -DCMAKE_C_COMPILER=$3 -DCMAKE_CXX_COMPILER=$4 .. make make install diff --git a/script/install_gdbgui b/script/install_gdbgui new file mode 100755 index 0000000000000000000000000000000000000000..63d5098805dd320acba9bbeb871460ea7d282b48 --- /dev/null +++ b/script/install_gdbgui @@ -0,0 +1,162 @@ +#! 
/bin/bash + +function install_gdbgui() { +source script/solve_pip3 +source script/solve_pip3_nox +source script/solve_npm +source script/solve_npm_yarn +source script/show_solutions +source script/solve_gdbserver +source script/discover_package_manager +source script/solve_nodejs +source script/solve_python_devel +source script/solve_llvm + +node_js_inst=0 + +if [ x"$platform" == x"osx" ]; then + solve_llvm $platform + + echo "OSX installing lldb-mi" + rm lldb-mi-master.tar.gz + wget http://ppmcore.mpi-cbg.de/upload/lldb-mi-master.tar.gz + tar -xf lldb-mi-master.tar.gz + cd lldb-mi-master + mkdir build + cd build + cmake ../. -DCMAKE_PREFIX_PATH=$1/LLDB-MI + make +fi + +solve_python_devel $platform + +if haveProg node; then + + dgc_major=$(node --version | grep v | sed 's/v\([0-9][0-9]*\)\.\([0-9][0-9]*\)\.\([0-9][0-9]*\)/\1/g') + dgc_middle=$(node --version | grep v | sed 's/v\([0-9][0-9]*\)\.\([0-9][0-9]*\)\.\([0-9][0-9]*\)/\2/g') + dgc_minor=$(node --version | grep v | sed 's/v\([0-9][0-9]*\)\.\([0-9][0-9]*\)\.\([0-9][0-9]*\)/\3/g') + + if [[ ( $dgc_major -ge 10 ) && ( $dgc_middle -ge 14 ) ]]; then + echo -e "node\033[92;1m SUCCESS \033[0m" + node_js_inst=1 + fi +else + echo >&2 + echo -e "node\033[91;5;1m FAILED \033[0m" + node_js_inst=0 +fi + +if [ x"$node_js_inst" = x"0" ]; then + echo "OpenFPM require node.js but it's not installed, searching a solution... 
" + solve_nodejs $platform + + if haveProg node; then + + dgc_major=$(node --version | grep v | sed 's/v\([0-9][0-9]*\)\.\([0-9][0-9]*\)\.\([0-9][0-9]*\)/\1/g') + dgc_middle=$(node --version | grep v | sed 's/v\([0-9][0-9]*\)\.\([0-9][0-9]*\)\.\([0-9][0-9]*\)/\2/g') + dgc_minor=$(node --version | grep v | sed 's/v\([0-9][0-9]*\)\.\([0-9][0-9]*\)\.\([0-9][0-9]*\)/\3/g') + + if [[ ( $dgc_major -ge 10 ) && ( $dgc_middle -ge 14 ) ]]; then + echo -e "node\033[92;1m SUCCESS \033[0m" + fi + else + echo >&2 + echo -e "node\033[91;5;1m FAILED \033[0m" + fi + +fi + +if haveProg gdbserver; then + echo -e "gdbserver\033[92;1m SUCCESS \033[0m" +else + echo >&2 + echo -e "gdbserver\033[91;5;1m FAILED \033[0m" + echo "OpenFPM require gdb but it's not installed, searching a solution... " + solve_gdbserver $platform + if haveProg gdbserver; then + echo -e "gdbserver\033[92;1m SUCCESS \033[0m" + else + echo -e "gdbserver\033[91;5;1m FAILED \033[0m" + echo -e "Installing parallel debugger\033[91;5;1m FAILED \033[0m" + return + fi +fi + +if haveProg pip3; then + echo -e "pip3\033[92;1m SUCCESS \033[0m" +else + echo >&2 + echo -e "pip3\033[91;5;1m FAILED \033[0m" + echo "OpenFPM require pip3 but it's not installed, searching a solution... " + solve_pip3 $platform + if haveProg pip3; then + echo -e "pip3\033[92;1m SUCCESS \033[0m" + else + echo -e "pip3\033[91;5;1m FAILED \033[0m" + echo -e "Installing parallel debugger\033[91;5;1m FAILED \033[0m" + return + fi +fi + +if haveProg nox; then + echo -e "nox\033[92;1m SUCCESS \033[0m" +else + echo >&2 + echo -e "nox\033[91;5;1m FAILED \033[0m" + echo "OpenFPM require nox but it's not installed, searching a solution... 
" + solve_pip3_nox $platform + if haveProg nox; then + echo -e "nox\033[92;1m SUCCESS \033[0m" + else + echo -e "nox\033[91;5;1m FAILED \033[0m" + echo -e "Installing parallel debugger\033[91;5;1m FAILED \033[0m" + return + fi +fi + +if haveProg npm; then + echo -e "npm\033[92;1m SUCCESS \033[0m" +else + echo >&2 + echo -e "npm\033[91;5;1m FAILED \033[0m" + echo "OpenFPM require nox but it's not installed, searching a solution... " + solve_npm $platform + if haveProg npm; then + echo -e "npm\033[92;1m SUCCESS \033[0m" + else + echo -e "npm\033[91;5;1m FAILED \033[0m" + echo -e "Installing parallel debugger\033[91;5;1m FAILED \033[0m" + return + fi +fi + + +if haveProg yarn; then + echo -e "yarn\033[92;1m SUCCESS \033[0m" +else + echo >&2 + echo -e "yarn\033[91;5;1m FAILED \033[0m" + echo "OpenFPM require nox but it's not installed, searching a solution... " + solve_npm_yarn $platform + if haveProg yarn; then + echo -e "yarn\033[92;1m SUCCESS \033[0m" + else + echo -e "yarn\033[91;5;1m FAILED \033[0m" + echo -e "Installing parallel debugger\033[91;5;1m FAILED \033[0m" + return + fi +fi + +cd gdbgui +cd gdbgui-mpi +./compile.sh +cd .. +cd .. + +cd gdbgui +nox -s build_executable_current_platform +cd .. + +} + + diff --git a/script/install_parallel_debugger b/script/install_parallel_debugger new file mode 100755 index 0000000000000000000000000000000000000000..13c7409566d1816d86f106121631668f7c8d1b83 --- /dev/null +++ b/script/install_parallel_debugger @@ -0,0 +1,17 @@ +#! 
/bin/bash +source script/discover_os +discover_os + +if [ x"$platform" == x"$osx" ]; then + exit 0 +fi + +if [ -f gdbgui/executable/linux/gdbgui_0.14.0.2 ]; then + install_d=$(cat install_dir) + mkdir -p "$install_d/gdbgui/bin" + cp gdbgui/executable/linux/gdbgui_0.14.0.2 "$install_d/gdbgui/bin/gdbgui" + cp gdbgui/gdbgui-mpi/print_nodes "$install_d/gdbgui/bin" + cp gdbgui/gdbgui-mpi/launch_mpi_debugger "$install_d/gdbgui/bin" + cp gdbgui/gdbgui-mpi/launch_gdb_server "$install_d/gdbgui/bin" +fi + diff --git a/script/pre_req b/script/pre_req index 755cec9e36ba5c6916090811a6f0d7801beca53b..f5be531abb13a93122e0ae0e1fa9f7445203902b 100755 --- a/script/pre_req +++ b/script/pre_req @@ -220,6 +220,9 @@ if haveProg mpirun; then if [ x"$possible_solutions_ret" == x"0" ]; then get_openmpi_compilation_options call_test_working_mpi_options=1 + elif [ x"$possible_solutions_ret" == x"1" ]; then + call_test_working_mpi_options=1 + fi else echo "OpenMPI is CUDA aware" diff --git a/script/remove_old b/script/remove_old index 25e6e1ef114eaeeae0c311a7c4e37e90548156bf..ce0b9a91d8eea2654be6c6f9839960010eb95319 100755 --- a/script/remove_old +++ b/script/remove_old @@ -76,10 +76,10 @@ function remove_old() ## Check the installed version of the dependencies if [ -d $1/BOOST ]; then - is_update=$(cat $1/BOOST/include/boost/version.hpp | grep "#define BOOST_VERSION 107200") + is_update=$(cat $1/BOOST/include/boost/version.hpp | grep "#define BOOST_VERSION 107500") if [ x"$is_update" == x"" ]; then echo -e "\033[1;34;5m --------------------------------------------------------------------------- \033[0m" - echo -e "\033[1;34;5m Boost has been updated to 1.72, the component will be updated automatically \033[0m" + echo -e "\033[1;34;5m Boost has been updated to 1.75, the component will be updated automatically \033[0m" echo -e "\033[1;34;5m --------------------------------------------------------------------------- \033[0m" sleep 5 rm -rf $1/BOOST/include @@ -143,9 +143,9 @@ function 
remove_old() if [ -d $1/PETSC ]; then version=$(cat $1/PETSC/version) - if [ x"$version" != x"4" ]; then + if [ x"$version" != x"6" ]; then echo -e "\033[1;34;5m -------------------------------------------------------------------------------------- \033[0m" - echo -e "\033[1;34;5m PETSC has been updated to version 3.13.3, the component will be updated automatically \033[0m" + echo -e "\033[1;34;5m PETSC has been updated to version 3.14.5, the component will be updated automatically \033[0m" echo -e "\033[1;34;5m -------------------------------------------------------------------------------------- \033[0m" sleep 5 rm -rf $1/PETSC @@ -158,9 +158,9 @@ function remove_old() if [ -d $1/HDF5 ]; then version=$(cat $1/HDF5/version) - if [ x"$version" != x"2" ]; then + if [ x"$version" != x"3" ]; then echo -e "\033[1;34;5m -------------------------------------------------------------------------------------- \033[0m" - echo -e "\033[1;34;5m HDF5 has been updated to version 1.10.6, the component will be updated automatically \033[0m" + echo -e "\033[1;34;5m HDF5 has been updated to version 1.10.7, the component will be updated automatically \033[0m" echo -e "\033[1;34;5m -------------------------------------------------------------------------------------- \033[0m" sleep 5 rm -rf $1/HDF5 @@ -169,9 +169,9 @@ function remove_old() if [ -d $1/MPI ]; then version=$(cat $1/MPI/version) - if [ x"$version" != x"6" ]; then + if [ x"$version" != x"10" ]; then echo -e "\033[1;34;5m -------------------------------------------------------------------------------------- \033[0m" - echo -e "\033[1;34;5m MPI has been updated to version 4.0.4, the component will be updated automatically \033[0m" + echo -e "\033[1;34;5m MPI has been updated to version 4.1.1, the component will be updated automatically \033[0m" echo -e "\033[1;34;5m -------------------------------------------------------------------------------------- \033[0m" sleep 5 rm -rf $1/MPI/include @@ -192,7 +192,7 @@ function 
remove_old() fi if [ -d $1/EIGEN ]; then - version=$(cat $1/EIGEN/version) + version=$(cat $1/EIGEN/version_eigen_lib) if [ x"$version" != x"2" ]; then echo -e "\033[1;34;5m -------------------------------------------------------------------------------- \033[0m" echo -e "\033[1;34;5m EIGEN has been updated to 3.3.5 , the component will be updated automatically \033[0m" @@ -207,7 +207,7 @@ function remove_old() if [ -d $1/OPENBLAS ]; then version=$(cat $1/OPENBLAS/version) - if [ x"$version" != x"2" ]; then + if [ x"$version" != x"3" ]; then echo -e "\033[1;34;5m ---------------------------------------------------------------------- \033[0m" echo -e "\033[1;34;5m OPENBLAS has been updated, the component will be updated automatically \033[0m" echo -e "\033[1;34;5m ---------------------------------------------------------------------- \033[0m" diff --git a/script/set_mpi b/script/set_mpi index 024faf548033f4952dd569beaaf2a068214f4ea3..ef06cede4b23f3ebeba4546bff627ce039d175bf 100755 --- a/script/set_mpi +++ b/script/set_mpi @@ -4,9 +4,9 @@ function set_mpi() { if [ x"$MPI_valid" == x"yes" ]; then if [ $is_mpi_openmpi -eq 1 ]; then - configure_options="$configure_options CXX=mpic++ --with-mpivendor=openmpi" + configure_options="$configure_options --with-mpivendor=openmpi" else - configure_options="$configure_options CXX=mpic++ " + configure_options="$configure_options " fi else if [ x"$platform" == x"cygwin" ]; then @@ -19,7 +19,7 @@ function set_mpi() fi MPI_installed=1 export PATH="$1/MPI/bin:$PATH" - configure_options="$configure_options CXX=mpic++ --with-mpivendor=openmpi" + configure_options="$configure_options --with-mpivendor=openmpi" fi } @@ -43,52 +43,4 @@ function get_openmpi_compilation_options() done } -function test_working_mpi_options() -{ - script/download_MPI.sh - cd openmpi-3.1.3 - openmpi_working_options=() - for opt in ${openmpi_compilation_options[@]}; do - # prefix and --with-cuda must be avoided - - if [[ $opt == --with-cuda* ]]; then - continue; 
- fi - - if [[ $opt == --prefix* ]]; then - continue; - fi - - if [[ $opt == --enable-mpi-fortran* ]]; then - continue; - fi - - if [[ $opt == CC* ]]; then - continue; - fi - - if [[ $opt == CXX* ]]; then - continue; - fi - - if [[ $opt == FC* ]]; then - continue; - fi - - if [[ $opt == F77* ]]; then - continue; - fi - - echo "Testing ./configure --prefix=$1/MPI --enable-mpi-fortran=yes CC=$4 CXX=$5 F77=$6 FC=$7 $openmpi_working_options" - ./configure --prefix=$1/MPI --enable-mpi-fortran=yes CC=$4 CXX=$5 F77=$6 FC=$7 $openmpi_working_options $opt - - if [ $? -eq 0 ]; then - openmpi_working_options="$openmpi_working_options $opt" - fi - done - - echo "OpenMPI working options: $openmpi_working_options" - - cd .. -} diff --git a/script/show_solutions b/script/show_solutions index d398e6525bc918b1df4e751bbfe79d8ad9921169..bb2fd0ff178083bf37a96984eceb375e5408392b 100755 --- a/script/show_solutions +++ b/script/show_solutions @@ -2,10 +2,6 @@ function possible_solutions { - if [ $sq -eq 1 ]; then - return - fi - sol=1 echo "Possible solutions:" for com in "$@" @@ -18,7 +14,16 @@ function possible_solutions { sol=`expr $sol + 1` echo "$sol exit the script" echo -e "\033[1;34;5mChoose the solution:\033[0m" - read choose + if [ $sq -eq 1 ]; then + if [ -f default_choice ]; then + choose=$(cat default_choice) + else + echo "No solution choosen, continue ... " + return + fi + else + read choose + fi if [ x"$choose" = x"$sol" ]; then exit 1 fi diff --git a/script/solve_gdbserver b/script/solve_gdbserver new file mode 100755 index 0000000000000000000000000000000000000000..a92893f127b4d5dac842896619333271b05fca70 --- /dev/null +++ b/script/solve_gdbserver @@ -0,0 +1,33 @@ +#! 
/bin/bash + +function solve_gdbserver() { +source script/show_solutions +source script/discover_package_manager +discover_package_manager $1 +pcman=$discover_package_manager_ret + +if [ x"$pcman" = x"" ]; then + exit 1 +fi + +if [ x"$1" = x"osx" ]; then + commands[0]="$pcman install gdbserver" + possible_solutions "${commands[@]}" +elif [ x"$1" = x"linux" ]; then + if [ x"$pcman" = x"zypper -n" ]; then + package_name=gdbserver + elif [ x"$pcman" = x"pacman" ]; then + package_name=gdb + elif [ x"$pcman" = x"apt-get" ]; then + package_name=gdbsever + else + package_name=gdb-gdbserver + fi + + pc_install_command "$pcman" + commands[0]="su -c \"$pc_install_command_ret $package_name\"" + commands[1]="sudo $pc_install_command_ret $package_name" + possible_solutions "${commands[@]}" +fi +} + diff --git a/script/solve_llvm b/script/solve_llvm new file mode 100755 index 0000000000000000000000000000000000000000..794daa6aeed22d4db5c15470edb11fffc4a90688 --- /dev/null +++ b/script/solve_llvm @@ -0,0 +1,25 @@ +#! /bin/bash + +function solve_llvm() { +source script/show_solutions +source script/discover_package_manager +discover_package_manager $1 +pcman=$discover_package_manager_ret + +if [ x"$pcman" = x"" ]; then + exit 1 +fi + + +if [ x"$pcman" == x"brew" ]; then + base_llvm_pkg="llvm" +fi + +if [ x"$1" = x"osx" ]; then + pc_install_command "$pcman" + commands[0]="su -c \"$pc_install_command_ret $base_python_pkg \"" + commands[1]="sudo $pc_install_command_ret $base_python_pkg " + possible_solutions "${commands[@]}" +fi +} + diff --git a/script/solve_nodejs b/script/solve_nodejs new file mode 100755 index 0000000000000000000000000000000000000000..1f58a2e96ee411ad7cd7f8055940489b741f935f --- /dev/null +++ b/script/solve_nodejs @@ -0,0 +1,26 @@ +#! 
/bin/bash + +function solve_nodejs() { +source script/show_solutions +source script/discover_package_manager +discover_package_manager $1 +pcman=$discover_package_manager_ret + +if [ x"$pcman" = x"" ]; then + exit 1 +fi + +if [ x"$pcman" == x"zypper -n" ]; then + base_pkg=nodejs10 +else + base_pkg=nodejs +fi + +if [ x"$1" = x"linux" ]; then + pc_install_command "$pcman" + commands[0]="su -c \"$pc_install_command_ret $base_pkg\"" + commands[1]="sudo $pc_install_command_ret $base_pkg" + possible_solutions "${commands[@]}" +fi +} + diff --git a/script/solve_npm b/script/solve_npm new file mode 100755 index 0000000000000000000000000000000000000000..80c466c6a5e7909b5d8ebf0b849a8ded81a041ff --- /dev/null +++ b/script/solve_npm @@ -0,0 +1,24 @@ + +#! /bin/bash + +function solve_npm() { +source script/show_solutions +source script/discover_package_manager +discover_package_manager $1 +pcman=$discover_package_manager_ret + +if [ x"$pcman" = x"" ]; then + exit 1 +fi + +if [ x"$1" = x"osx" ]; then + commands[0]="$pcman install npm" + possible_solutions "${commands[@]}" +elif [ x"$1" = x"linux" ]; then + pc_install_command "$pcman" + commands[0]="su -c \"$pc_install_command_ret npm\"" + commands[1]="sudo $pc_install_command_ret npm" + possible_solutions "${commands[@]}" +fi +} + diff --git a/script/solve_npm_yarn b/script/solve_npm_yarn new file mode 100755 index 0000000000000000000000000000000000000000..126d3423ac3d1fe88ca9e5b30290197141121a90 --- /dev/null +++ b/script/solve_npm_yarn @@ -0,0 +1,12 @@ + +#! /bin/bash + +function solve_npm_yarn() { +source script/show_solutions + +pc_install_command "$pcman" +commands[0]="su -c \"npm install yarn -g\"" +commands[1]="sudo npm install yarn -g" +possible_solutions "${commands[@]}" +} + diff --git a/script/solve_pip3 b/script/solve_pip3 new file mode 100755 index 0000000000000000000000000000000000000000..b3e67bcfdf7c0720f08a0784fa1f06a08f1dcf03 --- /dev/null +++ b/script/solve_pip3 @@ -0,0 +1,29 @@ + +#! 
/bin/bash + +function solve_pip3() { +source script/show_solutions +source script/discover_package_manager +discover_package_manager $1 +pcman=$discover_package_manager_ret + +if [ x"$pcman" = x"" ]; then + exit 1 +fi + +if [ x"$1" = x"osx" ]; then + commands[0]="$pcman install python3-pip" + possible_solutions "${commands[@]}" +elif [ x"$1" = x"linux" ]; then + if [ x"$pcman" == x"pacman" ]; then + package_name=python-pip + else + package_name=python3-pip + fi + pc_install_command "$pcman" + commands[0]="su -c \"$pc_install_command_ret $package_name\"" + commands[1]="sudo $pc_install_command_ret $package_name" + possible_solutions "${commands[@]}" +fi +} + diff --git a/script/solve_pip3_nox b/script/solve_pip3_nox new file mode 100755 index 0000000000000000000000000000000000000000..69a6faf88ec7b9190a6b4d2901beb9c4838bc653 --- /dev/null +++ b/script/solve_pip3_nox @@ -0,0 +1,12 @@ + +#! /bin/bash + +function solve_pip3_nox() { +source script/show_solutions + + +commands[0]="su -c \"pip3 install nox\"" +commands[1]="sudo pip3 install nox" +possible_solutions "${commands[@]}" +} + diff --git a/script/solve_python b/script/solve_python index 59afdc4c88a26360d3827938f18944a959a76ffd..1093519b73e8d3d3ff468ae3a948989b3aed836c 100755 --- a/script/solve_python +++ b/script/solve_python @@ -10,11 +10,12 @@ if [ x"$pcman" = x"" ]; then exit 1 fi + if [ x"$pcman" == x"apt-get" ]; then - apt-cache show python-is-python3 | grep -q "python-is-python3" - if [ $? -eq 0 ]; then - additional_python_pkg="python-is-python3" - fi + apt-cache show python-is-python3 | grep -q "python-is-python3" + if [ $? -eq 0 ]; then + additional_python_pkg="python-is-python3" + fi apt-cache show python | grep -q "python" if [ $? -eq 0 ]; then base_python_pkg="python" @@ -23,10 +24,15 @@ if [ x"$pcman" == x"apt-get" ]; then if [ $? 
-eq 0 ]; then base_python_pkg="python2 $base_python_pkg" fi +elif [ x"$pcman" == x"yum" ]; then + base_python_pkg="python" +elif [ x"$pcman" == x"zypper -n" ]; then + base_python_pkg=python2 + additional_python_pkg=python3 fi if [ x"$1" = x"linux" ]; then - pc_install_command "$pcman" + pc_install_command "$pcman" commands[0]="su -c \"$pc_install_command_ret $base_python_pkg $additional_python_pkg\"" commands[1]="sudo $pc_install_command_ret $base_python_pkg $additional_python_pkg" possible_solutions "${commands[@]}" diff --git a/script/solve_python_devel b/script/solve_python_devel new file mode 100755 index 0000000000000000000000000000000000000000..29e9c7ab702902103fbaddd5e2edee262fe7329d --- /dev/null +++ b/script/solve_python_devel @@ -0,0 +1,31 @@ +#! /bin/bash + +function solve_python_devel() { +source script/show_solutions +source script/discover_package_manager +discover_package_manager $1 +pcman=$discover_package_manager_ret + +if [ x"$pcman" = x"" ]; then + exit 1 +fi + +if [ x"$pcman" == x"zypper -n" ]; then + if [ ! 
-f /usr/include/python3.6m/pyconfig.h ]; then + base_python_pkg=python2-devel + additional_python_pkg=python3-devel + else + return + fi +else + return +fi + +if [ x"$1" = x"linux" ]; then + pc_install_command "$pcman" + commands[0]="su -c \"$pc_install_command_ret $base_python_pkg $additional_python_pkg\"" + commands[1]="sudo $pc_install_command_ret $base_python_pkg $additional_python_pkg" + possible_solutions "${commands[@]}" +fi +} + diff --git a/src/Amr/grid_dist_amr.hpp b/src/Amr/grid_dist_amr.hpp index 49e9173039a4b0d3a7316ad13aa0f42c5b9ce409..7c171bdc415092949eedc40c26fcd199d1bb1e17 100644 --- a/src/Amr/grid_dist_amr.hpp +++ b/src/Amr/grid_dist_amr.hpp @@ -145,7 +145,6 @@ class grid_dist_amr<dim,St,T,AMR_IMPL_TRIVIAL,Decomposition,Memory,device_grid> // openfpm::vector<grid_dist_id<dim,St,T,Decomposition,Memory,device_grid>, HeapMemory, - typename memory_traits_lin<grid_dist_id<dim,St,T,Decomposition,Memory,device_grid>>::type, memory_traits_lin, openfpm::grow_policy_identity,STD_VECTOR> gd_array; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7b72bc4d25a17149f8eb6184b2d84e3569874dce..3148669a74322677e37a8342ffde69d6b35562d7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -4,45 +4,107 @@ add_definitions(-DSCAN_WITH_CUB) ########################### Executables -if(CUDA_FOUND) + +if(CUDA_FOUND OR CUDA_ON_CPU OR HIP_FOUND) set(CUDA_SOURCES - Grid/tests/sgrid_dist_id_gpu_unit_tests.cu - Vector/cuda/vector_dist_gpu_MP_tests.cu - Vector/cuda/vector_dist_cuda_func_test.cu - Decomposition/cuda/decomposition_cuda_tests.cu - Vector/cuda/vector_dist_gpu_unit_tests.cu - ../openfpm_devices/src/memory/CudaMemory.cu - Decomposition/cuda/Domain_icells_cart_unit_test.cu - Amr/tests/amr_base_gpu_unit_tests.cu) + Grid/tests/sgrid_dist_id_gpu_unit_tests.cu + Vector/cuda/vector_dist_gpu_MP_tests.cu + Vector/cuda/vector_dist_cuda_func_test.cu + Decomposition/cuda/decomposition_cuda_tests.cu + Vector/cuda/vector_dist_gpu_unit_tests.cu + 
Decomposition/cuda/Domain_icells_cart_unit_test.cu + Amr/tests/amr_base_gpu_unit_tests.cu) +endif() + +if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") + add_definitions("-DBOOST_MPL_CFG_HAS_TYPEOF") +endif() + +if (CUDA_ON_CPU) + add_definitions(-DCUDA_ON_CPU) + set_source_files_properties(${CUDA_SOURCES} PROPERTIES LANGUAGE CXX) + set_source_files_properties(${CUDA_SOURCES} PROPERTIES COMPILE_FLAGS "-D__NVCC__ -DCUDART_VERSION=11000") + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Intel") + add_definitions("-x c++") + endif() +endif() + +if ( HIP_ENABLE AND HIP_FOUND ) + + list(APPEND HIP_HIPCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) + + if (CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND HIP_HIPCC_FLAGS -O0) + endif() + + list(APPEND HIP_HIPCC_FLAGS -D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0) + set_source_files_properties(${CUDA_SOURCES} PROPERTIES LANGUAGE CXX) + + set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) + + hip_add_executable(pdata ${CUDA_SOURCES} ${OPENFPM_INIT_FILE} main.cpp + Amr/grid_dist_amr_unit_tests.cpp + Amr/tests/amr_base_unit_tests.cpp + Debug/debug_test.cpp + Grid/tests/grid_dist_id_HDF5_chckpnt_restart_test.cpp + Grid/tests/grid_dist_id_unit_test.cpp + Grid/tests/sgrid_dist_id_unit_tests.cpp + Grid/tests/grid_dist_id_dlb_unit_test.cpp + Grid/tests/staggered_grid_dist_unit_test.cpp + Vector/tests/vector_dist_cell_list_tests.cpp + Vector/tests/vector_dist_complex_prp_unit_test.cpp + Vector/tests/vector_dist_HDF5_chckpnt_restart_test.cpp + Vector/tests/vector_dist_MP_unit_tests.cpp + Vector/tests/vector_dist_NN_tests.cpp + Vector/tests/vector_dist_unit_test.cpp + pdata_performance.cpp + Decomposition/tests/CartDecomposition_unit_test.cpp + Decomposition/tests/shift_vect_converter_tests.cpp + Vector/performance/vector_dist_performance_util.cpp + 
lib/pdata.cpp + test_multiple_o.cpp + ) + + + hip_add_library(ofpm_pdata STATIC lib/pdata.cpp) + else() - set(CUDA_SOURCES Vector/vector_dist_subset.hpp) + + add_executable(pdata ${OPENFPM_INIT_FILE} ${CUDA_SOURCES} main.cpp + Amr/grid_dist_amr_unit_tests.cpp + Amr/tests/amr_base_unit_tests.cpp + Debug/debug_test.cpp + Grid/tests/grid_dist_id_HDF5_chckpnt_restart_test.cpp + Grid/tests/grid_dist_id_unit_test.cpp + Grid/tests/sgrid_dist_id_unit_tests.cpp + Grid/tests/grid_dist_id_dlb_unit_test.cpp + Grid/tests/staggered_grid_dist_unit_test.cpp + Vector/tests/vector_dist_cell_list_tests.cpp + Vector/tests/vector_dist_complex_prp_unit_test.cpp + Vector/tests/vector_dist_HDF5_chckpnt_restart_test.cpp + Vector/tests/vector_dist_MP_unit_tests.cpp + Vector/tests/vector_dist_NN_tests.cpp + Vector/tests/vector_dist_unit_test.cpp + pdata_performance.cpp + Decomposition/tests/CartDecomposition_unit_test.cpp + Decomposition/tests/shift_vect_converter_tests.cpp + Vector/performance/vector_dist_performance_util.cpp + lib/pdata.cpp test_multiple_o.cpp) + + add_library(ofpm_pdata STATIC lib/pdata.cpp) + endif() -add_executable(pdata ${OPENFPM_INIT_FILE} ${CUDA_SOURCES} main.cpp - Debug/debug_test.cpp - Amr/grid_dist_amr_unit_tests.cpp - Amr/tests/amr_base_unit_tests.cpp - Debug/debug_test.cpp - Grid/tests/grid_dist_id_HDF5_chckpnt_restart_test.cpp - Grid/tests/grid_dist_id_unit_test.cpp - Grid/tests/sgrid_dist_id_unit_tests.cpp - Grid/tests/grid_dist_id_dlb_unit_test.cpp - Grid/tests/staggered_grid_dist_unit_test.cpp - Vector/tests/vector_dist_cell_list_tests.cpp - Vector/tests/vector_dist_complex_prp_unit_test.cpp - Vector/tests/vector_dist_HDF5_chckpnt_restart_test.cpp - Vector/tests/vector_dist_MP_unit_tests.cpp - Vector/tests/vector_dist_NN_tests.cpp - Vector/tests/vector_dist_unit_test.cpp - pdata_performance.cpp - Decomposition/tests/CartDecomposition_unit_test.cpp - Decomposition/tests/shift_vect_converter_tests.cpp - Vector/performance/vector_dist_performance_util.cpp - 
lib/pdata.cpp test_multiple_o.cpp - ../openfpm_devices/src/memory/HeapMemory.cpp - ../openfpm_devices/src/memory/PtrMemory.cpp - ../openfpm_vcluster/src/VCluster/VCluster.cpp - ) +add_dependencies(pdata ofpmmemory) +add_dependencies(pdata vcluster) + +#add_executable(isolation_pdata ${OPENFPM_INIT_FILE} isolation.cu +# lib/pdata.cpp +# ../openfpm_devices/src/memory/HeapMemory.cpp +# ../openfpm_devices/src/memory/CudaMemory.cu +# ../openfpm_devices/src/memory/PtrMemory.cpp +# ../openfpm_vcluster/src/VCluster/VCluster.cpp +# ) if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") target_compile_options(pdata PRIVATE "-Wno-undefined-var-template") @@ -59,13 +121,17 @@ if ( CMAKE_COMPILER_IS_GNUCC ) endif() endif() +if (CMAKE_CUDA_COMPILER_ID STREQUAL "Clang") + add_definitions(-D__STRICT_ANSI__) +endif() + if (ENABLE_ASAN) target_compile_options(pdata PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: -Xcompiler "-fsanitize=address -fno-optimize-sibling-calls -fsanitize-address-use-after-scope -fno-omit-frame-pointer -g" >) target_compile_options(pdata PRIVATE $<$<COMPILE_LANGUAGE:CXX>: -fsanitize=address -fno-optimize-sibling-calls -fsanitize-address-use-after-scope -fno-omit-frame-pointer -g >) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=address -fno-optimize-sibling-calls -fsanitize-address-use-after-scope -fno-omit-frame-pointer") + add_definitions(-DENABLE_ASAN) endif() -add_library(ofpm_pdata STATIC lib/pdata.cpp) add_test(NAME pdata_3_proc COMMAND mpirun -np 3 ./pdata) add_test(NAME pdata_4_proc COMMAND mpirun -np 4 ./pdata) @@ -73,14 +139,10 @@ add_test(NAME pdata_4_proc COMMAND mpirun -np 4 ./pdata) ########################### if (CUDA_FOUND) - target_compile_options(pdata PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} >) target_include_directories (pdata PUBLIC ${MPI_C_INCLUDE_DIRS}) if (TEST_COVERAGE) target_compile_options(pdata PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: -Xcompiler "-fprofile-arcs -ftest-coverage" >) endif() - if 
(CMAKE_BUILD_TYPE STREQUAL "Debug") - # target_compile_options(pdata PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: -G>) - endif() endif() if(TEST_PERFORMANCE) @@ -88,22 +150,36 @@ if(TEST_PERFORMANCE) endif() target_include_directories (pdata PUBLIC ${PARMETIS_ROOT}/include) target_include_directories (pdata PUBLIC ${METIS_ROOT}/include) +target_include_directories (pdata PUBLIC ${HDF5_ROOT}/include) target_include_directories (pdata PUBLIC ${CUDA_INCLUDE_DIRS}) -target_include_directories (pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) -target_include_directories (pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_devices/src/) -target_include_directories (pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_vcluster/src/) -target_include_directories (pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_data/src/) -target_include_directories (pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_io/src/) -target_include_directories (pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/config) +target_include_directories (pdata PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories (pdata PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_devices/src/) +target_include_directories (pdata PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_vcluster/src/) +target_include_directories (pdata PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_data/src/) +target_include_directories (pdata PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_io/src/) +target_include_directories (pdata PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/config) target_include_directories (pdata PUBLIC ${PETSC_INCLUDES}) -target_include_directories (pdata PUBLIC ${HDF5_ROOT}/include) target_include_directories (pdata PUBLIC ${LIBHILBERT_INCLUDE_DIRS}) -if(EIGEN3_FOUND) - target_include_directories (pdata PUBLIC ${EIGEN3_INCLUDE_DIR}) -endif() - -target_include_directories(pdata PUBLIC ${Vc_INCLUDE_DIR}) +target_include_directories (pdata PUBLIC ${ALPAKA_ROOT}/include) +target_include_directories (pdata PUBLIC ${Vc_INCLUDE_DIR}) 
target_include_directories (pdata PUBLIC ${Boost_INCLUDE_DIRS}) +target_include_directories (pdata PUBLIC ${MPI_C_INCLUDE_DIRS}) + +#target_include_directories (isolation_pdata PUBLIC ${PARMETIS_ROOT}/include) +#target_include_directories (isolation_pdata PUBLIC ${METIS_ROOT}/include) +#target_include_directories (isolation_pdata PUBLIC ${CUDA_INCLUDE_DIRS}) +#target_include_directories (isolation_pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +#target_include_directories (isolation_pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_devices/src/) +#target_include_directories (isolation_pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_vcluster/src/) +#target_include_directories (isolation_pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_data/src/) +#target_include_directories (isolation_pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_io/src/) +#target_include_directories (isolation_pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/config) +#target_include_directories (isolation_pdata PUBLIC ${PETSC_INCLUDES}) +#target_include_directories (isolation_pdata PUBLIC ${HDF5_ROOT}/include) +#target_include_directories (isolation_pdata PUBLIC ${LIBHILBERT_INCLUDE_DIRS}) +#target_include_directories (isolation_pdata PUBLIC ${Vc_INCLUDE_DIR}) +#target_include_directories (isolation_pdata PUBLIC ${Boost_INCLUDE_DIRS}) + target_link_libraries(pdata ${Boost_LIBRARIES}) target_link_libraries(pdata ${PARMETIS_LIBRARIES}) @@ -112,6 +188,19 @@ target_link_libraries(pdata ${HDF5_LIBRARIES}) target_link_libraries(pdata -L${LIBHILBERT_LIBRARY_DIRS} ${LIBHILBERT_LIBRARIES}) target_link_libraries(pdata ${PETSC_LIBRARIES}) target_link_libraries(pdata ${Vc_LIBRARIES}) +target_link_libraries(pdata ${alpaka_LIBRARIES}) +target_link_libraries(pdata ${MPI_C_LIBRARIES}) +target_link_libraries(pdata ${MPI_CXX_LIBRARIES}) +target_link_libraries(pdata vcluster) +target_link_libraries(pdata ofpmmemory) + +#target_link_libraries(isolation_pdata ${Boost_LIBRARIES}) 
+#target_link_libraries(isolation_pdata ${PARMETIS_LIBRARIES}) +#target_link_libraries(isolation_pdata -L${METIS_ROOT}/lib metis) +#target_link_libraries(isolation_pdata ${HDF5_LIBRARIES}) +#target_link_libraries(isolation_pdata -L${LIBHILBERT_LIBRARY_DIRS} ${LIBHILBERT_LIBRARIES}) +#target_link_libraries(isolation_pdata ${PETSC_LIBRARIES}) +#target_link_libraries(isolation_pdata ${Vc_LIBRARIES}) if (TEST_PERFORMANCE) target_link_libraries(pdata ${Boost_FILESYSTEM_LIBRARY}) @@ -125,19 +214,13 @@ if (TEST_COVERAGE) target_link_libraries(pdata -lgcov --coverage) endif() -# Debug! -# Hack found at https://github.com/LLNL/scr/issues/130#issuecomment-402815952 -IF(MPI_CXX_FOUND) - INCLUDE_DIRECTORIES(${MPI_CXX_INCLUDE_PATH}) -# LIST(APPEND SCR_EXTERNAL_LIBS ${MPI_CXX_LIBRARIES}) - target_link_libraries(pdata ${MPI_CXX_LIBRARIES}) -ENDIF(MPI_CXX_FOUND) + target_include_directories (ofpm_pdata PUBLIC ${CUDA_INCLUDE_DIRS}) -target_include_directories (ofpm_pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) -target_include_directories (ofpm_pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_data/src/) -target_include_directories (ofpm_pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/config) -target_include_directories (ofpm_pdata PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_devices/src/) +target_include_directories (ofpm_pdata PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories (ofpm_pdata PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_data/src/) +target_include_directories (ofpm_pdata PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/config) +target_include_directories (ofpm_pdata PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../openfpm_devices/src/) target_include_directories (ofpm_pdata PUBLIC ${Boost_INCLUDE_DIRS}) target_compile_definitions(pdata PRIVATE ${MPI_VENDOR}) @@ -151,7 +234,7 @@ endif() # Request that particles be built with -std=c++11 # As this is a public compile feature anything that links to particles # will also build with -std=c++11 -target_compile_features(pdata PUBLIC 
cxx_std_11) +target_compile_features(pdata PUBLIC cxx_std_14) target_link_libraries(pdata ${MPI_C_LIBRARIES}) target_link_libraries(pdata m) target_link_libraries(pdata c) @@ -159,6 +242,7 @@ if (NOT APPLE) target_link_libraries(pdata rt) endif () + install(FILES Decomposition/CartDecomposition.hpp Decomposition/Domain_icells_cart.hpp Decomposition/shift_vect_converter.hpp @@ -170,7 +254,8 @@ install(FILES Decomposition/CartDecomposition.hpp Decomposition/nn_processor.hpp Decomposition/ie_loc_ghost.hpp Decomposition/ORB.hpp Decomposition/dec_optimizer.hpp - DESTINATION openfpm_pdata/include/Decomposition/ ) + DESTINATION openfpm_pdata/include/Decomposition/ + COMPONENT OpenFPM) install(FILES Decomposition/Distribution/metis_util.hpp Decomposition/Distribution/SpaceDistribution.hpp @@ -178,13 +263,15 @@ install(FILES Decomposition/Distribution/metis_util.hpp Decomposition/Distribution/parmetis_util.hpp Decomposition/Distribution/MetisDistribution.hpp Decomposition/Distribution/ParMetisDistribution.hpp - Decomposition/Distribution/DistParMetisDistribution.hpp - DESTINATION openfpm_pdata/include/Decomposition/Distribution ) + Decomposition/Distribution/DistParMetisDistribution.hpp + Decomposition/Distribution/BoxDistribution.hpp + DESTINATION openfpm_pdata/include/Decomposition/Distribution + COMPONENT OpenFPM) install(FILES Decomposition/cuda/ie_ghost_gpu.cuh Decomposition/cuda/CartDecomposition_gpu.cuh - DESTINATION openfpm_pdata/include/Decomposition/cuda ) - + DESTINATION openfpm_pdata/include/Decomposition/cuda + COMPONENT OpenFPM) install(FILES Grid/grid_dist_id.hpp Grid/grid_dist_id_comm.hpp @@ -193,23 +280,27 @@ install(FILES Grid/grid_dist_id.hpp Grid/staggered_dist_grid.hpp Grid/staggered_dist_grid_util.hpp Grid/staggered_dist_grid_copy.hpp - DESTINATION openfpm_pdata/include/Grid/ ) + DESTINATION openfpm_pdata/include/Grid/ + COMPONENT OpenFPM) install(FILES Grid/cuda/grid_dist_id_kernels.cuh Grid/cuda/grid_dist_id_iterator_gpu.cuh - DESTINATION 
openfpm_pdata/include/Grid/cuda/ ) + DESTINATION openfpm_pdata/include/Grid/cuda/ + COMPONENT OpenFPM) install(FILES Amr/grid_dist_amr_key_iterator.hpp Amr/grid_dist_amr_key.hpp Amr/grid_dist_amr.hpp - DESTINATION openfpm_pdata/include/Amr/ ) + DESTINATION openfpm_pdata/include/Amr/ + COMPONENT OpenFPM) install(FILES Grid/Iterators/grid_dist_id_iterator_util.hpp Grid/Iterators/grid_dist_id_iterator_dec.hpp Grid/Iterators/grid_dist_id_iterator_dec_skin.hpp Grid/Iterators/grid_dist_id_iterator_sub.hpp Grid/Iterators/grid_dist_id_iterator.hpp - DESTINATION openfpm_pdata/include/Grid/Iterators ) + DESTINATION openfpm_pdata/include/Grid/Iterators + COMPONENT OpenFPM) install(FILES Vector/se_class3_vector.hpp @@ -219,43 +310,134 @@ install(FILES Vector/se_class3_vector.hpp Vector/vector_dist_key.hpp Vector/vector_dist_kernel.hpp Vector/vector_dist_subset.hpp - DESTINATION openfpm_pdata/include/Vector ) + DESTINATION openfpm_pdata/include/Vector + COMPONENT OpenFPM) install(FILES util/common_pdata.hpp - DESTINATION openfpm_pdata/include/util) + DESTINATION openfpm_pdata/include/util + COMPONENT OpenFPM) install(FILES Vector/Iterators/vector_dist_iterator.hpp - DESTINATION openfpm_pdata/include/Vector/Iterators/ ) + DESTINATION openfpm_pdata/include/Vector/Iterators/ + COMPONENT OpenFPM) install(FILES Vector/util/vector_dist_funcs.hpp - DESTINATION openfpm_pdata/include/Vector/util ) + DESTINATION openfpm_pdata/include/Vector/util + COMPONENT OpenFPM) install(FILES Vector/cuda/vector_dist_comm_util_funcs.cuh Vector/cuda/vector_dist_cuda_funcs.cuh Vector/cuda/vector_dist_operators_list_ker.hpp - DESTINATION openfpm_pdata/include/Vector/cuda ) + DESTINATION openfpm_pdata/include/Vector/cuda + COMPONENT OpenFPM) install(FILES Graph/ids.hpp Graph/dist_map_graph.hpp Graph/DistGraphFactory.hpp - DESTINATION openfpm_pdata/include/Graph ) + DESTINATION openfpm_pdata/include/Graph + COMPONENT OpenFPM) install(FILES example.mk SubdomainGraphNodes.hpp DESTINATION 
openfpm_pdata/include/ ) install(FILES DLB/DLB.hpp DLB/LB_Model.hpp - DESTINATION openfpm_pdata/include/DLB ) + DESTINATION openfpm_pdata/include/DLB + COMPONENT OpenFPM) install(FILES config/config.h - DESTINATION openfpm_pdata/include/config ) + DESTINATION openfpm_pdata/include/config + COMPONENT OpenFPM) install(FILES lib/pdata.hpp - DESTINATION openfpm_pdata/include/lib ) + DESTINATION openfpm_pdata/include/lib + COMPONENT OpenFPM) install(FILES Debug/debug.hpp - DESTINATION openfpm_pdata/include/Debug ) + DESTINATION openfpm_pdata/include/Debug + COMPONENT OpenFPM) + +install(TARGETS ofpm_pdata EXPORT ofpm_pdata_config DESTINATION openfpm_pdata/lib COMPONENT OpenFPM) + +########## Create openfpmConfig.cmake + openfpmConfigVersion.cmake + +add_library(binary_config INTERFACE) + +target_include_directories( + binary_config + INTERFACE + ${CMAKE_INSTALL_PREFIX}/openfpm_pdata/include + ${CMAKE_INSTALL_PREFIX}/openfpm_data/include + ${CMAKE_INSTALL_PREFIX}/openfpm_pdata/include/config + ${CMAKE_INSTALL_PREFIX}/openfpm_io/include + ${CMAKE_INSTALL_PREFIX}/openfpm_vcluster/include + ${CMAKE_INSTALL_PREFIX}/openfpm_devices/include + ${CMAKE_INSTALL_PREFIX}/openfpm_numerics/include + ${PARMETIS_ROOT}/include + ${METIS_ROOT}/include + ${CUDA_INCLUDE_DIRS} + ${PETSC_INCLUDES} + ${HDF5_ROOT}/include + ${LIBHILBERT_INCLUDE_DIRS} + ${Vc_INCLUDE_DIR} + ${Boost_INCLUDE_DIRS} + ) + +if (CUDA_ON_CPU) + target_compile_definitions(binary_config INTERFACE CUDA_ON_CPU) +endif() -install(TARGETS ofpm_pdata DESTINATION openfpm_pdata/lib) +target_compile_options(binary_config INTERFACE $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} >) + +target_link_libraries(binary_config INTERFACE ${Boost_LIBRARIES}) +target_link_libraries(binary_config INTERFACE ${PARMETIS_LIBRARIES}) +target_link_libraries(binary_config INTERFACE -L${METIS_ROOT}/lib metis) +target_link_libraries(binary_config INTERFACE ${HDF5_LIBRARIES}) +target_link_libraries(binary_config INTERFACE 
-L${LIBHILBERT_LIBRARY_DIRS} ${LIBHILBERT_LIBRARIES}) +target_link_libraries(binary_config INTERFACE ${PETSC_LIBRARIES}) +target_link_libraries(binary_config INTERFACE ${Vc_LIBRARIES}) +target_link_libraries(binary_config INTERFACE ${alpaka_LIBRARIES}) +target_link_libraries(binary_config INTERFACE ${MPI_C_LIBRARIES}) + +# Not OK before CMake 3.13 +#target_link_libraries(binary_config INTERFACE $<INSTALL_PREFIX>/openfpm_vcluster/lib/$<TARGET_FILE_NAME:openfpm::vcluster> ) +#target_link_libraries(binary_config INTERFACE $<INSTALL_PREFIX>/openfpm_devices/lib/$<TARGET_FILE_NAME:openfpm::ofpmmemory> ) +target_link_libraries(binary_config INTERFACE ${CMAKE_INSTALL_PREFIX}/openfpm_vcluster/lib/libvcluster.a ) +target_link_libraries(binary_config INTERFACE ${CMAKE_INSTALL_PREFIX}/openfpm_devices/lib/libofpmmemory.a ) +target_link_libraries(binary_config INTERFACE ${CMAKE_INSTALL_PREFIX}/openfpm_pdata/lib/libofpm_pdata.a ) +target_link_libraries(binary_config INTERFACE ${CUDA_LIBRARIES} ) + +# Does not work before Cmake 3.13 +#install(TARGETS binary_config vcluster ofpmmemory EXPORT openfpm_config CONFIGURATIONS) +install(TARGETS binary_config EXPORT openfpm_config CONFIGURATIONS) + +include(CMakePackageConfigHelpers) +write_basic_package_version_file( + "${CMAKE_CURRENT_BINARY_DIR}/openfpm_cmake/openfpmConfigVersion.cmake" + VERSION ${openfpm_VERSION} + COMPATIBILITY AnyNewerVersion +) + + +install(EXPORT openfpm_config + DESTINATION cmake/ + NAMESPACE openfpm:: + FILE openfpmConfig.cmake) + +install( + FILES + "${CMAKE_CURRENT_BINARY_DIR}/openfpm_cmake/openfpmConfigVersion.cmake" + DESTINATION + cmake/ +) + +install( + FILES + "${CMAKE_CURRENT_SOURCE_DIR}/cmake/openfpmConfigVars-configure.cmake" + DESTINATION + cmake/ +) + +##################################################################### #if(BUILD_TESTING) diff --git a/src/DLB/DLB.hpp b/src/DLB/DLB.hpp index fa04bc538957d865c13ed23ce73f66878917d193..cb72e0c203b5185d6a13985497284228af983b2d 100644 --- 
a/src/DLB/DLB.hpp +++ b/src/DLB/DLB.hpp @@ -9,7 +9,7 @@ #define SRC_DECOMPOSITION_DLB_HPP_ //! Time structure for statistical purposes -typedef struct +struct Times { //! starting time of the simulation (0) size_t simulationStartTime = 0; @@ -27,7 +27,7 @@ typedef struct //! End time size_t iterationEndTime; -} Times; +}; /*! Class that implements the two heuristics to determine when a re-balance of the distribution is needed. * diff --git a/src/Decomposition/CartDecomposition.hpp b/src/Decomposition/CartDecomposition.hpp index c43d2a416bfce37f31d624f08443fdec2e59fcc5..7c52034ff6e7b6af27fa65cd5be945b8938d8e04 100755 --- a/src/Decomposition/CartDecomposition.hpp +++ b/src/Decomposition/CartDecomposition.hpp @@ -43,6 +43,12 @@ #define CARTDEC_ERROR 2000lu +enum dec_options +{ + DEC_NONE = 0, + DEC_SKIP_ICELL = 1 +}; + /*! \brief It spread the sub-sub-domain on a regular cartesian grid of size dim * * \warning this function only guarantee that the division on each direction is @@ -163,16 +169,15 @@ protected: //! acc_key is size_t typedef typename openfpm::vector<SpaceBox<dim, T>, Memory, - typename memory_traits_lin<SpaceBox<dim, T>>::type, memory_traits_lin, openfpm::vector_grow_policy_default, openfpm::vect_isel<SpaceBox<dim, T>>::value>::access_key acc_key; //! the set of all local sub-domain as vector - openfpm::vector<SpaceBox<dim, T>,Memory,typename layout_base<SpaceBox<dim, T>>::type,layout_base> sub_domains; + openfpm::vector<SpaceBox<dim, T>,Memory,layout_base> sub_domains; //! the remote set of all sub-domains as vector of 'sub_domains' vectors - mutable openfpm::vector<Box_map<dim, T>,Memory,typename layout_base<Box_map<dim, T>>::type,layout_base> sub_domains_global; + mutable openfpm::vector<Box_map<dim, T>,Memory,layout_base> sub_domains_global; //! 
for each sub-domain, contain the list of the neighborhood processors openfpm::vector<openfpm::vector<long unsigned int> > box_nn_processor; @@ -277,14 +282,14 @@ protected: return sub_d; } - void collect_all_sub_domains(openfpm::vector<Box_map<dim,T>,Memory,typename layout_base<Box_map<dim, T>>::type,layout_base> & sub_domains_global) + void collect_all_sub_domains(openfpm::vector<Box_map<dim,T>,Memory,layout_base> & sub_domains_global) { #ifdef SE_CLASS2 check_valid(this,8); #endif sub_domains_global.clear(); - openfpm::vector<Box_map<dim,T>,Memory,typename layout_base<Box_map<dim, T>>::type,layout_base> bm; + openfpm::vector<Box_map<dim,T>,Memory,layout_base> bm; for (size_t i = 0 ; i < sub_domains.size() ; i++) { @@ -1345,7 +1350,7 @@ public: /*! \brief Start decomposition * */ - void decompose() + void decompose(dec_options opt = dec_options::DEC_NONE) { reset(); @@ -1361,13 +1366,17 @@ public: domain_nn_calculator_cart<dim>::reset(); domain_nn_calculator_cart<dim>::setParameters(proc_box); - domain_icell_calculator<dim,T,layout_base,Memory> - ::CalculateInternalCells(v_cl, + if (opt != dec_options::DEC_SKIP_ICELL) + { + + domain_icell_calculator<dim,T,layout_base,Memory> + ::CalculateInternalCells(v_cl, ie_ghost<dim, T,Memory,layout_base>::private_get_vb_int_box(), sub_domains, this->getProcessorBounds(), this->getGhost().getRcut(), this->getGhost()); + } } /*! \brief Refine the decomposition, available only for ParMetis distribution, for Metis it is a null call @@ -1577,7 +1586,7 @@ public: return domain; } - const openfpm::vector<SpaceBox<dim, T>,Memory,typename layout_base<SpaceBox<dim, T>>::type,layout_base> & + const openfpm::vector<SpaceBox<dim, T>,Memory,layout_base> & getSubDomains() const { return sub_domains; @@ -1826,7 +1835,7 @@ public: bool write(std::string output) const { //! 
subdomains_X.vtk domain for the local processor (X) as union of sub-domain - VTKWriter<openfpm::vector<SpaceBox<dim, T>,Memory,typename layout_base<SpaceBox<dim, T>>::type,layout_base>, VECTOR_BOX> vtk_box1; + VTKWriter<openfpm::vector<SpaceBox<dim, T>,Memory,layout_base>, VECTOR_BOX> vtk_box1; vtk_box1.add(sub_domains); vtk_box1.write(output + std::string("subdomains_") + std::to_string(v_cl.getProcessUnitID()) + std::string(".vtk")); @@ -2175,7 +2184,7 @@ public: * \return sub_domains_global * */ - openfpm::vector<Box_map<dim, T>,Memory,typename layout_base<Box_map<dim, T>>::type,layout_base> & private_get_sub_domains_global() + openfpm::vector<Box_map<dim, T>,Memory,layout_base> & private_get_sub_domains_global() { return sub_domains_global; } diff --git a/src/Decomposition/Distribution/BoxDistribution.hpp b/src/Decomposition/Distribution/BoxDistribution.hpp index a1a2265270928caaf1a1bfcec404683bead1eec6..13535c5bade04a2c27d101847ed81ad5e79501b0 100644 --- a/src/Decomposition/Distribution/BoxDistribution.hpp +++ b/src/Decomposition/Distribution/BoxDistribution.hpp @@ -228,7 +228,8 @@ public: getPrimeFactors(v_cl.size(),facts); size_t div[dim]; - size_t ln[dim]; + size_t ln[dim]; + double ln_d[dim]; for (int i = 0 ; i < dim ; i++) {div[i] = 1;} @@ -237,7 +238,10 @@ public: {div[i % dim] *= facts.get(i);} for (int i = 0 ; i < dim ; i++) - {ln[i] = gr.size(i) / div[i];} + { + ln[i] = gr.size(i) / div[i]; + ln_d[i] = (double)gr.size(i) / div[i]; + } grid_sm<dim,void> gr_proc(div); @@ -249,10 +253,10 @@ public: for (int i = 0 ; i < dim ; i++) { - key_prc.set_d(i,key.get(i)/ln[i]); - if (key_prc.get(i) >= div[i]) - {key_prc.set_d(i,div[i]-1);} - } + key_prc.set_d(i,key.get(i)/ln_d[i]); + if (key_prc.get(i) >= div[i]) + {key_prc.set_d(i,div[i]-1);} + } size_t i = gr.LinId(key); diff --git a/src/Decomposition/Distribution/Distribution_unit_tests.hpp b/src/Decomposition/Distribution/Distribution_unit_tests.hpp index 
ac8d35ea74113e2f656affd7b8e39020c99a48c3..4836918c9e2c5a6faed7ad14dfec9fb7ec9d6728 100644 --- a/src/Decomposition/Distribution/Distribution_unit_tests.hpp +++ b/src/Decomposition/Distribution/Distribution_unit_tests.hpp @@ -11,6 +11,7 @@ #include "config.h" #include "SpaceDistribution.hpp" #include <unistd.h> +#include "BoxDistribution.hpp" /*! \brief Set a sphere as high computation cost * @@ -423,6 +424,53 @@ BOOST_AUTO_TEST_CASE( Space_distribution_test) } +BOOST_AUTO_TEST_CASE( Box_distribution_test) +{ + Vcluster<> & v_cl = create_vcluster(); + + if (v_cl.size() > 16) + {return;} + + //! [Initialize a ParMetis Cartesian graph and decompose] + + BoxDistribution<3, float> box_dist(v_cl); + + // Physical domain + Box<3, float> box( { 0.0, 0.0, 0.0 }, { 10.0, 10.0, 10.0 }); + + // Grid info + grid_sm<3, void> info( { GS_SIZE, GS_SIZE, GS_SIZE }); + + // Initialize Cart graph and decompose + box_dist.createCartGraph(info,box); + + // First create the center of the weights distribution, check it is coherent to the size of the domain + Point<3, float> center( { 2.0, 2.0, 2.0 }); + + // first decomposition + box_dist.decompose(); + + BOOST_REQUIRE_EQUAL(box_dist.get_ndec(),0ul); + + auto & graph = box_dist.getGraph(); + + for (int i = 0 ; i < graph.getNVertex() ; i++) + { + BOOST_REQUIRE(graph.vertex(i).template get<nm_v_proc_id>() < v_cl.size()); + } + + size_t n_sub = box_dist.getNOwnerSubSubDomains(); + + size_t n_sub_tot = info.size(); + size_t n_sub_bal = n_sub_tot / v_cl.size(); + + BOOST_REQUIRE( (((int)n_sub_bal - 64) <= (long int)n_sub) && (n_sub_bal + 64 >= n_sub) ); + + //! 
[refine with parmetis the decomposition] + +// BOOST_REQUIRE_EQUAL(sizeof(ParMetisDistribution<3,float>),872ul); +} + BOOST_AUTO_TEST_SUITE_END() #endif /* SRC_DECOMPOSITION_DISTRIBUTION_DISTRIBUTION_UNIT_TESTS_HPP_ */ diff --git a/src/Decomposition/Distribution/metis_util.hpp b/src/Decomposition/Distribution/metis_util.hpp index 92ceb7a9218f4fece6775988679c40ac7576c097..fd7d8273757bad531c28ed58f06a8b6abc468187 100644 --- a/src/Decomposition/Distribution/metis_util.hpp +++ b/src/Decomposition/Distribution/metis_util.hpp @@ -133,6 +133,16 @@ class Metis void constructAdjListWithWeights(Graph & g) { // create xadj, adjlist, vwgt, adjwgt and vsize + if (Mg.xadj != NULL) + {delete [] Mg.xadj;} + if (Mg.adjncy != NULL) + {delete [] Mg.adjncy;} + if (Mg.vwgt != NULL) + {delete [] Mg.vwgt;} + if (Mg.adjwgt != NULL) + {delete [] Mg.adjwgt;} + if (Mg.vsize != NULL) + {delete [] Mg.vsize;} Mg.xadj = new idx_t[g.getNVertex() + 1]; Mg.adjncy = new idx_t[g.getNEdge()]; Mg.vwgt = new idx_t[g.getNVertex()]; @@ -387,6 +397,10 @@ public: { delete[] Mg.part; } + if (Mg.vsize != NULL) + { + delete[] Mg.vsize; + } } /*! 
\brief Decompose the graph diff --git a/src/Decomposition/Domain_icells_cart.hpp b/src/Decomposition/Domain_icells_cart.hpp index acd9837e0392c07d0fe94a4f6e144978ab1e1a9d..e2aa69184795b0141a37b7484a64efe9143553af 100644 --- a/src/Decomposition/Domain_icells_cart.hpp +++ b/src/Decomposition/Domain_icells_cart.hpp @@ -26,16 +26,20 @@ __global__ void insert_icell(vector_sparse_type vs, CellDecomposer_type cld, gri unsigned int b = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y; + bool out = false; for (unsigned int i = 0 ; i < dim ; i++) { gk.set_d(i,gk.get(i) + start.get(i)); if (gk.get(i) > stop.get(i)) - {return;} + {out = true;} } - auto id = cld.LinId(gk); + if (out == false) + { + auto id = cld.LinId(gk); - vs.insert_b(id,b); + vs.insert_b(id,b); + } vs.flush_block_insert(b, threadIdx.x == 0 & threadIdx.y == 0 & threadIdx.z == 0 ); } @@ -50,17 +54,21 @@ __global__ void insert_remove_icell(vector_sparse_type vs, vector_sparse_type vs unsigned int b = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y; + bool out = false; for (unsigned int i = 0 ; i < dim ; i++) { gk.set_d(i,gk.get(i) + start.get(i)); if (gk.get(i) > stop.get(i)) - {return;} + {out = true;} } - auto id = cld.LinId(gk); + if (out == false) + { + auto id = cld.LinId(gk); - vs.insert_b(id,b); - vsi.remove_b(id,b); + vs.insert_b(id,b); + vsi.remove_b(id,b); + } vs.flush_block_insert(b, threadIdx.x == 0 & threadIdx.y == 0 & threadIdx.z == 0 ); vsi.flush_block_remove(b, threadIdx.x == 0 & threadIdx.y == 0 & threadIdx.z == 0); @@ -71,14 +79,14 @@ struct CalculateInternalCells_impl { template<typename VCluster_type> static void CalculateInternalCells(VCluster_type & v_cl, - openfpm::vector<Box<dim,T>,Memory,typename layout_base<Box<dim,T>>::type,layout_base> & ig_box, - openfpm::vector<SpaceBox<dim,T>,Memory,typename layout_base<SpaceBox<dim,T>>::type,layout_base> & domain, + openfpm::vector<Box<dim,T>,Memory,layout_base> & ig_box, + 
openfpm::vector<SpaceBox<dim,T>,Memory,layout_base> & domain, Box<dim,T> & pbox, T r_cut, const Ghost<dim,T> & enlarge, CellDecomposer_sm<dim,T,shift<dim,T>> & cd, - openfpm::vector<aggregate<ids_type>,Memory,typename layout_base<aggregate<ids_type>>::type,layout_base> & icells, - openfpm::vector<aggregate<ids_type>,Memory,typename layout_base<aggregate<ids_type>>::type,layout_base> & dcells) + openfpm::vector<aggregate<ids_type>,Memory,layout_base> & icells, + openfpm::vector<aggregate<ids_type>,Memory,layout_base> & dcells) { } @@ -89,16 +97,16 @@ struct CalculateInternalCells_impl<dim,T,layout_base,Memory,cnt_type,ids_type,tr { template<typename VCluster_type> static void CalculateInternalCells(VCluster_type & v_cl, - openfpm::vector<Box<dim,T>,Memory,typename layout_base<Box<dim,T>>::type,layout_base> & ig_box, - openfpm::vector<SpaceBox<dim,T>,Memory,typename layout_base<SpaceBox<dim,T>>::type,layout_base> & domain, + openfpm::vector<Box<dim,T>,Memory,layout_base> & ig_box, + openfpm::vector<SpaceBox<dim,T>,Memory,layout_base> & domain, Box<dim,T> & pbox, T r_cut, const Ghost<dim,T> & enlarge, CellDecomposer_sm<dim,T,shift<dim,T>> & cd, - openfpm::vector<aggregate<ids_type>,Memory,typename layout_base<aggregate<ids_type>>::type,layout_base> & icells, - openfpm::vector<aggregate<ids_type>,Memory,typename layout_base<aggregate<ids_type>>::type,layout_base> & dcells) + openfpm::vector<aggregate<ids_type>,Memory,layout_base> & icells, + openfpm::vector<aggregate<ids_type>,Memory,layout_base> & dcells) { -#ifdef __NVCC__ +#if 0 // Division array size_t div[dim]; @@ -186,6 +194,7 @@ struct CalculateInternalCells_impl<dim,T,layout_base,Memory,cnt_type,ids_type,tr vsi.flush_remove(v_cl.getmgpuContext(),flush_type::FLUSH_ON_DEVICE); } + vs.swapIndexVector(icells); vsi.swapIndexVector(dcells); @@ -202,8 +211,8 @@ class domain_icell_calculator typedef int ids_type; - openfpm::vector<aggregate<ids_type>,Memory,typename layout_base<aggregate<ids_type>>::type,layout_base> 
icells; - openfpm::vector<aggregate<ids_type>,Memory,typename layout_base<aggregate<ids_type>>::type,layout_base> dcells; + openfpm::vector<aggregate<ids_type>,Memory,layout_base> icells; + openfpm::vector<aggregate<ids_type>,Memory,layout_base> dcells; CellDecomposer_sm<dim,T,shift<dim,T>> cd; @@ -246,8 +255,8 @@ class domain_icell_calculator */ template<typename VCluster_type> void CalculateInternalCells(VCluster_type & v_cl, - openfpm::vector<Box<dim,T>,Memory,typename layout_base<Box<dim,T>>::type,layout_base> & ig_box, - openfpm::vector<SpaceBox<dim,T>,Memory,typename layout_base<SpaceBox<dim,T>>::type,layout_base> & domain, + openfpm::vector<Box<dim,T>,Memory,layout_base> & ig_box, + openfpm::vector<SpaceBox<dim,T>,Memory,layout_base> & domain, Box<dim,T> & pbox, T r_cut, const Ghost<dim,T> & enlarge) @@ -262,7 +271,7 @@ class domain_icell_calculator * \return the list of the internal cells * */ - openfpm::vector<aggregate<ids_type>,Memory,typename layout_base<aggregate<ids_type>>::type,layout_base> & getIcells() + openfpm::vector<aggregate<ids_type>,Memory,layout_base> & getIcells() { return icells; } @@ -272,7 +281,7 @@ class domain_icell_calculator * \return the list of the internal cells * */ - openfpm::vector<aggregate<ids_type>,Memory,typename layout_base<aggregate<ids_type>>::type,layout_base> & getDcells() + openfpm::vector<aggregate<ids_type>,Memory,layout_base> & getDcells() { return dcells; } diff --git a/src/Decomposition/cuda/Domain_icells_cart_unit_test.cu b/src/Decomposition/cuda/Domain_icells_cart_unit_test.cu index 5fc8f5db1acd5802bf84beb84b8e76db822c0010..83052d394b248309290cf3a1bd39a98fb4a6bdd4 100644 --- a/src/Decomposition/cuda/Domain_icells_cart_unit_test.cu +++ b/src/Decomposition/cuda/Domain_icells_cart_unit_test.cu @@ -11,6 +11,8 @@ BOOST_AUTO_TEST_SUITE( domain_icells_cart ) BOOST_AUTO_TEST_CASE( domain_icells_use ) { +#if 0 + domain_icell_calculator<3,float,memory_traits_inte,CudaMemory> dcc; openfpm::vector_gpu<SpaceBox<3,float>> 
domain_proc; @@ -133,6 +135,8 @@ BOOST_AUTO_TEST_CASE( domain_icells_use ) for (size_t i = 0 ; i < dcheck.size() ; i++) {BOOST_REQUIRE_EQUAL(dcheck.template get<0>(i),dc.template get<0>(i));} + + #endif } diff --git a/src/Decomposition/cuda/decomposition_cuda_tests.cu b/src/Decomposition/cuda/decomposition_cuda_tests.cu index e578fe633474de975b30b6c54963cabe3edb68c9..a35dc741d71bab4b61f82831c1f3d41ec9b9298c 100644 --- a/src/Decomposition/cuda/decomposition_cuda_tests.cu +++ b/src/Decomposition/cuda/decomposition_cuda_tests.cu @@ -76,7 +76,7 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb CudaMemory mem; mem.allocate(2*sizeof(unsigned int)); - test_proc_idbc<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem.getDevicePointer()); + CUDA_LAUNCH_DIM3((test_proc_idbc<decltype(gpudec)>),1,1,p1,p2,gpudec,(unsigned int *)mem.getDevicePointer()); mem.deviceToHost(); @@ -85,7 +85,7 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb CudaMemory mem2; mem2.allocate(2*sizeof(unsigned int)); - test_ghost_n<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer()); + CUDA_LAUNCH_DIM3((test_ghost_n<decltype(gpudec)>),1,1,p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer()); mem2.deviceToHost(); @@ -93,7 +93,7 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb openfpm::vector_gpu<aggregate<int,int>> vd; vd.resize(tot); - test_ghost<decltype(gpudec),decltype(vd.toKernel())><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer(),vd.toKernel()); + CUDA_LAUNCH_DIM3((test_ghost<decltype(gpudec),decltype(vd.toKernel())>),1,1,p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer(),vd.toKernel()); if (((unsigned int *)mem.getPointer())[0] != ((unsigned int *)mem.getPointer())[1]) { @@ -119,7 +119,7 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb p2.get(j) = 
std::nextafter(SpaceBox<3,double>(dec.getSubDomains().get(i)).getHigh(j),1.0); - test_proc_idbc<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem.getDevicePointer()); + CUDA_LAUNCH_DIM3((test_proc_idbc<decltype(gpudec)>),1,1,p1,p2,gpudec,(unsigned int *)mem.getDevicePointer()); mem.deviceToHost(); @@ -127,14 +127,14 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb BOOST_REQUIRE(((unsigned int *)mem.getPointer())[1] < vcl.size()); mem2.allocate(2*sizeof(unsigned int)); - test_ghost_n<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer()); + CUDA_LAUNCH_DIM3((test_ghost_n<decltype(gpudec)>),1,1,p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer()); mem2.deviceToHost(); tot = ((unsigned int *)mem2.getPointer())[0] + ((unsigned int *)mem2.getPointer())[1]; vd.resize(tot); - test_ghost<decltype(gpudec),decltype(vd.toKernel())><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer(),vd.toKernel()); + CUDA_LAUNCH_DIM3((test_ghost<decltype(gpudec),decltype(vd.toKernel())>),1,1,p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer(),vd.toKernel()); if (((unsigned int *)mem.getPointer())[0] != ((unsigned int *)mem.getPointer())[1]) { diff --git a/src/Decomposition/ie_ghost.hpp b/src/Decomposition/ie_ghost.hpp index cca146970240d20b3733fa44ed218103587b1903..2eadea6bad6427ae702d8ffadb48510c5ccf9a6d 100755 --- a/src/Decomposition/ie_ghost.hpp +++ b/src/Decomposition/ie_ghost.hpp @@ -59,18 +59,18 @@ class ie_ghost openfpm::vector<p_box<dim,T> > vb_ext; //! Internal ghost boxes for this processor domain - openfpm::vector<aggregate<unsigned int,unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int,unsigned int>>::type,layout_base> vb_int; + openfpm::vector<aggregate<unsigned int,unsigned int,unsigned int>,Memory,layout_base> vb_int; //! 
Internal ghost boxes for this processor domain - openfpm::vector<Box<dim,T>,Memory,typename layout_base<Box<dim,T>>::type,layout_base> vb_int_box; + openfpm::vector<Box<dim,T>,Memory,layout_base> vb_int_box; //! Cell-list that store the geometrical information of the internal ghost boxes CellList<dim,T,Mem_fast<Memory,int>,shift<dim,T>> geo_cell; - typedef openfpm::vector<Box<dim,T>,Memory,typename layout_base<Box<dim,T>>::type,layout_base> proc_boxes; + typedef openfpm::vector<Box<dim,T>,Memory,layout_base> proc_boxes; //! shift vectors - openfpm::vector<Point<dim,T>,Memory,typename layout_base<Point<dim,T>>::type,layout_base> shifts; + openfpm::vector<Point<dim,T>,Memory,layout_base> shifts; //! Temporal buffers to return temporal information for ghost_processorID openfpm::vector<std::pair<size_t,size_t>> ids_p; @@ -245,7 +245,7 @@ protected: */ void create_box_nn_processor_ext(Vcluster<> & v_cl, Ghost<dim,T> & ghost, - openfpm::vector<SpaceBox<dim,T>,Memory,typename layout_base<SpaceBox<dim, T>>::type,layout_base> & sub_domains, + openfpm::vector<SpaceBox<dim,T>,Memory,layout_base> & sub_domains, const openfpm::vector<openfpm::vector<long unsigned int> > & box_nn_processor, const nn_prcs<dim,T,layout_base,Memory> & nn_p) { @@ -347,7 +347,7 @@ protected: */ void create_box_nn_processor_int(Vcluster<> & v_cl, Ghost<dim,T> & ghost, - openfpm::vector<SpaceBox<dim,T>,Memory,typename layout_base<SpaceBox<dim, T>>::type,layout_base> & sub_domains, + openfpm::vector<SpaceBox<dim,T>,Memory,layout_base> & sub_domains, const openfpm::vector<openfpm::vector<long unsigned int> > & box_nn_processor, const nn_prcs<dim,T,layout_base,Memory> & nn_p) { @@ -659,7 +659,7 @@ public: * \return the shift vectors * */ - const openfpm::vector<Point<dim,T>,Memory,typename layout_base<Point<dim,T>>::type,layout_base> & getShiftVectors() + const openfpm::vector<Point<dim,T>,Memory,layout_base> & getShiftVectors() { if (host_dev_transfer == false) { @@ -1302,7 +1302,7 @@ public: * \return 
vb_int * */ - inline openfpm::vector<aggregate<unsigned int,unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int,unsigned int>>::type,layout_base> & + inline openfpm::vector<aggregate<unsigned int,unsigned int,unsigned int>,Memory,layout_base> & private_get_vb_int() { return vb_int; @@ -1313,7 +1313,7 @@ public: * \return vb_int_box * */ - inline openfpm::vector<Box<dim,T>,Memory,typename layout_base<Box<dim,T>>::type,layout_base> & + inline openfpm::vector<Box<dim,T>,Memory,layout_base> & private_get_vb_int_box() { return vb_int_box; @@ -1335,7 +1335,7 @@ public: * \return shifts * */ - inline openfpm::vector<Point<dim,T>,Memory,typename layout_base<Point<dim,T>>::type,layout_base> & + inline openfpm::vector<Point<dim,T>,Memory,layout_base> & private_get_shifts() { return shifts; diff --git a/src/Decomposition/ie_loc_ghost.hpp b/src/Decomposition/ie_loc_ghost.hpp index 2e93329ceabc4c6d30e494df95affeb2a98fe2b9..231828e1e71503d525a371be2bb14760799f134e 100755 --- a/src/Decomposition/ie_loc_ghost.hpp +++ b/src/Decomposition/ie_loc_ghost.hpp @@ -40,7 +40,7 @@ class ie_loc_ghost * */ void create_loc_ghost_ebox(Ghost<dim,T> & ghost, - openfpm::vector<SpaceBox<dim,T>,Memory,typename layout_base<SpaceBox<dim, T>>::type,layout_base> & sub_domains, + openfpm::vector<SpaceBox<dim,T>,Memory,layout_base> & sub_domains, openfpm::vector<Box_loc_sub<dim,T>> & sub_domains_prc) { comb<dim> zero; @@ -101,7 +101,7 @@ class ie_loc_ghost * */ void create_loc_ghost_ibox(Ghost<dim,T> & ghost, - openfpm::vector<SpaceBox<dim,T>,Memory,typename layout_base<SpaceBox<dim, T>>::type,layout_base> & sub_domains, + openfpm::vector<SpaceBox<dim,T>,Memory,layout_base> & sub_domains, openfpm::vector<Box_loc_sub<dim,T>> & sub_domains_prc) { comb<dim> zero; @@ -249,7 +249,7 @@ public: * \param bc Boundary conditions * */ - void create(openfpm::vector<SpaceBox<dim,T>,Memory,typename layout_base<SpaceBox<dim, T>>::type,layout_base> & sub_domains, Box<dim,T> & domain 
, Ghost<dim,T> & ghost , const size_t (&bc)[dim] ) + void create(openfpm::vector<SpaceBox<dim,T>,Memory,layout_base> & sub_domains, Box<dim,T> & domain , Ghost<dim,T> & ghost , const size_t (&bc)[dim] ) { // It will store local sub-domains + borders openfpm::vector<Box_loc_sub<dim,T>> sub_domains_prc; diff --git a/src/Decomposition/nn_processor.hpp b/src/Decomposition/nn_processor.hpp index eaf799e6f75928094253e5dd35a0d4fcc6737c45..9cf4f4e46b98bf448f5faed8c593c73404f0caed 100755 --- a/src/Decomposition/nn_processor.hpp +++ b/src/Decomposition/nn_processor.hpp @@ -431,7 +431,7 @@ public: * */ void create(const openfpm::vector<openfpm::vector<long unsigned int> > & box_nn_processor, - const openfpm::vector<SpaceBox<dim,T>,Memory,typename layout_base<SpaceBox<dim, T>>::type,layout_base> & sub_domains) + const openfpm::vector<SpaceBox<dim,T>,Memory,layout_base> & sub_domains) { // produce the list of the adjacent processor (nn_processors) list for (size_t i = 0 ; i < box_nn_processor.size() ; i++) diff --git a/src/Decomposition/shift_vect_converter.hpp b/src/Decomposition/shift_vect_converter.hpp index f640bbd5fd38f12db84e9efca4a7f5dd0b2b9099..8957e2674812c446673c94b2252fb91775729cc7 100644 --- a/src/Decomposition/shift_vect_converter.hpp +++ b/src/Decomposition/shift_vect_converter.hpp @@ -34,7 +34,7 @@ class shift_vect_converter * */ void generateShiftVectors_ld(const Box<dim,T> & domain, size_t (& bc)[dim], - openfpm::vector<Point<dim,T>,Memory,typename layout_base<Point<dim,T>>::type,layout_base> & shifts) + openfpm::vector<Point<dim,T>,Memory,layout_base> & shifts) { shifts.resize(openfpm::math::pow(3,dim)); @@ -71,7 +71,7 @@ class shift_vect_converter * */ void generateShiftVectors_hd(const Box<dim,T> & domain, size_t (& bc)[dim], - openfpm::vector<Point<dim,T>,Memory,typename layout_base<Point<dim,T>>::type,layout_base> & shifts) + openfpm::vector<Point<dim,T>,Memory,layout_base> & shifts) { // get the indexes of the free degree of freedom for (size_t i = 0 ; i 
< dim ; i++) @@ -126,7 +126,7 @@ public: * */ void generateShiftVectors(const Box<dim,T> & domain, size_t (& bc)[dim], - openfpm::vector<Point<dim,T>,Memory,typename layout_base<Point<dim,T>>::type,layout_base> & shifts) + openfpm::vector<Point<dim,T>,Memory,layout_base> & shifts) { if (dim < 10) {generateShiftVectors_ld(domain,bc,shifts);} diff --git a/src/Graph/dist_map_graph.hpp b/src/Graph/dist_map_graph.hpp index 202137c93438d8351fcc22f92143570277be7ba6..e040c8983fb9fda3c5e5cd746ea9c9b4cc04d637 100644 --- a/src/Graph/dist_map_graph.hpp +++ b/src/Graph/dist_map_graph.hpp @@ -221,25 +221,25 @@ class DistGraph_CSR size_t v_slot; //! Structure that store the vertex properties - openfpm::vector<V, Memory, layout_v,layout_v_base,grow_p, openfpm::vect_isel<V>::value> v; + openfpm::vector<V, Memory,layout_v_base,grow_p, openfpm::vect_isel<V>::value> v; //! Structure that store the vertex id and global id - openfpm::vector<v_info, Memory, typename memory_traits_lin<v_info>::type, memory_traits_lin, grow_p, openfpm::vect_isel<v_info>::value> v_m; + openfpm::vector<v_info, Memory, memory_traits_lin, grow_p, openfpm::vect_isel<v_info>::value> v_m; //! Structure that store the number of adjacent vertex in e_l for each vertex - openfpm::vector<size_t, Memory, typename layout_v_base<size_t>::type, layout_v_base, grow_p, openfpm::vect_isel<size_t>::value> v_l; + openfpm::vector<size_t, Memory, layout_v_base, grow_p, openfpm::vect_isel<size_t>::value> v_l; //! Structure that store the edge properties - openfpm::vector<E, Memory, layout_e, layout_e_base, grow_p, openfpm::vect_isel<E>::value> e; + openfpm::vector<E, Memory, layout_e_base, grow_p, openfpm::vect_isel<E>::value> e; //! Structure that store the edge properties - openfpm::vector<e_info, Memory, typename layout_e_base<e_info>::type, layout_e_base, grow_p, openfpm::vect_isel<e_info>::value> e_m; + openfpm::vector<e_info, Memory, layout_e_base, grow_p, openfpm::vect_isel<e_info>::value> e_m; //! 
Structure that store for each vertex the adjacent the vertex id and edge id (for property into e) - openfpm::vector<e_map, Memory, typename memory_traits_lin<e_map>::type, layout_e_base, grow_p, openfpm::vect_isel<e_map>::value> e_l; + openfpm::vector<e_map, Memory, layout_e_base, grow_p, openfpm::vect_isel<e_map>::value> e_l; //! invalid edge element, when a function try to create an in valid edge this object is returned - openfpm::vector<E, Memory, layout_e, layout_e_base, grow_p, openfpm::vect_isel<E>::value> e_invalid; + openfpm::vector<E, Memory, layout_e_base, grow_p, openfpm::vect_isel<E>::value> e_invalid; //! Map to access to the global vertex id given the vertex id std::unordered_map<size_t, size_t> id2glb; @@ -251,7 +251,7 @@ class DistGraph_CSR std::unordered_map<size_t, size_t> glb2loc; //! Struct containing the (sub)graph to send - typedef struct + struct SendGraphPack { //! vertex send buffer openfpm::vector<V> send_v; @@ -267,7 +267,7 @@ class DistGraph_CSR openfpm::vector<size_t> send_es; //! Indicates if the pack is empty or not bool isEmpty = true; - } SendGraphPack; + }; //! Pack storing that data to send to other processors openfpm::vector<SendGraphPack> sgp; @@ -1001,10 +1001,10 @@ public: typedef E E_type; //! Object container for the vertex, for example can be encap<...> (map_grid or openfpm::vector) - typedef typename openfpm::vector<V, Memory, layout_v, layout_v_base, grow_p, openfpm::vect_isel<V>::value>::container V_container; + typedef typename openfpm::vector<V, Memory, layout_v_base, grow_p, openfpm::vect_isel<V>::value>::container V_container; //! Object container for the edge, for example can be encap<...> (map_grid or openfpm::vector) - typedef typename openfpm::vector<E, Memory, layout_e, layout_e_base, grow_p, openfpm::vect_isel<E>::value>::container E_container; + typedef typename openfpm::vector<E, Memory, layout_e_base, grow_p, openfpm::vect_isel<E>::value>::container E_container; /*! 
\brief It duplicate the graph * @@ -1577,7 +1577,7 @@ public: * \return the number of childs * */ - inline size_t getNChilds(typename openfpm::vector<V, Memory, layout_v, layout_v_base, grow_p, openfpm::vect_isel<V>::value>::iterator_key & c) + inline size_t getNChilds(typename openfpm::vector<V, Memory, layout_v_base, grow_p, openfpm::vect_isel<V>::value>::iterator_key & c) { return v_l.template get<0>(c.get()); } @@ -1695,7 +1695,7 @@ public: * \return the target i connected by an edge node, for the node v * */ - inline size_t getChild(typename openfpm::vector<V, Memory, layout_v, layout_v_base, grow_p, openfpm::vect_isel<V>::value>::iterator_key & v, size_t i) + inline size_t getChild(typename openfpm::vector<V, Memory, layout_v_base, grow_p, openfpm::vect_isel<V>::value>::iterator_key & v, size_t i) { #ifdef DEBUG if (i >= v_l.template get<0>(v.get())) diff --git a/src/Grid/Iterators/grid_dist_id_iterator.hpp b/src/Grid/Iterators/grid_dist_id_iterator.hpp index df83f0add9cca4ee213fb560f02e743efa864770..308567b31fff370d39ac7627b29548b65b03c940 100644 --- a/src/Grid/Iterators/grid_dist_id_iterator.hpp +++ b/src/Grid/Iterators/grid_dist_id_iterator.hpp @@ -45,12 +45,14 @@ struct launch_insert_sparse_lambda_call<3> unsigned int blockId, itd_type itd, coord_type & key, - coord_type & keyg,unsigned int offset, bool & is_block_empty) + coord_type & keyg,unsigned int offset, bool & is_block_empty, + bool is_in) { #ifdef __NVCC__ - bool is_active = f1(keyg.get(0),keyg.get(1),keyg.get(2)); - is_active &= key.get(0) >= itd.start_base.get(0) && key.get(1) >= itd.start_base.get(1) && key.get(2) >= itd.start_base.get(2); + bool is_active = false; + if (is_in == true) + {is_active = f1(keyg.get(0),keyg.get(1),keyg.get(2));} if (is_active == true) {is_block_empty = false;} @@ -85,7 +87,8 @@ struct launch_insert_sparse_lambda_call<3> keyg.set_d(1,key.get(1) + itg.origin.get(1)); keyg.set_d(2,key.get(2) + itg.origin.get(2)); - if (key.get(0) > itg.stop.get(0) || key.get(1) > 
itg.stop.get(1) || key.get(2) > itg.stop.get(2)) + if (key.get(0) > itg.stop.get(0) || key.get(1) > itg.stop.get(1) || key.get(2) > itg.stop.get(2) || + key.get(0) < itg.start_base.get(0) || key.get(1) < itg.start_base.get(1) || key.get(2) < itg.start_base.get(2)) {return true;} #endif return false; @@ -101,12 +104,14 @@ struct launch_insert_sparse_lambda_call<2> unsigned int blockId, itd_type itd, coord_type & key, - coord_type & keyg,unsigned int offset, bool & is_block_empty) + coord_type & keyg,unsigned int offset, bool & is_block_empty, + bool is_in) { #ifdef __NVCC__ - bool is_active = f1(keyg.get(0),keyg.get(1)); - is_active &= key.get(0) >= itd.start_base.get(0) && key.get(1) >= itd.start_base.get(1); + bool is_active = false; + if (is_in == true) + {is_active = f1(keyg.get(0),keyg.get(1));} if (is_active == true) {is_block_empty = false;} @@ -138,7 +143,8 @@ struct launch_insert_sparse_lambda_call<2> keyg.set_d(0,key.get(0) + itg.origin.get(0)); keyg.set_d(1,key.get(1) + itg.origin.get(1)); - if (key.get(0) > itg.stop.get(0) || key.get(1) > itg.stop.get(1)) + if (key.get(0) > itg.stop.get(0) || key.get(1) > itg.stop.get(1) || + key.get(0) < itg.start_base.get(0) || key.get(1) < itg.start_base.get(1)) {return true;} #endif return false; @@ -155,20 +161,20 @@ struct launch_insert_sparse grid_key_dx<grid_type::dims,int> key; grid_key_dx<grid_type::dims,int> keyg; - if (launch_insert_sparse_lambda_call<grid_type::dims>::set_keys(key,keyg,itg) == true) {return;} + bool not_active = launch_insert_sparse_lambda_call<grid_type::dims>::set_keys(key,keyg,itg); - if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) - {is_block_empty = true;} + if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) + {is_block_empty = true;} grid.init(); int offset = 0; - grid_key_dx<grid_type::dims,int> blk; - bool out = grid.template getInsertBlockOffset<ite_type>(itg,key,blk,offset); + grid_key_dx<grid_type::dims,int> blk; + bool out = grid.template 
getInsertBlockOffset<ite_type>(itg,key,blk,offset); - auto blockId = grid.getBlockLinId(blk); + auto blockId = grid.getBlockLinId(blk); - launch_insert_sparse_lambda_call<grid_type::dims>::call(grid,f1,f2,blockId,itg,key,keyg,offset,is_block_empty); + launch_insert_sparse_lambda_call<grid_type::dims>::call(grid,f1,f2,blockId,itg,key,keyg,offset,is_block_empty,!not_active); __syncthreads(); diff --git a/src/Grid/cuda/grid_dist_id_iterator_gpu.cuh b/src/Grid/cuda/grid_dist_id_iterator_gpu.cuh index 9fba5a678cca86127179d5ae8659b8dea8addf9d..66d1cbbdad2e316c22a311009a110cd122907de5 100644 --- a/src/Grid/cuda/grid_dist_id_iterator_gpu.cuh +++ b/src/Grid/cuda/grid_dist_id_iterator_gpu.cuh @@ -220,7 +220,9 @@ class grid_dist_id_iterator_gpu Box<Decomposition::dims,int> range_box(start,stop); Box<Decomposition::dims,int> kbox; range_box -= gdb_ext.get(g_c).origin; - range_box.Intersect(gdb_ext.get(g_c).Dbox,kbox); + bool intersect = range_box.Intersect(gdb_ext.get(g_c).Dbox,kbox); + + if (intersect == false) {continue;} auto & lg = loc_grids.get(g_c); @@ -234,7 +236,7 @@ class grid_dist_id_iterator_gpu for (int i = 0 ; i < Decomposition::dims ; i++) { - itd.origin.set_d(i,gdb_ext.get(g_c).origin.get(i) + ite.start.get(i)); + itd.origin.set_d(i,gdb_ext.get(g_c).origin.get(i)); itd.start_base.set_d(i,kbox.getKP1().get(i) % lg.getBlockEdgeSize() + ite.start.get(i)); } diff --git a/src/Grid/cuda/grid_dist_id_kernels.cuh b/src/Grid/cuda/grid_dist_id_kernels.cuh index 2af79459a3d6c82d6f6e8ab0315bc4e4f91eed53..e01fd5199584642bf9a5fa2d4c98a83647985efc 100644 --- a/src/Grid/cuda/grid_dist_id_kernels.cuh +++ b/src/Grid/cuda/grid_dist_id_kernels.cuh @@ -47,24 +47,28 @@ struct ite_gpu_dist key.set_d(1,threadIdx.y + blockIdx.y * blockDim.y + ite_gpu.start.get(1));\ key.set_d(2,threadIdx.z + blockIdx.z * blockDim.z + ite_gpu.start.get(2));\ \ + bool inactive = false;\ + \ keyg.set_d(0,key.get(0) + ite_gpu.origin.get(0));\ keyg.set_d(1,key.get(1) + ite_gpu.origin.get(1));\ 
keyg.set_d(2,key.get(2) + ite_gpu.origin.get(2));\ \ if (key.get(0) > ite_gpu.stop.get(0) || key.get(1) > ite_gpu.stop.get(1) || key.get(2) > ite_gpu.stop.get(2))\ - {return;} + {inactive = true;} #define GRID_ID_2_GLOBAL(ite_gpu) grid_key_dx<2,int> key;\ grid_key_dx<2,int> keyg;\ key.set_d(0,threadIdx.x + blockIdx.x * blockDim.x + ite_gpu.start.get(0));\ key.set_d(1,threadIdx.y + blockIdx.y * blockDim.y + ite_gpu.start.get(1));\ + \ + bool inactive = false;\ \ keyg.set_d(0,key.get(0) + ite_gpu.origin.get(0));\ keyg.set_d(1,key.get(1) + ite_gpu.origin.get(1));\ \ if (key.get(0) > ite_gpu.stop.get(0) || key.get(1) > ite_gpu.stop.get(1))\ - {return;} + {inactive = true;} #endif diff --git a/src/Grid/grid_dist_id.hpp b/src/Grid/grid_dist_id.hpp index 78da5846259b97c6f98b9efffc85ab149d88aef3..efa4a81557ff2396372fef3fbe514c949c01610d 100644 --- a/src/Grid/grid_dist_id.hpp +++ b/src/Grid/grid_dist_id.hpp @@ -26,6 +26,7 @@ #include "grid_dist_id_comm.hpp" #include "HDF5_wr/HDF5_wr.hpp" #include "SparseGrid/SparseGrid.hpp" +#include "lib/pdata.hpp" #ifdef __NVCC__ #include "cuda/grid_dist_id_kernels.cuh" #include "Grid/cuda/grid_dist_id_iterator_gpu.cuh" @@ -863,7 +864,21 @@ class grid_dist_id : public grid_dist_id_comm<dim,St,T,Decomposition,Memory,devi for (size_t i = 0 ; i < dim ; i++) { if (g_sz[i] < 2) - std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " distributed grids with size smaller than 2 are not supported\n"; + {std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " distributed grids with size smaller than 2 are not supported\n";} + } + } + + /*! 
\brief Check the domain is valid + * + * \param dom domain is valid + * + */ + inline void check_domain(const Box<dim,St> & dom) + { + for (size_t i = 0 ; i < dim ; i++) + { + if (dom.getLow(i) >= dom.getHigh(i)) + {std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " error the simulation domain is invalid\n";} } } @@ -944,7 +959,7 @@ class grid_dist_id : public grid_dist_id_comm<dim,St,T,Decomposition,Memory,devi * \param bc boundary conditions * */ - inline void InitializeDecomposition(const size_t (& g_sz)[dim], const size_t (& bc)[dim]) + inline void InitializeDecomposition(const size_t (& g_sz)[dim], const size_t (& bc)[dim], const grid_sm<dim,void> & g_dist = grid_sm<dim,void>()) { // fill the global size of the grid for (size_t i = 0 ; i < dim ; i++) {this->g_sz[i] = g_sz[i];} @@ -960,9 +975,15 @@ class grid_dist_id : public grid_dist_id_comm<dim,St,T,Decomposition,Memory,devi for (size_t i = 0 ; i < dim ; i++) {div[i] = openfpm::math::round_big_2(pow(n_sub,1.0/dim));} + if (g_dist.size(0) != 0) + { + for (size_t i = 0 ; i < dim ; i++) + {div[i] = g_dist.size(i);} + } + // Create the sub-domains dec.setParameters(div,domain,bc,ghost); - dec.decompose(); + dec.decompose(dec_options::DEC_SKIP_ICELL); } /*! 
\brief Initialize the grid @@ -1440,6 +1461,7 @@ public: if (opt >> 32 != 0) {this->setDecompositionGranularity(opt >> 32);} + check_domain(domain); InitializeCellDecomposer(g_sz,p.bc); InitializeDecomposition(g_sz, p.bc); InitializeStructures(g_sz); @@ -1456,7 +1478,7 @@ public: * \warning In very rare case the ghost part can be one point bigger than the one specified * */ - grid_dist_id(const size_t (& g_sz)[dim],const Box<dim,St> & domain, const Ghost<dim,long int> & g, const periodicity<dim> & p, size_t opt = 0) + grid_dist_id(const size_t (& g_sz)[dim],const Box<dim,St> & domain, const Ghost<dim,long int> & g, const periodicity<dim> & p, size_t opt = 0, const grid_sm<dim,void> & g_dec = grid_sm<dim,void>()) :domain(domain),ghost_int(g),dec(create_vcluster()),v_cl(create_vcluster()),ginfo(g_sz),ginfo_v(g_sz) { #ifdef SE_CLASS2 @@ -1466,11 +1488,12 @@ public: if (opt >> 32 != 0) {this->setDecompositionGranularity(opt >> 32);} + check_domain(domain); InitializeCellDecomposer(g_sz,p.bc); ghost = convert_ghost(g,cd_sm); - InitializeDecomposition(g_sz,p.bc); + InitializeDecomposition(g_sz,p.bc,g_dec); // an empty openfpm::vector<Box<dim,long int>> empty; @@ -1479,6 +1502,7 @@ public: InitializeStructures(g_sz,empty,g,false); } + /*! \brief It construct a grid on the full domain restricted * to the set of boxes specified * @@ -1504,6 +1528,7 @@ public: check_new(this,8,GRID_DIST_EVENT,4); #endif + check_domain(domain); InitializeCellDecomposer(g_sz,p.bc); ghost = convert_ghost(g,cd_sm); @@ -1743,6 +1768,11 @@ public: #ifdef __NVCC__ + /*! \brief Insert point in the grid + * + * \param f1 lambda function to insert point + * \param f2 lambda function to set points + */ template<typename lambda_t1, typename lambda_t2> void addPoints(lambda_t1 f1, lambda_t2 f2) { @@ -1752,6 +1782,13 @@ public: it.template launch<1>(launch_insert_sparse(),f1,f2); } + /*! 
\brief Insert point in the grid between start and stop + * + * \param start point + * \param stop point + * \param f1 lambda function to insert point + * \param f2 lambda function to set points + */ template<typename lambda_t1, typename lambda_t2> void addPoints(grid_key_dx<dim> k1, grid_key_dx<dim> k2, lambda_t1 f1, lambda_t2 f2) { @@ -1985,6 +2022,18 @@ public: return v_cl; } + /*! \brief Eliminate many internal temporary buffer you can use this between flushes if you get some out of memory + * + * + */ + void removeUnusedBuffers() + { + for (int i = 0 ; i < loc_grid.size() ; i++) + { + loc_grid.get(i).removeUnusedBuffers(); + } + } + /*! \brief Indicate that this grid is not staggered * * \return false @@ -2074,10 +2123,7 @@ public: * */ template <unsigned int p,typename bg_key>inline auto insert(const grid_dist_key_dx<dim,bg_key> & v1) - -> typename std::add_lvalue_reference - < - decltype(loc_grid.get(v1.getSub()).template insert<p>(v1.getKey())) - >::type + -> decltype(loc_grid.get(v1.getSub()).template insert<p>(v1.getKey())) { #ifdef SE_CLASS2 check_valid(this,8); @@ -2104,10 +2150,7 @@ public: * */ template <unsigned int p,typename bg_key>inline auto insertFlush(const grid_dist_key_dx<dim,bg_key> & v1) - -> typename std::add_lvalue_reference - < - decltype(loc_grid.get(v1.getSub()).template insertFlush<p>(v1.getKey())) - >::type + -> decltype(loc_grid.get(v1.getSub()).template insertFlush<p>(v1.getKey())) { #ifdef SE_CLASS2 check_valid(this,8); @@ -2145,7 +2188,7 @@ public: */ template <unsigned int p, typename bg_key> inline auto get(const grid_dist_key_dx<dim,bg_key> & v1) - -> typename std::add_lvalue_reference<decltype(loc_grid.get(v1.getSub()).template get<p>(v1.getKey()))>::type + -> decltype(loc_grid.get(v1.getSub()).template get<p>(v1.getKey())) { #ifdef SE_CLASS2 check_valid(this,8); @@ -2162,7 +2205,8 @@ public: * */ template <unsigned int p = 0> - inline auto get(const grid_dist_g_dx<device_grid> & v1) const -> typename 
std::add_lvalue_reference<decltype(v1.getSub()->template get<p>(v1.getKey()))>::type + inline auto get(const grid_dist_g_dx<device_grid> & v1) const + -> decltype(v1.getSub()->template get<p>(v1.getKey())) { #ifdef SE_CLASS2 check_valid(this,8); @@ -2179,7 +2223,7 @@ public: * */ template <unsigned int p = 0> - inline auto get(const grid_dist_g_dx<device_grid> & v1) -> typename std::add_lvalue_reference<decltype(v1.getSub()->template get<p>(v1.getKey()))>::type + inline auto get(const grid_dist_g_dx<device_grid> & v1) -> decltype(v1.getSub()->template get<p>(v1.getKey())) { #ifdef SE_CLASS2 check_valid(this,8); @@ -2196,7 +2240,7 @@ public: * */ template <unsigned int p = 0> - inline auto get(const grid_dist_lin_dx & v1) const -> typename std::add_lvalue_reference<decltype(loc_grid.get(v1.getSub()).template get<p>(v1.getKey()))>::type + inline auto get(const grid_dist_lin_dx & v1) const -> decltype(loc_grid.get(v1.getSub()).template get<p>(v1.getKey())) { #ifdef SE_CLASS2 check_valid(this,8); @@ -2213,7 +2257,7 @@ public: * */ template <unsigned int p = 0> - inline auto get(const grid_dist_lin_dx & v1) -> typename std::add_lvalue_reference<decltype(loc_grid.get(v1.getSub()).template get<p>(v1.getKey()))>::type + inline auto get(const grid_dist_lin_dx & v1) -> decltype(loc_grid.get(v1.getSub()).template get<p>(v1.getKey())) { #ifdef SE_CLASS2 check_valid(this,8); @@ -2566,6 +2610,35 @@ public: } } + /*! \brief apply a convolution using the stencil N + * + * + */ + template<unsigned int stencil_size, typename v_type, typename lambda_f, typename ... ArgsT > + void conv_cross_ids(grid_key_dx<3> start, grid_key_dx<3> stop , lambda_f func, ArgsT ... 
args) + { + for (int i = 0 ; i < loc_grid.size() ; i++) + { + Box<dim,long int> inte; + + Box<dim,long int> base; + for (int j = 0 ; j < dim ; j++) + { + base.setLow(j,(long int)start.get(j) - (long int)gdb_ext.get(i).origin.get(j)); + base.setHigh(j,(long int)stop.get(j) - (long int)gdb_ext.get(i).origin.get(j)); + } + + Box<dim,long int> dom = gdb_ext.get(i).Dbox; + + bool overlap = dom.Intersect(base,inte); + + if (overlap == true) + { + loc_grid.get(i).template conv_cross_ids<stencil_size,v_type>(inte.getKP1(),inte.getKP2(),func,args...); + } + } + } + /*! \brief apply a convolution using the stencil N * * @@ -2982,7 +3055,11 @@ public: { // Save the background values T bv; - meta_copy<T>::meta_copy_(bv,loc_grid.get(0).getBackgroundValue()); + + copy_aggregate_dual<decltype(loc_grid.get(0).getBackgroundValue()), + T> ca(loc_grid.get(0).getBackgroundValue(),bv); + + boost::mpl::for_each_ref<boost::mpl::range_c<int,0,T::max_prop>>(ca); if (!(opt & NO_GDB_EXT_SWITCH)) { @@ -3037,8 +3114,32 @@ public: } else { - loc_grid.swap(loc_grid_old); - gdb_ext_old.swap(gdb_ext); + for (int i = 0 ; i < gdb_ext_old.size() ; i++) + { + auto & lg = loc_grid_old.get(i); + auto it_src = lg.getIterator(gdb_ext_old.get(i).Dbox.getKP1(),gdb_ext_old.get(i).Dbox.getKP2()); + auto & dg = loc_grid.get(0); + grid_key_dx<dim> kp1 = gdb_ext.get(0).Dbox.getKP1(); + + grid_key_dx<dim> orig; + for (int j = 0 ; j < dim ; j++) + { + orig.set_d(j,gdb_ext_old.get(i).origin.get(j)); + } + + while (it_src.isNext()) + { + auto key = it_src.get(); + grid_key_dx<dim> key_dst; + + for (int j = 0 ; j < dim ; j++) + {key_dst.set_d(j,key.get(j) + orig.get(j) + kp1.get(j));} + + dg.get_o(key_dst) = lg.get_o(key); + + ++it_src; + } + } } } @@ -3073,6 +3174,34 @@ public: return this->ig_box; } + void print_stats() + { + std::cout << "-- REPORT --" << std::endl; +#ifdef ENABLE_GRID_DIST_ID_PERF_STATS + std::cout << "Processor: " << v_cl.rank() << " Time spent in packing data: " << tot_pack << std::endl; + 
std::cout << "Processor: " << v_cl.rank() << " Time spent in sending and receving data: " << tot_sendrecv << std::endl; + std::cout << "Processor: " << v_cl.rank() << " Time spent in merging: " << tot_merge << std::endl; + std::cout << "Processor: " << v_cl.rank() << " Time spent in local merging: " << tot_loc_merge << std::endl; +#else + + std::cout << "Enable ENABLE_GRID_DIST_ID_PERF_STATS if you want to activate this feature" << std::endl; + +#endif + } + + void clear_stats() + { +#ifdef ENABLE_GRID_DIST_ID_PERF_STATS + tot_pack = 0; + tot_sendrecv = 0; + tot_merge = 0; +#else + + std::cout << "Enable ENABLE_GRID_DIST_ID_PERF_STATS if you want to activate this feature" << std::endl; + +#endif + } + #ifdef __NVCC__ /*! \brief Set the number inserts each GPU thread do @@ -3176,16 +3305,21 @@ public: }; -template<unsigned int dim, typename St, typename T> -using sgrid_dist_id = grid_dist_id<dim,St,T,CartDecomposition<dim,St>,HeapMemory,sgrid_cpu<dim,T,HeapMemory>>; +template<unsigned int dim, typename St, typename T, typename Memory = HeapMemory, typename Decomposition = CartDecomposition<dim,St> > +using sgrid_dist_id = grid_dist_id<dim,St,T,Decomposition,Memory,sgrid_cpu<dim,T,Memory>>; -template<unsigned int dim, typename St, typename T> -using sgrid_dist_soa = grid_dist_id<dim,St,T,CartDecomposition<dim,St>,HeapMemory,sgrid_soa<dim,T,HeapMemory>>; +template<unsigned int dim, typename St, typename T, typename Memory = HeapMemory, typename Decomposition = CartDecomposition<dim,St>> +using sgrid_dist_soa = grid_dist_id<dim,St,T,Decomposition,Memory,sgrid_soa<dim,T,Memory>>; +template<unsigned int dim, typename St, typename T, typename devg, typename Memory = HeapMemory, typename Decomposition = CartDecomposition<dim,St>> +using grid_dist_id_devg = grid_dist_id<dim,St,T,Decomposition,Memory,devg>; #ifdef __NVCC__ -template<unsigned int dim, typename St, typename T> -using sgrid_dist_id_gpu = 
grid_dist_id<dim,St,T,CartDecomposition<dim,St,CudaMemory,memory_traits_inte>,CudaMemory,SparseGridGpu<dim,T>>; +template<unsigned int dim, typename St, typename T, typename Memory = CudaMemory, typename Decomposition = CartDecomposition<dim,St,CudaMemory,memory_traits_inte> > +using sgrid_dist_id_gpu = grid_dist_id<dim,St,T,Decomposition,Memory,SparseGridGpu<dim,T>>; + +template<unsigned int dim, typename St, typename T, typename Memory = CudaMemory, typename Decomposition = CartDecomposition<dim,St,CudaMemory,memory_traits_inte> > +using sgrid_dist_sid_gpu = grid_dist_id<dim,St,T,Decomposition,Memory,SparseGridGpu<dim,T,default_edge<dim>::type::value,default_edge<dim>::tb::value,int>>; #endif #endif diff --git a/src/Grid/grid_dist_id_comm.hpp b/src/Grid/grid_dist_id_comm.hpp index d9eefe479d0c1ca6dfb19c92547634cd37cbc3e2..5508aa8f1d5fc4ad356f6a01b989f4d72a824a61 100644 --- a/src/Grid/grid_dist_id_comm.hpp +++ b/src/Grid/grid_dist_id_comm.hpp @@ -12,6 +12,8 @@ #include "Grid/copy_grid_fast.hpp" #include "grid_dist_util.hpp" #include "util/common_pdata.hpp" +#include "lib/pdata.hpp" + /*! \brief Unpack selector * @@ -159,7 +161,7 @@ class grid_dist_id_comm openfpm::vector<size_t> send_size; //! receiving buffers in case of dynamic - openfpm::vector<BMemory<Memory>> recv_buffers; + openfpm::vector_fr<BMemory<Memory>> recv_buffers; struct rp_id { @@ -191,6 +193,11 @@ class grid_dist_id_comm openfpm::vector<void *> pointers; openfpm::vector<void *> pointers2; + //! header unpacker info + openfpm::vector_gpu<aggregate<void *,void *,int>> pointers_h; + int n_headers_slot = 1; + openfpm::vector_gpu<aggregate<size_t,size_t,unsigned int>> headers; + //! 
Receiving option size_t opt; @@ -437,9 +444,9 @@ class grid_dist_id_comm if (send_prc_queue.size() == 0) { - v_cl.sendrecvMultipleMessagesNBX(send_prc_queue.size(),NULL, - NULL,NULL, - receive_dynamic,this); + v_cl.sendrecvMultipleMessagesNBX(send_prc_queue.size(),NULL, + NULL,NULL, + receive_dynamic,this); } else { @@ -452,7 +459,7 @@ class grid_dist_id_comm recv_proc.sort(); - openfpm::vector<BMemory<Memory>> tmp; + openfpm::vector_fr<BMemory<Memory>> tmp; tmp.resize(recv_proc.size()); for (int i = 0 ; i < recv_proc.size() ; i++) @@ -577,7 +584,7 @@ class grid_dist_id_comm size_t sub_id = eg_box.get(ei).bid.get(le_id).sub; // sub-grid where to unpack - auto sub2 = loc_grid.get(sub_id).getIterator(box.getKP1(),box.getKP2()); + auto sub2 = loc_grid.get(sub_id).getIterator(box.getKP1(),box.getKP2(),false); rem_copy_opt opt_ = rem_copy_opt::NONE_OPT; if (opt & SKIP_LABELLING) @@ -605,7 +612,132 @@ class grid_dist_id_comm } } + template<typename mem, typename header_type,unsigned ... prp> + void unpack_data_to_ext_ghost_with_header(ExtPreAlloc<mem> & emem, + openfpm::vector<device_grid> & loc_grid, + header_type & headers, + size_t i, + const openfpm::vector<ep_box_grid<dim>> & eg_box, + const std::unordered_map<size_t,size_t> & g_id_to_external_ghost_box, + const openfpm::vector<e_box_multi<dim>> & eb_gid_list, + Unpack_stat & ps, + size_t opt) + { + // Unpack the ghost box global-id + + size_t g_id; + // we move from device to host the gid + g_id = headers.template get<0>(i); + ps.addOffset(sizeof(size_t)); + + size_t l_id = 0; + // convert the global id into local id + auto key = g_id_to_external_ghost_box.find(g_id); + + if (key != g_id_to_external_ghost_box.end()) // FOUND + {l_id = key->second;} + else + { + // NOT FOUND + + // It must be always found, if not it mean that the processor has no-idea of + // what is stored and conseguently do not know how to unpack, print a critical error + // and return + + std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << 
" Critical, cannot unpack object, because received data cannot be interpreted\n"; + + return; + } + + + // we unpack into the last eb_gid_list that is always big enought to + // unpack the information + + size_t le_id = eb_gid_list.get(l_id).full_match; + size_t ei = eb_gid_list.get(l_id).e_id; + + // Get the external ghost box associated with the packed information + Box<dim,long int> box = eg_box.get(ei).bid.get(le_id).l_e_box; + size_t sub_id = eg_box.get(ei).bid.get(le_id).sub; + + // sub-grid where to unpack + auto sub2 = loc_grid.get(sub_id).getIterator(box.getKP1(),box.getKP2(),false); + + rem_copy_opt opt_ = rem_copy_opt::NONE_OPT; + if (opt & SKIP_LABELLING) + {opt_ = rem_copy_opt::KEEP_GEOMETRY;} + // Unpack + loc_grid.get(sub_id).remove(box); + Unpacker<device_grid,mem>::template unpack_with_header<decltype(sub2),decltype(headers),decltype(v_cl.getmgpuContext()),prp...> + (emem, + sub2, + loc_grid.get(sub_id), + headers, + i, + ps, + v_cl.getmgpuContext(), + opt_); + + // Copy the information on the other grid + for (long int j = 0 ; j < (long int)eb_gid_list.get(l_id).eb_list.size() ; j++) + { + size_t nle_id = eb_gid_list.get(l_id).eb_list.get(j); + if (nle_id != le_id) + { +// size_t nle_id = eb_gid_list.get(l_id).eb_list.get(j); + size_t n_sub_id = eg_box.get(ei).bid.get(nle_id).sub; + + Box<dim,long int> box = eg_box.get(ei).bid.get(nle_id).l_e_box; + Box<dim,long int> rbox = eg_box.get(ei).bid.get(nle_id).lr_e_box; + + loc_grid.get(n_sub_id).remove(box); + loc_grid.get(n_sub_id).copy_to(loc_grid.get(sub_id),rbox,box); + } + } + } + + template<unsigned int ... 
prp> + void fill_headers(size_t opt) + { + if ((opt & KEEP_PROPERTIES) == 0 && device_grid::is_unpack_header_supported()) + { + headers.resize(n_headers_slot * recv_buffers.size()); + + Memory result; + result.allocate(sizeof(int)); + + pointers_h.resize(recv_buffers.size()); + + for ( size_t i = 0 ; i < recv_buffers.size() ; i++ ) + { + pointers_h.template get<0>(i) = recv_buffers.get(i).getDevicePointer(); + pointers_h.template get<1>(i) = (unsigned char *)recv_buffers.get(i).getDevicePointer() + recv_buffers.get(i).size(); + } + + pointers_h.template hostToDevice<0,1>(); + + while(1) + { + for ( size_t i = 0 ; i < recv_buffers.size() ; i++ ) + {pointers_h.template get<2>(i) = 0;} + pointers_h.template hostToDevice<2>(); + *(int *)result.getPointer() = 0; + result.hostToDevice(); + + device_grid::template unpack_headers<decltype(pointers_h),decltype(headers),decltype(result),prp ...>(pointers_h,headers,result,n_headers_slot); + result.deviceToHost(); + + if (*(int *)result.getPointer() == 0) {break;} + + n_headers_slot *= 2; + headers.resize(n_headers_slot * recv_buffers.size()); + + } + + headers.template deviceToHost<0,1,2>(); + } + } template<unsigned ... 
prp> void merge_received_data_get(openfpm::vector<device_grid> & loc_grid, @@ -642,22 +774,52 @@ class grid_dist_id_comm } else { - // Unpack the object - for ( size_t i = 0 ; i < recv_buffers.size() ; i++ ) + fill_headers<prp ...>(opt); + + if (headers.size() != 0) { - Unpack_stat ps; - size_t mark_here = ps.getOffset(); + // Unpack the object + for ( size_t i = 0 ; i < recv_buffers.size() ; i++ ) + { + Unpack_stat ps; + size_t mark_here = ps.getOffset(); - ExtPreAlloc<BMemory<Memory>> mem(recv_buffers.get(i).size(),recv_buffers.get(i)); + ExtPreAlloc<BMemory<Memory>> mem(recv_buffers.get(i).size(),recv_buffers.get(i)); - // for each external ghost box - while (ps.getOffset() - mark_here < recv_buffers.get(i).size()) + int j = 0; + + // for each external ghost box + while (ps.getOffset() - mark_here < recv_buffers.get(i).size()) + { + // Unpack the ghost box global-id + + unpack_data_to_ext_ghost_with_header<BMemory<Memory>,decltype(headers),prp ...>(mem,loc_grid,headers,i*n_headers_slot+j, + eg_box,g_id_to_external_ghost_box,eb_gid_list, + ps,opt); + + j++; + } + } + } + else + { + // Unpack the object + for ( size_t i = 0 ; i < recv_buffers.size() ; i++ ) { - // Unpack the ghost box global-id + Unpack_stat ps; + size_t mark_here = ps.getOffset(); - unpack_data_to_ext_ghost<BMemory<Memory>,prp ...>(mem,loc_grid,i, - eg_box,g_id_to_external_ghost_box,eb_gid_list, - ps,opt); + ExtPreAlloc<BMemory<Memory>> mem(recv_buffers.get(i).size(),recv_buffers.get(i)); + + // for each external ghost box + while (ps.getOffset() - mark_here < recv_buffers.get(i).size()) + { + // Unpack the ghost box global-id + + unpack_data_to_ext_ghost<BMemory<Memory>,prp ...>(mem,loc_grid,i, + eg_box,g_id_to_external_ghost_box,eb_gid_list, + ps,opt); + } } } } @@ -1060,6 +1222,11 @@ public: for (int i = 0 ; i < loc_grid.size() ; i++) {opt &= (loc_grid.get(i).isSkipLabellingPossible())?(int)-1:~SKIP_LABELLING;} + #ifdef ENABLE_GRID_DIST_ID_PERF_STATS + timer packing_time; + 
packing_time.start(); + #endif + if (!(opt & SKIP_LABELLING)) { // first we initialize the pack buffer on all internal grids @@ -1086,7 +1253,7 @@ public: // Pack a size_t for the internal ghost id Packer<size_t,Memory>::packRequest(req); // Create a sub grid iterator spanning the internal ghost layer - auto sub_it = loc_grid.get(sub_id).getIterator(g_ig_box.getKP1(),g_ig_box.getKP2()); + auto sub_it = loc_grid.get(sub_id).getIterator(g_ig_box.getKP1(),g_ig_box.getKP2(),false); // get the size to pack Packer<device_grid,Memory>::template packRequest<decltype(sub_it),prp...>(loc_grid.get(sub_id),sub_it,req); @@ -1142,7 +1309,7 @@ public: Packer<size_t,Memory>::pack(prAlloc_prp,g_id,sts); prAlloc_prp.hostToDevice(prAlloc_prp.getOffset(),prAlloc_prp.getOffsetEnd()); // Create a sub grid iterator spanning the internal ghost layer - auto sub_it = loc_grid.get(sub_id).getIterator(g_ig_box.getKP1(),g_ig_box.getKP2()); + auto sub_it = loc_grid.get(sub_id).getIterator(g_ig_box.getKP1(),g_ig_box.getKP2(),false); // and pack the internal ghost grid Packer<device_grid,Memory>::template pack<decltype(sub_it),prp...>(prAlloc_prp,loc_grid.get(sub_id),sub_it,sts); } @@ -1192,6 +1359,13 @@ public: delete &prAlloc_prp; } + #ifdef ENABLE_GRID_DIST_ID_PERF_STATS + packing_time.stop(); + tot_pack += packing_time.getwct(); + timer sendrecv_time; + sendrecv_time.start(); + #endif + for ( size_t i = 0 ; i < ig_box.size() ; i++ ) { // This function send (or queue for sending) the information @@ -1210,8 +1384,22 @@ public: queue_recv_data_get<prp_object>(eg_box,prp_recv,prRecv_prp); + #ifdef ENABLE_GRID_DIST_ID_PERF_STATS + sendrecv_time.stop(); + tot_sendrecv += sendrecv_time.getwct(); + timer merge_loc_time; + merge_loc_time.start(); + #endif + ghost_get_local<prp...>(loc_ig_box,loc_eg_box,gdb_ext,loc_grid,g_id_to_external_ghost_box,ginfo,use_bx_def,opt); + #ifdef ENABLE_GRID_DIST_ID_PERF_STATS + merge_loc_time.stop(); + tot_loc_merge += merge_loc_time.getwct(); + timer merge_time; + 
merge_time.start(); + #endif + for (size_t i = 0 ; i < loc_grid.size() ; i++) {loc_grid.get(i).removeAddUnpackReset();} @@ -1224,6 +1412,11 @@ public: for (size_t i = 0 ; i < loc_grid.size() ; i++) {loc_grid.get(i).template removeAddUnpackFinalize<prp ...>(v_cl.getmgpuContext(),opt_);} + #ifdef ENABLE_GRID_DIST_ID_PERF_STATS + merge_time.stop(); + tot_merge += merge_time.getwct(); + #endif + prRecv_prp.decRef(); delete &prRecv_prp; } diff --git a/src/Grid/grid_dist_util.hpp b/src/Grid/grid_dist_util.hpp index b254b6fd7c163bf5243a5eed0338d7c25cd630bd..508764d50e6557e2394f2295b8990d0c574c98f2 100644 --- a/src/Grid/grid_dist_util.hpp +++ b/src/Grid/grid_dist_util.hpp @@ -138,13 +138,13 @@ inline void create_gdb_ext(openfpm::vector<GBoxes<Decomposition::dims>> & gdb_ex SpaceBox<Decomposition::dims,long int> sp_t = cd_sm.convertDomainSpaceIntoGridUnits(sp,dec.periodicity()); SpaceBox<Decomposition::dims,long int> sp_tg = cd_sm.convertDomainSpaceIntoGridUnits(sp_g,dec.periodicity()); -/* for (size_t i = 0 ; i < Decomposition::dims ; i++) + for (size_t i = 0 ; i < Decomposition::dims ; i++) { if (sp_t.getLow(i) < sp_tg.getLow(i)) {sp_tg.setLow(i,sp_t.getLow(i));} if (sp_t.getHigh(i) > sp_tg.getHigh(i)) {sp_tg.setHigh(i,sp_t.getHigh(i));} - }*/ + } if (use_bx_def == true) { diff --git a/src/Grid/tests/grid_dist_id_HDF5_chckpnt_restart_test.cpp b/src/Grid/tests/grid_dist_id_HDF5_chckpnt_restart_test.cpp index 1174ad725b88e71adc2d2b7d11ba644c0af15a09..3c6fd85e8108d6ebb2a0ca0951396dab0afa3e79 100644 --- a/src/Grid/tests/grid_dist_id_HDF5_chckpnt_restart_test.cpp +++ b/src/Grid/tests/grid_dist_id_HDF5_chckpnt_restart_test.cpp @@ -145,6 +145,85 @@ BOOST_AUTO_TEST_CASE( grid_dist_id_hdf5_load_test ) } +BOOST_AUTO_TEST_CASE( grid_dist_id_hdf5_copy_test ) +{ + + // Input data + size_t k = 2400; + + float ghost_part = 0.01; + + // Domain + Box<2,float> domain({0.0,0.0},{1.0,1.0}); + + Vcluster<> & v_cl = create_vcluster(); + + // Skip this test on big scale + if 
(v_cl.getProcessingUnits() >= 32) + return; + + // grid size + size_t sz[2]; + sz[0] = k; + sz[1] = k; + + // Ghost + Ghost<2,float> g(ghost_part); + + // Distributed grid with id decomposition + grid_dist_id<2, float, aggregate<float>, CartDecomposition<2,float>> g_dist(sz,domain,g); + grid_dist_id<2, float, aggregate<float>, CartDecomposition<2,float>> g_dist_copy(g_dist.getDecomposition(),sz,g); + + g_dist.load("test_data/test_data_three.h5"); + + // Copy + + auto dom_sc = g_dist.getDomainIterator(); + auto dom_ds = g_dist_copy.getDomainIterator(); + while (dom_sc.isNext()) + { + auto key_sc = dom_sc.get(); + auto key_ds = dom_ds.get(); + g_dist_copy.template get<0>(key_ds) = g_dist.template get<0>(key_sc); + ++dom_sc; + ++dom_ds; + } + + + auto it = g_dist_copy.getDomainIterator(); + + size_t count = 0; + + bool match = true; + while (it.isNext()) + { + //key + auto key = it.get(); + + //BOOST_CHECK_CLOSE(g_dist.template get<0>(key),1,0.0001); + //std::cout << "Element: " << g_dist.template get<0>(key) << std::endl; + + auto keyg = g_dist_copy.getGKey(key); + + match &= g_dist_copy.template get<0>(key) == keyg.get(0); + + ++it; + count++; + } + + openfpm::vector<size_t> count_total; + v_cl.allGather(count,count_total); + v_cl.execute(); + + size_t sum = 0; + + for (size_t i = 0; i < count_total.size(); i++) + sum += count_total.get(i); + + BOOST_REQUIRE_EQUAL(sum, (size_t)k*k); + BOOST_REQUIRE_EQUAL(match,true); +} + BOOST_AUTO_TEST_CASE( grid_dist_id_hdf5_load_test_diff_proc ) { diff --git a/src/Grid/tests/grid_dist_id_unit_test.cpp b/src/Grid/tests/grid_dist_id_unit_test.cpp index d31a763ae0fdd69a190b43234280d1e93dfbb049..a6d4061e0c2be72e30cb247cf2c761c23f508915 100644 --- a/src/Grid/tests/grid_dist_id_unit_test.cpp +++ b/src/Grid/tests/grid_dist_id_unit_test.cpp @@ -1313,7 +1313,7 @@ void Test3D_periodic_put(const Box<3,float> & domain, long int k) periodicity<3> pr = {{PERIODIC,PERIODIC,PERIODIC}}; // Distributed grid with id decomposition - grid_dist_id<3, 
float, aggregate<long int>, CartDecomposition<3,float>> g_dist(sz,domain,g,pr); + grid_dist_id<3, float, aggregate<long int,double>, CartDecomposition<3,float>> g_dist(sz,domain,g,pr); // check the consistency of the decomposition bool val = g_dist.getDecomposition().check_consistency(); @@ -1332,6 +1332,7 @@ void Test3D_periodic_put(const Box<3,float> & domain, long int k) auto key = dom.get(); g_dist.template get<0>(key) = -6.0; + g_dist.template get<1>(key) = -6.0; // Count the points count++; @@ -1356,6 +1357,14 @@ void Test3D_periodic_put(const Box<3,float> & domain, long int k) g_dist.template get<0>(key.move(2,1)) += 1.0; g_dist.template get<0>(key.move(2,-1)) += 1.0; + + g_dist.template get<1>(key.move(0,1)) += 1.0; + g_dist.template get<1>(key.move(0,-1)) += 1.0; + g_dist.template get<1>(key.move(1,1)) += 1.0; + g_dist.template get<1>(key.move(1,-1)) += 1.0; + g_dist.template get<1>(key.move(2,1)) += 1.0; + g_dist.template get<1>(key.move(2,-1)) += 1.0; + ++dom; } } @@ -1375,12 +1384,14 @@ void Test3D_periodic_put(const Box<3,float> & domain, long int k) } g_dist.ghost_put<add_,0>(); + g_dist.ghost_put<add_,1>(); + if (count != 0) BOOST_REQUIRE_EQUAL(correct, false); // sync the ghosts - g_dist.ghost_get<0>(); + g_dist.ghost_get<0,1>(); correct = true; @@ -1392,6 +1403,7 @@ void Test3D_periodic_put(const Box<3,float> & domain, long int k) auto key = dom_gi2.get(); correct &= (g_dist.template get<0>(key) == 0); + correct &= (g_dist.template get<1>(key) == 0); ++dom_gi2; } @@ -2397,51 +2409,258 @@ BOOST_AUTO_TEST_CASE( grid_dist_domain_ghost_3D_put_create_check ) TestXD_ghost_put_create(sg_dist3,k); } + BOOST_AUTO_TEST_CASE( grid_dist_ghost_zero_size ) +{ + // Test grid periodic + + Box<3,double> domain({0,0,0},{365.376,365.376,102}); + + Vcluster<> & v_cl = create_vcluster(); + + if ( v_cl.getProcessingUnits() > 32 ) + {return;} + + BOOST_TEST_CHECKPOINT( "Testing grid zero ghost"); + + // grid size + size_t sz[3]; + sz[0] = 53; + sz[1] = 53; + sz[2] = 10; 
+ + // Ghost + Ghost<3,long int> g(0); + + // periodicity + periodicity<3> pr = {{NON_PERIODIC,NON_PERIODIC,NON_PERIODIC}}; + + // Distributed grid with id decomposition + grid_dist_id<3, double, aggregate<long int, int>> g_dist(sz,domain,g,pr); + + auto it = g_dist.getDomainIterator(); + + size_t count = 0; + + while (it.isNext()) + { + auto k = it.get(); + + ++count; + + ++it; + } + + v_cl.sum(count); + v_cl.execute(); + + BOOST_REQUIRE_EQUAL(count,53*53*10); +} + + +BOOST_AUTO_TEST_CASE(grid_dist_id_smb_write_out_1_proc) { // Test grid periodic + { + Box<2,float> domain({-1.0,-1.0,-1.0},{1.0,1.0,1.0}); - Box<3,float> domain({-1.0,-1.0,-1.0},{1.0,1.0,1.0}); + Vcluster<> & v_cl = create_vcluster(); - Vcluster<> & v_cl = create_vcluster(); + if ( v_cl.getProcessingUnits() > 1 ) + {return;} - if ( v_cl.getProcessingUnits() > 32 ) - {return;} + // grid size + size_t sz[2]; + sz[0] = 16; + sz[1] = 16; - BOOST_TEST_CHECKPOINT( "Testing grid zero ghost"); + // Ghost + Ghost<2,long int> g(0); - // grid size - size_t sz[3]; - sz[0] = 32; - sz[1] = 32; - sz[2] = 32; + // periodicity + periodicity<2> pr = {{NON_PERIODIC,NON_PERIODIC}}; - // Ghost - Ghost<3,long int> g(0); + typedef grid_cpu<2, aggregate<int>, grid_smb<2,4> > devg; - // periodicity - periodicity<3> pr = {{NON_PERIODIC,NON_PERIODIC,NON_PERIODIC}}; + // Distributed grid with id decomposition + grid_dist_id_devg<2, float, aggregate<int>,devg> g_smb(sz,domain,g,pr); - // Distributed grid with id decomposition - grid_dist_id<3, float, aggregate<long int, int>> g_dist(sz,domain,g,pr); + auto it = g_smb.getDomainIterator(); - auto it = g_dist.getDomainIterator(); + size_t count = 0; - size_t count = 0; + unsigned char * base = (unsigned char *)g_smb.get_loc_grid(0).getPointer<0>(); - while (it.isNext()) + while (it.isNext()) + { + auto k = it.get(); + + g_smb.template getProp<0>(k) = (unsigned char *)&g_smb.template getProp<0>(k) - base; + + ++count; + + ++it; + } + + v_cl.sum(count); + v_cl.execute(); + + 
BOOST_REQUIRE_EQUAL(count,16*16); + + g_smb.write("g_smb_out"); + } +} + +BOOST_AUTO_TEST_CASE(grid_dist_id_zmb_write_out_1_proc) +{ { - auto k = it.get(); + // Test grid periodic - ++count; + Box<2,float> domain({-1.0,-1.0,-1.0},{1.0,1.0,1.0}); - ++it; + Vcluster<> & v_cl = create_vcluster(); + + if ( v_cl.getProcessingUnits() > 1 ) + {return;} + + // grid size + size_t sz[2]; + sz[0] = 16; + sz[1] = 16; + + // Ghost + Ghost<2,long int> g(0); + + // periodicity + periodicity<2> pr = {{NON_PERIODIC,NON_PERIODIC}}; + + typedef grid_cpu<2, aggregate<int>, grid_zmb<2,4,long int> > devg; + + // Distributed grid with id decomposition + grid_dist_id_devg<2, float, aggregate<int>,devg> g_smb(sz,domain,g,pr); + + auto it = g_smb.getDomainIterator(); + + size_t count = 0; + + unsigned char * base = (unsigned char *)g_smb.get_loc_grid(0).getPointer<0>(); + + while (it.isNext()) + { + auto k = it.get(); + + g_smb.template getProp<0>(k) = (unsigned char *)&g_smb.template getProp<0>(k) - base; + + ++count; + + ++it; + } + + v_cl.sum(count); + v_cl.execute(); + + BOOST_REQUIRE_EQUAL(count,16*16); + + g_smb.write("g_zmb_out"); } - v_cl.sum(count); - v_cl.execute(); + { + Box<2,float> domain({-1.0,-1.0,-1.0},{1.0,1.0,1.0}); + + Vcluster<> & v_cl = create_vcluster(); + + if ( v_cl.getProcessingUnits() > 1 ) + {return;} + + // grid size + size_t sz[2]; + sz[0] = 16; + sz[1] = 16; + + // Ghost + Ghost<2,long int> g(0); + + // periodicity + periodicity<2> pr = {{NON_PERIODIC,NON_PERIODIC}}; + + typedef grid_cpu<2, aggregate<int>, grid_zm<2,void> > devg; + + // Distributed grid with id decomposition + grid_dist_id_devg<2, float, aggregate<int>,devg> g_smb(sz,domain,g,pr); + + auto it = g_smb.getDomainIterator(); + + size_t count = 0; + + unsigned char * base = (unsigned char *)g_smb.get_loc_grid(0).getPointer<0>(); + + while (it.isNext()) + { + auto k = it.get(); + + g_smb.template getProp<0>(k) = (unsigned char *)&g_smb.template getProp<0>(k) - base; - 
BOOST_REQUIRE_EQUAL(count,32*32*32); + ++count; + + ++it; + } + + v_cl.sum(count); + v_cl.execute(); + + BOOST_REQUIRE_EQUAL(count,16*16); + + g_smb.write("g_zm_out"); + } + + { + Box<2,float> domain({-1.0,-1.0,-1.0},{1.0,1.0,1.0}); + + Vcluster<> & v_cl = create_vcluster(); + + if ( v_cl.getProcessingUnits() > 1 ) + {return;} + + // grid size + size_t sz[2]; + sz[0] = 16; + sz[1] = 16; + + // Ghost + Ghost<2,long int> g(0); + + // periodicity + periodicity<2> pr = {{NON_PERIODIC,NON_PERIODIC}}; + + typedef grid_base<2, aggregate<int>> devg; + + // Distributed grid with id decomposition + grid_dist_id_devg<2, float, aggregate<int>,devg> g_smb(sz,domain,g,pr); + + auto it = g_smb.getDomainIterator(); + + size_t count = 0; + + unsigned char * base = (unsigned char *)g_smb.get_loc_grid(0).getPointer<0>(); + + while (it.isNext()) + { + auto k = it.get(); + + g_smb.template getProp<0>(k) = (unsigned char *)&g_smb.template getProp<0>(k) - base; + + ++count; + + ++it; + } + + v_cl.sum(count); + v_cl.execute(); + + BOOST_REQUIRE_EQUAL(count,16*16); + + g_smb.write("g_sm_out"); + } } BOOST_AUTO_TEST_CASE( grid_dist_copy_construct ) diff --git a/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu b/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu index 5a221b6b18574225952f5602b49d959e0fe36efb..e26743dfb8c54d8b0e91ae85fc3c4fe67d5fa636 100644 --- a/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu +++ b/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu @@ -1,5 +1,7 @@ +#include <ostream> #define BOOST_TEST_DYN_LINK +#include "config.h" #include <boost/test/unit_test.hpp> #include "Grid/grid_dist_id.hpp" @@ -15,7 +17,8 @@ struct insert_kernel2D sg.init(); - sg.template insert<p>(key) = c + keyg.get(0) + keyg.get(1); + if (inactive == false) + {sg.template insert<p>(key) = c + keyg.get(0) + keyg.get(1);} __syncthreads(); @@ -33,7 +36,8 @@ struct insert_kernel3D sg.init(); - sg.template insert<p>(key) = c + keyg.get(0) + keyg.get(1) + keyg.get(2); + if (inactive == false) + {sg.template 
insert<p>(key) = c + keyg.get(0) + keyg.get(1) + keyg.get(2);} __syncthreads(); @@ -41,15 +45,6 @@ struct insert_kernel3D } }; -template<unsigned int p> -struct stencil_kernel -{ - template<typename SparseGridGpu_type> - __device__ void operator()(SparseGridGpu_type & sg, ite_gpu<SparseGridGpu_type::d> & ite, float c) - { - // TODO - } -}; BOOST_AUTO_TEST_CASE( sgrid_gpu_test_base ) { @@ -212,12 +207,10 @@ void sgrid_ghost_get(size_t (& sz)[2],size_t (& sz2)[2]) gdist.template flush<smax_<0>>(flush_type::FLUSH_ON_DEVICE); gdist.template deviceToHost<0>(); - gdist.write_debug("before_ghost"); gdist.template ghost_get<0>(RUN_ON_DEVICE); gdist.template deviceToHost<0>(); - gdist.write_debug("after_ghost"); // Now we check that ghost is correct @@ -297,6 +290,8 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_ghost_get ) size_t sz6[2] = {15,15}; sgrid_ghost_get(sz,sz6); + return; + size_t sz2[2] = {170,170}; size_t sz3[2] = {15,15}; sgrid_ghost_get(sz2,sz3); @@ -306,6 +301,90 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_ghost_get ) } +BOOST_AUTO_TEST_CASE( sgrid_gpu_app_point_test ) +{ + size_t sz[3] = {75,75,75}; + periodicity<3> bc = {PERIODIC,PERIODIC,PERIODIC}; + + Ghost<3,long int> g(1); + + Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0}); + + sgrid_dist_id_gpu<3,float,aggregate<float,float,float,float>> gdist(sz,domain,g,bc); + + gdist.template setBackgroundValue<0>(666); + gdist.template setBackgroundValue<1>(666); + gdist.template setBackgroundValue<2>(666); + gdist.template setBackgroundValue<3>(666); + + /////// GPU insert + flush + + Box<3,size_t> box({1,1,1},{sz[0],sz[1],sz[2]}); + + /////// GPU Run kernel + + float c = 5.0; + + typedef typename GetAddBlockType<decltype(gdist)>::type InsertBlockT; + + CudaMemory cmem; + cmem.allocate(sizeof(int)); + CudaMemory cmem_out; + cmem_out.allocate(sizeof(int)); + + *(int *)cmem.getPointer() = 0.0; + *(int *)cmem_out.getPointer() = 0.0; + + cmem.hostToDevice(); + cmem_out.hostToDevice(); + + int * cnt = (int 
*)cmem.getDevicePointer(); + int * cnt_out = (int *)cmem_out.getDevicePointer(); + + Box<3,size_t> bx({23,23,23},{70,70,70}); + + gdist.addPoints(bx.getKP1(),bx.getKP2(), + [cnt,cnt_out,bx] __device__ (int i, int j, int k) + { + Point<3,int> p({i,j,k}); + + if (bx.isInside(p)) + {atomicAdd(cnt,1);} + else + { + printf("%d %d %d \n",i,j,k); + atomicAdd(cnt_out,1); + } + + return true; + }, + [c] __device__ (InsertBlockT & data, int i, int j, int k) + { + data.template get<0>() = c + i + j; + data.template get<1>() = c + 1000 + i + j; + } + ); + + gdist.template flush<smax_<0>,smax_<1>>(flush_type::FLUSH_ON_DEVICE); + gdist.template ghost_get<0,1>(RUN_ON_DEVICE); + + cmem.deviceToHost(); + cmem_out.deviceToHost(); + + int cnt_host = *(int *)cmem.getPointer(); + int cnt_host_out = *(int *)cmem_out.getPointer(); + + auto & v_cl = create_vcluster(); + + v_cl.sum(cnt_host_out); + v_cl.sum(cnt_host); + v_cl.execute(); + + BOOST_REQUIRE_EQUAL(cnt_host_out,0); + BOOST_REQUIRE_EQUAL(cnt_host,bx.getVolumeKey()); +} + + BOOST_AUTO_TEST_CASE( sgrid_gpu_test_conv2_test ) { size_t sz[2] = {164,164}; @@ -389,15 +468,17 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_conv2_test ) ++it3; } - gdist.write("SGRID"); - BOOST_REQUIRE_EQUAL(match,true); } BOOST_AUTO_TEST_CASE( sgrid_gpu_test_conv2_test_3d ) { + #ifdef CUDA_ON_CPU + size_t sz[3] = {20,20,20}; + #else size_t sz[3] = {60,60,60}; + #endif periodicity<3> bc = {PERIODIC,PERIODIC,PERIODIC}; Ghost<3,long int> g(1); diff --git a/src/Grid/tests/sgrid_dist_id_unit_tests.cpp b/src/Grid/tests/sgrid_dist_id_unit_tests.cpp index 4ebcdebabbb69a44c4a566ecb8ab78184a346176..1757dfa31ba7e6d3cb917d04cad2e9f95ad93c16 100644 --- a/src/Grid/tests/sgrid_dist_id_unit_tests.cpp +++ b/src/Grid/tests/sgrid_dist_id_unit_tests.cpp @@ -161,8 +161,6 @@ BOOST_AUTO_TEST_CASE( sgrid_dist_id_basic_test_2D) ++it; } - sg.write("sg_test_write"); - bool match = true; auto it2 = sg.getGridIterator(); @@ -614,7 +612,7 @@ BOOST_AUTO_TEST_CASE( 
sparse_grid_fast_stencil_vectorized_simplified_conv2_cross auto func = [uFactor,vFactor,deltaT,F,K](Vc::double_v & u_out,Vc::double_v & v_out, Vc::double_v & u,Vc::double_v & v, - cross_stencil_v & us,cross_stencil_v & vs, + cross_stencil_v<double> & us,cross_stencil_v<double> & vs, unsigned char * mask){ u_out = u + uFactor *(us.xm + us.xp + @@ -691,4 +689,250 @@ BOOST_AUTO_TEST_CASE( sparse_grid_fast_stencil_vectorized_simplified_conv2_cross BOOST_REQUIRE_EQUAL(match,true); } +BOOST_AUTO_TEST_CASE( sparse_grid_fast_stencil_vectorized_simplified_conv2_crossing_float) +{ + constexpr int U = 0; + constexpr int V = 1; + + constexpr int U_next = 2; + constexpr int V_next = 3; + + constexpr int x = 0; + constexpr int y = 1; + constexpr int z = 2; + + Box<3,float> domain({0.0,0.0,0.0},{2.5,2.5,2.5}); + + // grid size + size_t sz[3] = {32,32,32}; + + // Define periodicity of the grid + periodicity<3> bc = {PERIODIC,PERIODIC,PERIODIC}; + + // Ghost in grid unit + Ghost<3,long int> g(1); + + // deltaT + float deltaT = 1; + + // Diffusion constant for specie U + float du = 2*1e-5; + + // Diffusion constant for specie V + float dv = 1*1e-5; + + // Number of timesteps + size_t timeSteps = 5000; + + // K and F (Physical constant in the equation) + float K = 0.053; + float F = 0.014; + + sgrid_dist_soa<3, float, aggregate<float,float,float,float>> grid(sz,domain,g,bc); + + auto it = grid.getGridIterator(); + + while (it.isNext()) + { + // Get the local grid key + auto key = it.get_dist(); + + // Old values U and V + grid.template insert<U>(key) = 1.0; + grid.template insert<V>(key) = 0.0; + + // Old values U and V + grid.template insert<U_next>(key) = 0.0; + grid.template insert<V_next>(key) = 0.0; + + ++it; + } + + long int x_start = grid.size(0)*1.55f/domain.getHigh(0); + long int y_start = grid.size(1)*1.55f/domain.getHigh(1); + long int z_start = grid.size(1)*1.55f/domain.getHigh(2); + + long int x_stop = grid.size(0)*1.85f/domain.getHigh(0); + long int y_stop = 
grid.size(1)*1.85f/domain.getHigh(1); + long int z_stop = grid.size(1)*1.85f/domain.getHigh(2); + + grid_key_dx<3> start({x_start,y_start,z_start}); + grid_key_dx<3> stop ({x_stop,y_stop,z_stop}); + auto it_init = grid.getGridIterator(start,stop); + + while (it_init.isNext()) + { + auto key = it_init.get_dist(); + + grid.template insert<U>(key) = 0.5 + (((float)std::rand())/RAND_MAX -0.5)/10.0; + grid.template insert<V>(key) = 0.25 + (((float)std::rand())/RAND_MAX -0.5)/20.0; + + ++it_init; + } + + // spacing of the grid on x and y + float spacing[3] = {grid.spacing(0),grid.spacing(1),grid.spacing(2)}; + // sync the ghost + size_t count = 0; + grid.template ghost_get<U,V>(); + + // because we assume that spacing[x] == spacing[y] we use formula 2 + // and we calculate the prefactor of Eq 2 + float uFactor = deltaT * du/(spacing[x]*spacing[x]); + float vFactor = deltaT * dv/(spacing[x]*spacing[x]); + + + //! \cond [stencil get and use] \endcond + + + auto func = [uFactor,vFactor,deltaT,F,K](Vc::float_v & u_out,Vc::float_v & v_out, + Vc::float_v & u,Vc::float_v & v, + cross_stencil_v<float> & us,cross_stencil_v<float> & vs, + unsigned char * mask){ + + u_out = u + uFactor *(us.xm + us.xp + + us.ym + us.yp + + us.zm + us.zp - 6.0f*u) - deltaT * u*v*v + - deltaT * F * (u - 1.0f); + + v_out = v + vFactor *(vs.xm + vs.xp + + vs.ym + vs.yp + + vs.zm + vs.zp - 6.0f*v) + deltaT * u*v*v + - deltaT * (F+K) * v; + }; + + grid.conv_cross2<U,V,U_next,V_next,1>({0,0,0},{(long int)sz[0]-1,(long int)sz[1]-1,(long int)sz[2]-1},func); + grid.conv_cross2<U,V,U_next,V_next,1>({0,0,0},{(long int)sz[0]-1,(long int)sz[1]-1,(long int)sz[2]-1},func); + + bool match = true; + + { + auto it = grid.getDomainIterator(); + + float max_U = 0.0; + float max_V = 0.0; + grid_dist_key_dx<3> k_max; + while (it.isNext()) + { + // center point + auto Cp = it.get(); + + // plus,minus X,Y,Z + auto mx = Cp.move(0,-1); + auto px = Cp.move(0,+1); + auto my = Cp.move(1,-1); + auto py = Cp.move(1,1); + auto mz 
= Cp.move(2,-1); + auto pz = Cp.move(2,1); + + // update based on Eq 2 + if ( fabs(grid.get<U>(Cp) + uFactor * ( + grid.get<U>(mz) + + grid.get<U>(pz) + + grid.get<U>(my) + + grid.get<U>(py) + + grid.get<U>(mx) + + grid.get<U>(px) - + 6.0*grid.get<U>(Cp)) + + - deltaT * grid.get<U>(Cp) * grid.get<V>(Cp) * grid.get<V>(Cp) + + - deltaT * F * (grid.get<U>(Cp) - 1.0) - grid.get<U_next>(Cp)) > 0.00001 ) + { + match = false; + break; + } + + // update based on Eq 2 + if ( fabs(grid.get<V>(Cp) + vFactor * ( + grid.get<V>(mz) + + grid.get<V>(pz) + + grid.get<V>(my) + + grid.get<V>(py) + + grid.get<V>(mx) + + grid.get<V>(px) - + 6*grid.get<V>(Cp)) + + deltaT * grid.get<U>(Cp) * grid.get<V>(Cp) * grid.get<V>(Cp) + + - deltaT * (F+K) * grid.get<V>(Cp) - grid.get<V_next>(Cp)) > 0.00001 ) + { + match = false; + break; + } + + ++it; + } + } + + BOOST_REQUIRE_EQUAL(match,true); +} + + +BOOST_AUTO_TEST_CASE (sgrid_dist_id_soa_write ) +{ + periodicity<3> bc = {PERIODIC, PERIODIC, PERIODIC}; + + auto & v_cl = create_vcluster<>(); + + if (v_cl.size() > 16) + {return;} + + // Domain + Box<3,double> domain({-0.3,-0.3,-0.3},{1.0,1.0,1.0}); + + // grid size + size_t sz[3]; + sz[0] = 256; + sz[1] = 256; + sz[2] = 256; + + // Ghost + Ghost<3,long int> g(1); + + sgrid_dist_soa<3,double,aggregate<double,double[3]>> sg1(sz,domain,g,bc); + sgrid_dist_id<3,double,aggregate<double,double[3]>> sg2(sg1.getDecomposition(),sz,g); + + // create a grid iterator over a bilion point + + auto it = sg1.getGridIterator(); + + while(it.isNext()) + { + auto gkey = it.get(); + auto key = it.get_dist(); + + size_t sx = gkey.get(0) - 128; + size_t sy = gkey.get(1) - 128; + size_t sz = gkey.get(2) - 128; + + if (sx*sx + sy*sy + sz*sz < 32*32) + { + sg1.template insert<0>(key) = 1.0; + sg1.template insert<1>(key)[0] = gkey.get(0); + sg1.template insert<1>(key)[1] = gkey.get(1); + sg1.template insert<1>(key)[2] = gkey.get(2); + + sg2.template insert<0>(key) = 1.0; + sg2.template insert<1>(key)[0] = gkey.get(0); + 
sg2.template insert<1>(key)[1] = gkey.get(1); + sg2.template insert<1>(key)[2] = gkey.get(2); + } + + ++it; + } + + sg1.write("sg1_test"); + sg2.write("sg2_test"); + + bool test = compare("sg1_test_" + std::to_string(v_cl.rank()) + ".vtk","sg2_test_" + std::to_string(v_cl.rank()) + ".vtk"); + BOOST_REQUIRE_EQUAL(true,test); + + sg1.save("hdf5_w1_test"); + sg2.save("hdf5_w2_test"); + + // To uncomment and check +// sgrid_dist_soa<3,double,aggregate<double,double[3]>> sg1_(sz,domain,g,bc); +// sgrid_dist_id<3,double,aggregate<double,double[3]>> sg2_(sg1.getDecomposition(),sz,g); + +// sg1.load("hdf5_w1_test"); +// sg2.load("hdf5_w2_test"); +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/src/SubdomainGraphNodes.hpp b/src/SubdomainGraphNodes.hpp index ec60446f05bad61df3b6f403da70c3ffc40e45a5..140b4a0854ea2f71b0951d9a98093e9091053d4d 100755 --- a/src/SubdomainGraphNodes.hpp +++ b/src/SubdomainGraphNodes.hpp @@ -3,7 +3,7 @@ #include <boost/fusion/container/vector.hpp> #include <boost/fusion/include/at_c.hpp> -#include "Grid/Encap.hpp" +#include "memory_ly/Encap.hpp" /* In a decomposition graph each node represent a sub-domain while an edge represent * an interaction between sub-domain (it mean that they have to communicate). 
diff --git a/src/Vector/cuda/vector_dist_comm_util_funcs.cuh b/src/Vector/cuda/vector_dist_comm_util_funcs.cuh index cc1175f8ca48a9e905eceb5fe30e8bdfabb985bf..b1322c099d0fddd6dad79cbdd2a6394877a47efb 100644 --- a/src/Vector/cuda/vector_dist_comm_util_funcs.cuh +++ b/src/Vector/cuda/vector_dist_comm_util_funcs.cuh @@ -28,22 +28,19 @@ struct labelParticlesGhost_impl Decomposition & dec, openfpm::vector<aggregate<unsigned int,unsigned long int>, CudaMemory, - typename memory_traits_inte<aggregate<unsigned int,unsigned long int>>::type, memory_traits_inte> & g_opart_device, openfpm::vector<aggregate<unsigned int>, Memory, - typename layout_base<aggregate<unsigned int>>::type, layout_base> & proc_id_out, openfpm::vector<aggregate<unsigned int>, Memory, - typename layout_base<aggregate<unsigned int>>::type, layout_base> & starts, Vcluster<Memory> & v_cl, - openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos, - openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp, + openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, + openfpm::vector<prop,Memory,layout_base> & v_prp, openfpm::vector<size_t> & prc, openfpm::vector<size_t> & prc_sz, - openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & prc_offset, + openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_offset, size_t & g_m, size_t opt) { @@ -60,22 +57,19 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,tru Decomposition & dec, openfpm::vector<aggregate<unsigned int,unsigned long int>, CudaMemory, - typename memory_traits_inte<aggregate<unsigned int,unsigned long int>>::type, memory_traits_inte> & g_opart_device, openfpm::vector<aggregate<unsigned int>, Memory, - typename layout_base<aggregate<unsigned int>>::type, layout_base> & proc_id_out, openfpm::vector<aggregate<unsigned int>, Memory, - 
typename layout_base<aggregate<unsigned int>>::type, layout_base> & starts, Vcluster<Memory> & v_cl, - openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos, - openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp, + openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, + openfpm::vector<prop,Memory,layout_base> & v_prp, openfpm::vector<size_t> & prc, openfpm::vector<size_t> & prc_sz, - openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & prc_offset, + openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_offset, size_t & g_m, size_t opt) { @@ -118,7 +112,7 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,tru dec.toKernel(),v_pos.toKernel(),starts.toKernel(),g_opart_device.toKernel()); // sort particles - mergesort((int *)g_opart_device.template getDeviceBuffer<0>(),(long unsigned int *)g_opart_device.template getDeviceBuffer<1>(), g_opart_device.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext()); + openfpm::sort((int *)g_opart_device.template getDeviceBuffer<0>(),(long unsigned int *)g_opart_device.template getDeviceBuffer<1>(), g_opart_device.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext()); mem.allocate(sizeof(int)); mem.fill(0); @@ -153,7 +147,7 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,tru prc_offset.template hostToDevice<0,1>(prc_offset.size()-1,prc_offset.size()-1); // Here we reorder the offsets in ascending order - mergesort((int *)prc_offset.template getDeviceBuffer<0>(),(int *)prc_offset.template getDeviceBuffer<1>(), prc_offset.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext()); + openfpm::sort((int *)prc_offset.template getDeviceBuffer<0>(),(int *)prc_offset.template getDeviceBuffer<1>(), prc_offset.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext()); 
prc_offset.template deviceToHost<0,1>(); @@ -185,10 +179,10 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,tru template<bool with_pos,unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base, bool is_ok_cuda> struct local_ghost_from_opart_impl { - static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & o_part_loc, - const openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & shifts, - openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos, - openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp, + static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc, + const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts, + openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, + openfpm::vector<prop,Memory,layout_base> & v_prp, size_t opt) { std::cout << __FILE__ << ":" << __LINE__ << " error, you are trying to use using Cuda functions for a non cuda enabled data-structures" << std::endl; @@ -198,10 +192,10 @@ struct local_ghost_from_opart_impl template<bool with_pos, unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base> struct local_ghost_from_opart_impl<with_pos,dim,St,prop,Memory,layout_base,true> { - static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & o_part_loc, - const openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & shifts, - openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos, - openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp, + static void 
run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc, + const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts, + openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, + openfpm::vector<prop,Memory,layout_base> & v_prp, size_t opt) { #if defined(CUDA_GPU) && defined(__NVCC__) @@ -234,14 +228,14 @@ struct local_ghost_from_opart_impl<with_pos,dim,St,prop,Memory,layout_base,true> template<unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base, bool is_ok_cuda> struct local_ghost_from_dec_impl { - static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & o_part_loc, - const openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & shifts, - openfpm::vector<Box<dim, St>,Memory,typename layout_base<Box<dim,St>>::type,layout_base> & box_f_dev, - openfpm::vector<aggregate<unsigned int>,Memory,typename layout_base<aggregate<unsigned int>>::type,layout_base> & box_f_sv, + static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc, + const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts, + openfpm::vector<Box<dim, St>,Memory,layout_base> & box_f_dev, + openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & box_f_sv, Vcluster<Memory> & v_cl, - openfpm::vector<aggregate<unsigned int>,Memory,typename layout_base<aggregate<unsigned int>>::type,layout_base> & starts, - openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos, - openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp, + openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & starts, + openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, + openfpm::vector<prop,Memory,layout_base> & v_prp, size_t & g_m, size_t opt) { @@ -253,14 +247,14 @@ 
struct local_ghost_from_dec_impl template<unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base> struct local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,true> { - static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & o_part_loc, - const openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & shifts, - openfpm::vector<Box<dim, St>,Memory,typename layout_base<Box<dim,St>>::type,layout_base> & box_f_dev, - openfpm::vector<aggregate<unsigned int>,Memory,typename layout_base<aggregate<unsigned int>>::type,layout_base> & box_f_sv, + static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & o_part_loc, + const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts, + openfpm::vector<Box<dim, St>,Memory,layout_base> & box_f_dev, + openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & box_f_sv, Vcluster<Memory> & v_cl, - openfpm::vector<aggregate<unsigned int>,Memory,typename layout_base<aggregate<unsigned int>>::type,layout_base> & starts, - openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos, - openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp, + openfpm::vector<aggregate<unsigned int>,Memory,layout_base> & starts, + openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, + openfpm::vector<prop,Memory,layout_base> & v_prp, size_t & g_m, size_t opt) { diff --git a/src/Vector/cuda/vector_dist_cuda_func_test.cu b/src/Vector/cuda/vector_dist_cuda_func_test.cu index fc9c28eea927344c47d5c289823a429e10f159ab..b0f330f6aa6c4bde9204b051ee6984fa8a5017e7 100644 --- a/src/Vector/cuda/vector_dist_cuda_func_test.cu +++ b/src/Vector/cuda/vector_dist_cuda_func_test.cu @@ -8,8 +8,7 @@ #include "Vector/cuda/vector_dist_cuda_funcs.cuh" #include 
"Vector/util/vector_dist_funcs.hpp" #include "Decomposition/CartDecomposition.hpp" -#include "util/cuda/scan_cuda.cuh" -#include "util/cuda/moderngpu/kernel_scan.hxx" +//#include "util/cuda/scan_cuda.cuh" #include "Vector/vector_dist.hpp" #include "util/cuda/scan_ofp.cuh" @@ -31,9 +30,9 @@ BOOST_AUTO_TEST_CASE( vector_ghost_process_local_particles ) for (size_t i = 0 ; i < v_prp.size() ; i++) { - v_pos.template get<0>(i)[0] = (float)rand()/RAND_MAX; - v_pos.template get<0>(i)[1] = (float)rand()/RAND_MAX; - v_pos.template get<0>(i)[2] = (float)rand()/RAND_MAX; + v_pos.template get<0>(i)[0] = (float)rand()/(float)RAND_MAX; + v_pos.template get<0>(i)[1] = (float)rand()/(float)RAND_MAX; + v_pos.template get<0>(i)[2] = (float)rand()/(float)RAND_MAX; v_prp.template get<0>(i) = i+12345; @@ -104,9 +103,9 @@ BOOST_AUTO_TEST_CASE( vector_ghost_process_local_particles ) v_prp.hostToDevice<0,1,2>(); // label particle processor - num_shift_ghost_each_part<3,float,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())> - <<<ite.wthr,ite.thr>>> - (box_f_dev.toKernel(),box_f_sv.toKernel(),v_pos.toKernel(),o_part_loc.toKernel(),v_pos.size()); + CUDA_LAUNCH_DIM3((num_shift_ghost_each_part<3,float,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())>), + ite.wthr,ite.thr, + box_f_dev.toKernel(),box_f_sv.toKernel(),v_pos.toKernel(),o_part_loc.toKernel(),v_pos.size()); o_part_loc.deviceToHost<0>(); @@ -137,7 +136,7 @@ BOOST_AUTO_TEST_CASE( vector_ghost_process_local_particles ) starts.deviceToHost<0>(starts.size()-1,starts.size()-1); size_t tot = starts.template get<0>(o_part_loc.size()-1); - openfpm::vector<Point<3,float>,CudaMemory,typename memory_traits_inte<Point<3,float>>::type,memory_traits_inte> shifts; + openfpm::vector<Point<3,float>,CudaMemory,memory_traits_inte> shifts; shifts.resize(4); @@ -165,12 +164,12 @@ BOOST_AUTO_TEST_CASE( 
vector_ghost_process_local_particles ) openfpm::vector_gpu<aggregate<unsigned int,unsigned int>> o_part_loc2; o_part_loc2.resize(tot); - shift_ghost_each_part<3,float,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()), + CUDA_LAUNCH_DIM3((shift_ghost_each_part<3,float,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()), decltype(v_pos.toKernel()),decltype(v_prp.toKernel()), decltype(starts.toKernel()),decltype(shifts.toKernel()), - decltype(o_part_loc2.toKernel())> - <<<ite.wthr,ite.thr>>> - (box_f_dev.toKernel(),box_f_sv.toKernel(), + decltype(o_part_loc2.toKernel())>), + ite.wthr,ite.thr, + box_f_dev.toKernel(),box_f_sv.toKernel(), v_pos.toKernel(),v_prp.toKernel(), starts.toKernel(),shifts.toKernel(),o_part_loc2.toKernel(),old,old); @@ -356,9 +355,9 @@ BOOST_AUTO_TEST_CASE( vector_ghost_process_local_particles ) ite = o_part_loc2.getGPUIterator(); - process_ghost_particles_local<true,3,decltype(o_part_loc2.toKernel()),decltype(v_pos2.toKernel()),decltype(v_prp2.toKernel()),decltype(shifts.toKernel())> - <<<ite.wthr,ite.thr>>> - (o_part_loc2.toKernel(),v_pos2.toKernel(),v_prp2.toKernel(),shifts.toKernel(),old); + CUDA_LAUNCH_DIM3((process_ghost_particles_local<true,3,decltype(o_part_loc2.toKernel()),decltype(v_pos2.toKernel()),decltype(v_prp2.toKernel()),decltype(shifts.toKernel())>), + ite.wthr,ite.thr, + o_part_loc2.toKernel(),v_pos2.toKernel(),v_prp2.toKernel(),shifts.toKernel(),old); v_pos2.template deviceToHost<0>(); v_prp2.template deviceToHost<0,1,2>(); @@ -397,7 +396,7 @@ BOOST_AUTO_TEST_CASE( vector_ghost_fill_send_buffer_test ) typedef object<typename object_creator<typename prop::type, 0,1,2>::type> prp_object; // send vector for each processor - typedef openfpm::vector<prp_object,CudaMemory,typename memory_traits_inte<prp_object>::type,memory_traits_inte> send_vector; + typedef openfpm::vector<prp_object,CudaMemory,memory_traits_inte> send_vector; openfpm::vector<send_vector> g_send_prp; @@ -462,9 +461,9 @@ BOOST_AUTO_TEST_CASE( 
vector_ghost_fill_send_buffer_test ) auto ite = g_send_prp.get(i).getGPUIterator(); - process_ghost_particles_prp<decltype(g_opart_device.toKernel()),decltype(g_send_prp.get(i).toKernel()),decltype(v_prp.toKernel()),0,1,2> - <<<ite.wthr,ite.thr>>> - (g_opart_device.toKernel(), g_send_prp.get(i).toKernel(), + CUDA_LAUNCH_DIM3((process_ghost_particles_prp<decltype(g_opart_device.toKernel()),decltype(g_send_prp.get(i).toKernel()),decltype(v_prp.toKernel()),0,1,2>), + ite.wthr,ite.thr, + g_opart_device.toKernel(), g_send_prp.get(i).toKernel(), v_prp.toKernel(),offset); offset += g_send_prp.get(i).size(); @@ -561,9 +560,9 @@ BOOST_AUTO_TEST_CASE( decomposition_ie_ghost_gpu_test_use ) for (size_t j = 0 ; j < n_part ; j++) { - vg.template get<0>(k*n_part+j)[0] = (sp.getHigh(0) - sp.getLow(0))*((float)rand()/RAND_MAX) + sp.getLow(0); - vg.template get<0>(k*n_part+j)[1] = (sp.getHigh(1) - sp.getLow(1))*((float)rand()/RAND_MAX) + sp.getLow(1); - vg.template get<0>(k*n_part+j)[2] = (sp.getHigh(2) - sp.getLow(2))*((float)rand()/RAND_MAX) + sp.getLow(2); + vg.template get<0>(k*n_part+j)[0] = (sp.getHigh(0) - sp.getLow(0))*((float)rand()/(float)RAND_MAX) + sp.getLow(0); + vg.template get<0>(k*n_part+j)[1] = (sp.getHigh(1) - sp.getLow(1))*((float)rand()/(float)RAND_MAX) + sp.getLow(1); + vg.template get<0>(k*n_part+j)[2] = (sp.getHigh(2) - sp.getLow(2))*((float)rand()/(float)RAND_MAX) + sp.getLow(2); } } @@ -578,9 +577,9 @@ BOOST_AUTO_TEST_CASE( decomposition_ie_ghost_gpu_test_use ) proc_id_out.template get<0>(proc_id_out.size()-1) = 0; proc_id_out.template hostToDevice(proc_id_out.size()-1,proc_id_out.size()-1); - num_proc_ghost_each_part<3,float,decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(proc_id_out.toKernel())> - <<<ite.wthr,ite.thr>>> - (dec.toKernel(),vg.toKernel(),proc_id_out.toKernel()); + CUDA_LAUNCH_DIM3((num_proc_ghost_each_part<3,float,decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(proc_id_out.toKernel())>), + ite.wthr,ite.thr, + 
dec.toKernel(),vg.toKernel(),proc_id_out.toKernel()); proc_id_out.deviceToHost<0>(); @@ -603,7 +602,6 @@ BOOST_AUTO_TEST_CASE( decomposition_ie_ghost_gpu_test_use ) openfpm::vector<aggregate<unsigned int>, CudaMemory, - typename memory_traits_inte<aggregate<unsigned int>>::type, memory_traits_inte> starts; starts.resize(proc_id_out.size()); @@ -618,7 +616,6 @@ BOOST_AUTO_TEST_CASE( decomposition_ie_ghost_gpu_test_use ) openfpm::vector<aggregate<unsigned int,long unsigned int>, CudaMemory, - typename memory_traits_inte<aggregate<unsigned int,long unsigned int>>::type, memory_traits_inte> output; output.resize(sz); @@ -626,9 +623,9 @@ BOOST_AUTO_TEST_CASE( decomposition_ie_ghost_gpu_test_use ) ite = vg.getGPUIterator(); // we compute processor id for each particle - proc_label_id_ghost<3,float,decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(starts.toKernel()),decltype(output.toKernel())> - <<<ite.wthr,ite.thr>>> - (dec.toKernel(),vg.toKernel(),starts.toKernel(),output.toKernel()); + CUDA_LAUNCH_DIM3((proc_label_id_ghost<3,float,decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(starts.toKernel()),decltype(output.toKernel())>), + ite.wthr,ite.thr, + dec.toKernel(),vg.toKernel(),starts.toKernel(),output.toKernel()); output.template deviceToHost<0,1>(); @@ -724,9 +721,9 @@ BOOST_AUTO_TEST_CASE( decomposition_to_gpu_test_use ) for (size_t i = 0 ; i < 10000 ; i++) { - vg.template get<0>(i)[0] = (float)rand()/RAND_MAX; - vg.template get<0>(i)[1] = (float)rand()/RAND_MAX; - vg.template get<0>(i)[2] = (float)rand()/RAND_MAX; + vg.template get<0>(i)[0] = (float)rand()/(float)RAND_MAX; + vg.template get<0>(i)[1] = (float)rand()/(float)RAND_MAX; + vg.template get<0>(i)[2] = (float)rand()/(float)RAND_MAX; } vg.hostToDevice<0>(); @@ -739,14 +736,14 @@ BOOST_AUTO_TEST_CASE( decomposition_to_gpu_test_use ) proc_id_out.resize(vg.size()); openfpm::vector_gpu<aggregate<int,int,int>> dev_counter; - dev_counter.resize(10); + dev_counter.resize(v_cl.size()); 
dev_counter.fill<0>(0); dev_counter.fill<1>(0); dev_counter.fill<2>(0); - process_id_proc_each_part<3,float,decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(proc_id_out.toKernel()),decltype(dev_counter.toKernel())> - <<<ite.wthr,ite.thr>>> - (dec.toKernel(),vg.toKernel(),proc_id_out.toKernel(),dev_counter.toKernel(),v_cl.rank()); + CUDA_LAUNCH_DIM3((process_id_proc_each_part<3,float,decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(proc_id_out.toKernel()),decltype(dev_counter.toKernel())>), + ite.wthr,ite.thr, + dec.toKernel(),vg.toKernel(),proc_id_out.toKernel(),dev_counter.toKernel(),v_cl.rank()); proc_id_out.deviceToHost<0>(); @@ -834,7 +831,7 @@ BOOST_AUTO_TEST_CASE(vector_dist_reorder_lbl) auto ite = lbl_p.getGPUIterator(); - reorder_lbl<decltype(lbl_p.toKernel()),decltype(starts.toKernel())><<<ite.wthr,ite.thr>>>(lbl_p.toKernel(),starts.toKernel()); + CUDA_LAUNCH_DIM3((reorder_lbl<decltype(lbl_p.toKernel()),decltype(starts.toKernel())>),ite.wthr,ite.thr,lbl_p.toKernel(),starts.toKernel()); starts.template deviceToHost<0>(); lbl_p.template deviceToHost<0,1,2>(); @@ -909,7 +906,7 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort) auto ite = v_pos.getGPUIterator(); - merge_sort_part<false,decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(ns_to_s.toKernel()),0><<<ite.wthr,ite.thr>>>(v_pos.toKernel(),v_prp.toKernel(), + CUDA_LAUNCH_DIM3((merge_sort_part<false,decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(ns_to_s.toKernel()),0>),ite.wthr,ite.thr,v_pos.toKernel(),v_prp.toKernel(), v_pos_out.toKernel(),v_prp_out.toKernel(), ns_to_s.toKernel()); @@ -933,7 +930,7 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort) BOOST_REQUIRE_EQUAL(match,true); - merge_sort_part<false,decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(ns_to_s.toKernel()),1,2><<<ite.wthr,ite.thr>>>(v_pos.toKernel(),v_prp.toKernel(), + 
CUDA_LAUNCH_DIM3((merge_sort_part<false,decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(ns_to_s.toKernel()),1,2>),ite.wthr,ite.thr,v_pos.toKernel(),v_prp.toKernel(), v_pos_out.toKernel(),v_prp_out.toKernel(), ns_to_s.toKernel()); @@ -961,7 +958,7 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort) BOOST_REQUIRE_EQUAL(match,true); - merge_sort_part<true,decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(ns_to_s.toKernel())><<<ite.wthr,ite.thr>>>(v_pos.toKernel(),v_prp.toKernel(), + CUDA_LAUNCH_DIM3((merge_sort_part<true,decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(ns_to_s.toKernel())>),ite.wthr,ite.thr,v_pos.toKernel(),v_prp.toKernel(), v_pos_out.toKernel(),v_prp_out.toKernel(), ns_to_s.toKernel()); @@ -997,8 +994,8 @@ BOOST_AUTO_TEST_CASE(vector_dist_gpu_map_fill_send_buffer_test) { openfpm::vector_gpu<aggregate<int,int>> m_opart; - openfpm::vector<openfpm::vector<Point<3,float>,CudaMemory,typename memory_traits_inte<Point<3,float>>::type,memory_traits_inte,openfpm::grow_policy_identity>> m_pos; - openfpm::vector<openfpm::vector<aggregate<float,float[2],float[3][3]>,CudaMemory,typename memory_traits_inte<aggregate<float,float[2],float[3][3]>>::type,memory_traits_inte,openfpm::grow_policy_identity>> m_prp; + openfpm::vector<openfpm::vector<Point<3,float>,CudaMemory,memory_traits_inte,openfpm::grow_policy_identity>> m_pos; + openfpm::vector<openfpm::vector<aggregate<float,float[2],float[3][3]>,CudaMemory,memory_traits_inte,openfpm::grow_policy_identity>> m_prp; openfpm::vector_gpu<Point<3,float>> v_pos; openfpm::vector_gpu<aggregate<float,float[2],float[3][3]>> v_prp; @@ -1011,22 +1008,22 @@ BOOST_AUTO_TEST_CASE(vector_dist_gpu_map_fill_send_buffer_test) for (size_t i = 0 ; i < v_pos.size() ; i++) { - v_pos.template get<0>(i)[0] = (float)rand()/RAND_MAX; - v_pos.template get<0>(i)[1] = (float)rand()/RAND_MAX; - v_pos.template get<0>(i)[2] = (float)rand()/RAND_MAX; - - v_prp.template get<0>(i) = 5.0 + (float)rand()/RAND_MAX; - 
v_prp.template get<1>(i)[0] = 10.0 + (float)rand()/RAND_MAX; - v_prp.template get<1>(i)[1] = 11.0 + (float)rand()/RAND_MAX; - v_prp.template get<2>(i)[0][0] = 40.0 + (float)rand()/RAND_MAX; - v_prp.template get<2>(i)[0][1] = 50.0 + (float)rand()/RAND_MAX; - v_prp.template get<2>(i)[0][2] = 60.0 + (float)rand()/RAND_MAX; - v_prp.template get<2>(i)[1][0] = 70.0 + (float)rand()/RAND_MAX; - v_prp.template get<2>(i)[1][1] = 80.0 + (float)rand()/RAND_MAX; - v_prp.template get<2>(i)[1][2] = 150.0 + (float)rand()/RAND_MAX; - v_prp.template get<2>(i)[2][0] = 160.0 + (float)rand()/RAND_MAX; - v_prp.template get<2>(i)[2][1] = 170.0 + (float)rand()/RAND_MAX; - v_prp.template get<2>(i)[2][2] = 340.0 + (float)rand()/RAND_MAX; + v_pos.template get<0>(i)[0] = (float)rand()/(float)RAND_MAX; + v_pos.template get<0>(i)[1] = (float)rand()/(float)RAND_MAX; + v_pos.template get<0>(i)[2] = (float)rand()/(float)RAND_MAX; + + v_prp.template get<0>(i) = 5.0 + (float)rand()/(float)RAND_MAX; + v_prp.template get<1>(i)[0] = 10.0 + (float)rand()/(float)RAND_MAX; + v_prp.template get<1>(i)[1] = 11.0 + (float)rand()/(float)RAND_MAX; + v_prp.template get<2>(i)[0][0] = 40.0 + (float)rand()/(float)RAND_MAX; + v_prp.template get<2>(i)[0][1] = 50.0 + (float)rand()/(float)RAND_MAX; + v_prp.template get<2>(i)[0][2] = 60.0 + (float)rand()/(float)RAND_MAX; + v_prp.template get<2>(i)[1][0] = 70.0 + (float)rand()/(float)RAND_MAX; + v_prp.template get<2>(i)[1][1] = 80.0 + (float)rand()/(float)RAND_MAX; + v_prp.template get<2>(i)[1][2] = 150.0 + (float)rand()/(float)RAND_MAX; + v_prp.template get<2>(i)[2][0] = 160.0 + (float)rand()/(float)RAND_MAX; + v_prp.template get<2>(i)[2][1] = 170.0 + (float)rand()/(float)RAND_MAX; + v_prp.template get<2>(i)[2][2] = 340.0 + (float)rand()/(float)RAND_MAX; int seg = i / 10000; m_opart.template get<1>(i) = seg; @@ -1051,10 +1048,10 @@ BOOST_AUTO_TEST_CASE(vector_dist_gpu_map_fill_send_buffer_test) { auto ite = m_pos.get(i).getGPUIterator(); - 
process_map_particles<decltype(m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()), - decltype(v_pos.toKernel()),decltype(v_prp.toKernel())> - <<<ite.wthr,ite.thr>>> - (m_opart.toKernel(),m_pos.get(i).toKernel(), m_prp.get(i).toKernel(), + CUDA_LAUNCH_DIM3((process_map_particles<decltype(m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()), + decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>), + ite.wthr,ite.thr, + m_opart.toKernel(),m_pos.get(i).toKernel(), m_prp.get(i).toKernel(), v_pos.toKernel(),v_prp.toKernel(),offset); m_pos.get(i).deviceToHost<0>(); @@ -1116,9 +1113,9 @@ void vector_dist_remove_marked_type() { auto p = it.get(); - vd.getPos(p)[0] = (float)rand() / RAND_MAX; - vd.getPos(p)[1] = (float)rand() / RAND_MAX; - vd.getPos(p)[2] = (float)rand() / RAND_MAX; + vd.getPos(p)[0] = (float)rand() / (float)RAND_MAX; + vd.getPos(p)[1] = (float)rand() / (float)RAND_MAX; + vd.getPos(p)[2] = (float)rand() / (float)RAND_MAX; ++it; } @@ -1272,7 +1269,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_particle_NN_MP_iteration_gpu ) // Distributed vector vector_dist_gpu<3,float,part_prop> vd(k,box,bc,ghost,BIND_DEC_TO_GHOST); - size_t start = vd.init_size_accum(k); +/* size_t start = vd.init_size_accum(k); auto it = vd.getIterator(); @@ -1504,7 +1501,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_particle_NN_MP_iteration_gpu ) ++p_it3; } - BOOST_REQUIRE_EQUAL(ret,true); + BOOST_REQUIRE_EQUAL(ret,true);*/ } BOOST_AUTO_TEST_SUITE_END() diff --git a/src/Vector/cuda/vector_dist_cuda_funcs.cuh b/src/Vector/cuda/vector_dist_cuda_funcs.cuh index c5a19ccc3daa580a5d0093b30281d6acbde71ca1..78c4c9c5c72a508c9d2c6716643dc714c3e97e5a 100644 --- a/src/Vector/cuda/vector_dist_cuda_funcs.cuh +++ b/src/Vector/cuda/vector_dist_cuda_funcs.cuh @@ -9,12 +9,11 @@ #define VECTOR_DIST_CUDA_FUNCS_CUH_ #include "Vector/util/vector_dist_funcs.hpp" -#include "util/cuda/moderngpu/kernel_reduce.hxx" -#include 
"util/cuda/moderngpu/kernel_scan.hxx" #include "Decomposition/common.hpp" #include "lib/pdata.hpp" #include "util/cuda/kernels.cuh" #include "util/cuda/scan_ofp.cuh" +#include "util/cuda/reduce_ofp.cuh" #include "memory/CudaMemory.cuh" template<unsigned int dim, typename St, typename decomposition_type, typename vector_type, typename start_type, typename output_type> @@ -130,6 +129,7 @@ __global__ void process_ghost_particles_prp(vector_g_opart_type g_opart, vector process_ghost_device_particle_prp<vector_g_opart_type,vector_prp_type_out,vector_prp_type_in,prp...>(i,offset,g_opart,m_prp,v_prp); } + template<typename vector_prp_type_out, typename vector_prp_type_in, unsigned int ... prp> __global__ void process_ghost_particles_prp_put(vector_prp_type_out m_prp, vector_prp_type_in v_prp, unsigned int offset) @@ -291,7 +291,7 @@ auto reduce_local(vector_type & vd) -> typename std::remove_reference<decltype(v CudaMemory mem; mem.allocate(sizeof(reduce_type)); - mgpu::reduce((reduce_type *)vd.getPropVector(). template getDeviceBuffer<prp>(), + openfpm::reduce((reduce_type *)vd.getPropVector(). 
template getDeviceBuffer<prp>(), vd.size_local(), (reduce_type *)mem.getDevicePointer() , op<reduce_type>(), vd.getVC().getmgpuContext()); diff --git a/src/Vector/cuda/vector_dist_gpu_MP_tests.cu b/src/Vector/cuda/vector_dist_gpu_MP_tests.cu index 8d9f7d825cb7de6d47e25967dd6555995f0596dd..e3bcca4cbcddc3f77d2ffe87dc80be53bc9bd464 100644 --- a/src/Vector/cuda/vector_dist_gpu_MP_tests.cu +++ b/src/Vector/cuda/vector_dist_gpu_MP_tests.cu @@ -523,25 +523,25 @@ BOOST_AUTO_TEST_CASE( vector_dist_multiphase_kernel_test ) phases.template get<0>(2).add(); phases.template get<0>(3).add(); - phases.template get<0>(0).getLastPosWrite()[0] = (float)rand() / RAND_MAX; - phases.template get<0>(0).getLastPosWrite()[1] = (float)rand() / RAND_MAX; - phases.template get<0>(0).getLastPosWrite()[2] = (float)rand() / RAND_MAX; - phases.template get<0>(0).getLastPropWrite<0>() = (float)rand() / RAND_MAX; - - phases.template get<0>(1).getLastPosWrite()[0] = (float)rand() / RAND_MAX; - phases.template get<0>(1).getLastPosWrite()[1] = (float)rand() / RAND_MAX; - phases.template get<0>(1).getLastPosWrite()[2] = (float)rand() / RAND_MAX; - phases.template get<0>(1).getLastPropWrite<0>() = (float)rand() / RAND_MAX; - - phases.template get<0>(2).getLastPosWrite()[0] = (float)rand() / RAND_MAX; - phases.template get<0>(2).getLastPosWrite()[1] = (float)rand() / RAND_MAX; - phases.template get<0>(2).getLastPosWrite()[2] = (float)rand() / RAND_MAX; - phases.template get<0>(2).getLastPropWrite<0>() = (float)rand() / RAND_MAX; - - phases.template get<0>(3).getLastPosWrite()[0] = (float)rand() / RAND_MAX; - phases.template get<0>(3).getLastPosWrite()[1] = (float)rand() / RAND_MAX; - phases.template get<0>(3).getLastPosWrite()[2] = (float)rand() / RAND_MAX; - phases.template get<0>(3).getLastPropWrite<0>() = (float)rand() / RAND_MAX; + phases.template get<0>(0).getLastPosWrite()[0] = (float)rand() / (float)RAND_MAX; + phases.template get<0>(0).getLastPosWrite()[1] = (float)rand() / (float)RAND_MAX; + 
phases.template get<0>(0).getLastPosWrite()[2] = (float)rand() / (float)RAND_MAX; + phases.template get<0>(0).getLastPropWrite<0>() = (float)rand() / (float)RAND_MAX; + + phases.template get<0>(1).getLastPosWrite()[0] = (float)rand() / (float)RAND_MAX; + phases.template get<0>(1).getLastPosWrite()[1] = (float)rand() / (float)RAND_MAX; + phases.template get<0>(1).getLastPosWrite()[2] = (float)rand() / (float)RAND_MAX; + phases.template get<0>(1).getLastPropWrite<0>() = (float)rand() / (float)RAND_MAX; + + phases.template get<0>(2).getLastPosWrite()[0] = (float)rand() / (float)RAND_MAX; + phases.template get<0>(2).getLastPosWrite()[1] = (float)rand() / (float)RAND_MAX; + phases.template get<0>(2).getLastPosWrite()[2] = (float)rand() / (float)RAND_MAX; + phases.template get<0>(2).getLastPropWrite<0>() = (float)rand() / (float)RAND_MAX; + + phases.template get<0>(3).getLastPosWrite()[0] = (float)rand() / (float)RAND_MAX; + phases.template get<0>(3).getLastPosWrite()[1] = (float)rand() / (float)RAND_MAX; + phases.template get<0>(3).getLastPosWrite()[2] = (float)rand() / (float)RAND_MAX; + phases.template get<0>(3).getLastPropWrite<0>() = (float)rand() / (float)RAND_MAX; } phases.template get<0>(0).hostToDevicePos(); @@ -559,7 +559,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_multiphase_kernel_test ) openfpm::vector_gpu<aggregate<float>> output; output.resize(100 * phases.size()); - vdmkt<<<1,1>>>(phases.toKernel(),output.toKernel()); + CUDA_LAUNCH_DIM3(vdmkt,1,1,phases.toKernel(),output.toKernel()); output.template deviceToHost<0>(); @@ -615,25 +615,25 @@ BOOST_AUTO_TEST_CASE( vector_dist_multiphase_kernel_test_simplified ) phases.get(2).add(); phases.get(3).add(); - phases.get(0).getLastPosWrite()[0] = (float)rand() / RAND_MAX; - phases.get(0).getLastPosWrite()[1] = (float)rand() / RAND_MAX; - phases.get(0).getLastPosWrite()[2] = (float)rand() / RAND_MAX; - phases.get(0).getLastPropWrite<0>() = (float)rand() / RAND_MAX; - - phases.get(1).getLastPosWrite()[0] = (float)rand() 
/ RAND_MAX; - phases.get(1).getLastPosWrite()[1] = (float)rand() / RAND_MAX; - phases.get(1).getLastPosWrite()[2] = (float)rand() / RAND_MAX; - phases.get(1).getLastPropWrite<0>() = (float)rand() / RAND_MAX; - - phases.get(2).getLastPosWrite()[0] = (float)rand() / RAND_MAX; - phases.get(2).getLastPosWrite()[1] = (float)rand() / RAND_MAX; - phases.get(2).getLastPosWrite()[2] = (float)rand() / RAND_MAX; - phases.get(2).getLastPropWrite<0>() = (float)rand() / RAND_MAX; - - phases.get(3).getLastPosWrite()[0] = (float)rand() / RAND_MAX; - phases.get(3).getLastPosWrite()[1] = (float)rand() / RAND_MAX; - phases.get(3).getLastPosWrite()[2] = (float)rand() / RAND_MAX; - phases.get(3).getLastPropWrite<0>() = (float)rand() / RAND_MAX; + phases.get(0).getLastPosWrite()[0] = (float)rand() / (float)RAND_MAX; + phases.get(0).getLastPosWrite()[1] = (float)rand() / (float)RAND_MAX; + phases.get(0).getLastPosWrite()[2] = (float)rand() / (float)RAND_MAX; + phases.get(0).getLastPropWrite<0>() = (float)rand() / (float)RAND_MAX; + + phases.get(1).getLastPosWrite()[0] = (float)rand() / (float)RAND_MAX; + phases.get(1).getLastPosWrite()[1] = (float)rand() / (float)RAND_MAX; + phases.get(1).getLastPosWrite()[2] = (float)rand() / (float)RAND_MAX; + phases.get(1).getLastPropWrite<0>() = (float)rand() / (float)RAND_MAX; + + phases.get(2).getLastPosWrite()[0] = (float)rand() / (float)RAND_MAX; + phases.get(2).getLastPosWrite()[1] = (float)rand() / (float)RAND_MAX; + phases.get(2).getLastPosWrite()[2] = (float)rand() / (float)RAND_MAX; + phases.get(2).getLastPropWrite<0>() = (float)rand() / (float)RAND_MAX; + + phases.get(3).getLastPosWrite()[0] = (float)rand() / (float)RAND_MAX; + phases.get(3).getLastPosWrite()[1] = (float)rand() / (float)RAND_MAX; + phases.get(3).getLastPosWrite()[2] = (float)rand() / (float)RAND_MAX; + phases.get(3).getLastPropWrite<0>() = (float)rand() / (float)RAND_MAX; } phases.get(0).hostToDevicePos(); @@ -651,7 +651,7 @@ BOOST_AUTO_TEST_CASE( 
vector_dist_multiphase_kernel_test_simplified ) openfpm::vector_gpu<aggregate<float>> output; output.resize(100 * phases.size()); - vdmkt_simple<<<1,1>>>(phases.toKernel(),output.toKernel()); + CUDA_LAUNCH_DIM3(vdmkt_simple,1,1,phases.toKernel(),output.toKernel()); output.template deviceToHost<0>(); @@ -706,25 +706,25 @@ BOOST_AUTO_TEST_CASE( vector_dist_multiphase_kernel_cl_test ) phases.get(2).add(); phases.get(3).add(); - phases.get(0).getLastPosWrite()[0] = (float)rand() / RAND_MAX; - phases.get(0).getLastPosWrite()[1] = (float)rand() / RAND_MAX; - phases.get(0).getLastPosWrite()[2] = (float)rand() / RAND_MAX; - phases.get(0).getLastPropWrite<0>() = (float)rand() / RAND_MAX; - - phases.get(1).getLastPosWrite()[0] = (float)rand() / RAND_MAX; - phases.get(1).getLastPosWrite()[1] = (float)rand() / RAND_MAX; - phases.get(1).getLastPosWrite()[2] = (float)rand() / RAND_MAX; - phases.get(1).getLastPropWrite<0>() = (float)rand() / RAND_MAX; - - phases.get(2).getLastPosWrite()[0] = (float)rand() / RAND_MAX; - phases.get(2).getLastPosWrite()[1] = (float)rand() / RAND_MAX; - phases.get(2).getLastPosWrite()[2] = (float)rand() / RAND_MAX; - phases.get(2).getLastPropWrite<0>() = (float)rand() / RAND_MAX; - - phases.get(3).getLastPosWrite()[0] = (float)rand() / RAND_MAX; - phases.get(3).getLastPosWrite()[1] = (float)rand() / RAND_MAX; - phases.get(3).getLastPosWrite()[2] = (float)rand() / RAND_MAX; - phases.get(3).getLastPropWrite<0>() = (float)rand() / RAND_MAX; + phases.get(0).getLastPosWrite()[0] = (float)rand() / (float)RAND_MAX; + phases.get(0).getLastPosWrite()[1] = (float)rand() / (float)RAND_MAX; + phases.get(0).getLastPosWrite()[2] = (float)rand() / (float)RAND_MAX; + phases.get(0).getLastPropWrite<0>() = (float)rand() / (float)RAND_MAX; + + phases.get(1).getLastPosWrite()[0] = (float)rand() / (float)RAND_MAX; + phases.get(1).getLastPosWrite()[1] = (float)rand() / (float)RAND_MAX; + phases.get(1).getLastPosWrite()[2] = (float)rand() / (float)RAND_MAX; + 
phases.get(1).getLastPropWrite<0>() = (float)rand() / (float)RAND_MAX; + + phases.get(2).getLastPosWrite()[0] = (float)rand() / (float)RAND_MAX; + phases.get(2).getLastPosWrite()[1] = (float)rand() / (float)RAND_MAX; + phases.get(2).getLastPosWrite()[2] = (float)rand() / (float)RAND_MAX; + phases.get(2).getLastPropWrite<0>() = (float)rand() / (float)RAND_MAX; + + phases.get(3).getLastPosWrite()[0] = (float)rand() / (float)RAND_MAX; + phases.get(3).getLastPosWrite()[1] = (float)rand() / (float)RAND_MAX; + phases.get(3).getLastPosWrite()[2] = (float)rand() / (float)RAND_MAX; + phases.get(3).getLastPropWrite<0>() = (float)rand() / (float)RAND_MAX; } // redistribute all @@ -760,7 +760,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_multiphase_kernel_cl_test ) output.resize(tot); output2.resize(tot_g); - vdmkt_simple_cl<<<1,1>>>(phases.toKernel(),output.toKernel(),cl_ph.toKernel(),output2.toKernel()); + CUDA_LAUNCH_DIM3(vdmkt_simple_cl,1,1,phases.toKernel(),output.toKernel(),cl_ph.toKernel(),output2.toKernel()); output.template deviceToHost<0>(); diff --git a/src/Vector/cuda/vector_dist_gpu_unit_tests.cu b/src/Vector/cuda/vector_dist_gpu_unit_tests.cu index b70bd00f9be64da890347bf1ecd91379bd1e134d..c8fcec4e295126ecc66e0f7c06a8fe537e68b3ca 100644 --- a/src/Vector/cuda/vector_dist_gpu_unit_tests.cu +++ b/src/Vector/cuda/vector_dist_gpu_unit_tests.cu @@ -224,9 +224,9 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_ghost_get ) { auto p = it.get(); - vd.getPos(p)[0] = (float)rand() / RAND_MAX; - vd.getPos(p)[1] = (float)rand() / RAND_MAX; - vd.getPos(p)[2] = (float)rand() / RAND_MAX; + vd.getPos(p)[0] = (float)rand() / (float)RAND_MAX; + vd.getPos(p)[1] = (float)rand() / (float)RAND_MAX; + vd.getPos(p)[2] = (float)rand() / (float)RAND_MAX; vd.template getProp<0>(p) = vd.getPos(p)[0] + vd.getPos(p)[1] + vd.getPos(p)[2]; @@ -349,9 +349,9 @@ void vector_dist_gpu_test_impl() int y = rand(); int z = rand(); - vd.getPos(p)[0] = (float)x / RAND_MAX; - vd.getPos(p)[1] = (float)y / RAND_MAX; - 
vd.getPos(p)[2] = (float)z / RAND_MAX; + vd.getPos(p)[0] = (float)x / (float)RAND_MAX; + vd.getPos(p)[1] = (float)y / (float)RAND_MAX; + vd.getPos(p)[2] = (float)z / (float)RAND_MAX; Point<3,float> xp = vd.getPos(p); @@ -396,7 +396,7 @@ void vector_dist_gpu_test_impl() // offload to device vd.hostToDevicePos(); - initialize_props<<<it3.wthr,it3.thr>>>(vd.toKernel()); + CUDA_LAUNCH_DIM3(initialize_props,it3.wthr,it3.thr,vd.toKernel()); // now we check what we initialized @@ -468,9 +468,9 @@ void vector_dist_gpu_make_sort_test_impl() int y = rand(); int z = rand(); - vd.getPos(p)[0] = (float)x / RAND_MAX; - vd.getPos(p)[1] = (float)y / RAND_MAX; - vd.getPos(p)[2] = (float)z / RAND_MAX; + vd.getPos(p)[0] = (float)x / (float)RAND_MAX; + vd.getPos(p)[1] = (float)y / (float)RAND_MAX; + vd.getPos(p)[2] = (float)z / (float)RAND_MAX; ++it; } @@ -482,7 +482,7 @@ void vector_dist_gpu_make_sort_test_impl() auto it3 = vd.getDomainIteratorGPU(); - initialize_props<<<it3.wthr,it3.thr>>>(vd.toKernel()); + CUDA_LAUNCH_DIM3(initialize_props,it3.wthr,it3.thr,vd.toKernel()); // Here we check make sort does not mess-up particles we use a Cell-List to check that // the two cell-list constructed are identical @@ -603,9 +603,9 @@ void vdist_calc_gpu_test() { auto p = it.get(); - vd.getPos(p)[0] = (St)rand() / RAND_MAX; - vd.getPos(p)[1] = (St)rand() / RAND_MAX; - vd.getPos(p)[2] = (St)rand() / RAND_MAX; + vd.getPos(p)[0] = (St)rand() / (float)RAND_MAX; + vd.getPos(p)[1] = (St)rand() / (float)RAND_MAX; + vd.getPos(p)[2] = (St)rand() / (float)RAND_MAX; vd.template getProp<0>(p) = vd.getPos(p)[0] + vd.getPos(p)[1] + vd.getPos(p)[2]; @@ -708,8 +708,6 @@ void vdist_calc_gpu_test() { vd.map(RUN_ON_DEVICE); - CUDA_SAFE(cudaGetLastError()); - vd.deviceToHostPos(); vd.template deviceToHostProp<0,1,2>(); @@ -854,7 +852,7 @@ void vdist_calc_gpu_test() // move particles on gpu auto ite = vd.getDomainIteratorGPU(); - move_parts_gpu_test<3,decltype(vd.toKernel())><<<ite.wthr,ite.thr>>>(vd.toKernel()); 
+ CUDA_LAUNCH_DIM3((move_parts_gpu_test<3,decltype(vd.toKernel())>),ite.wthr,ite.thr,vd.toKernel()); } } @@ -933,14 +931,14 @@ void vector_dist_dlb_on_cuda_impl(size_t k,double r_cut) { std::random_device r; - std::seed_seq seed2{r() + create_vcluster().rank(), - r() + create_vcluster().rank(), - r() + create_vcluster().rank(), - r() + create_vcluster().rank(), - r() + create_vcluster().rank(), - r() + create_vcluster().rank(), - r() + create_vcluster().rank(), - r() + create_vcluster().rank()}; + std::seed_seq seed2{/*r() +*/ create_vcluster().rank(), + /*r() +*/ create_vcluster().rank(), + /*r() +*/ create_vcluster().rank(), + /*r() +*/ create_vcluster().rank(), + /*r() +*/ create_vcluster().rank(), + /*r() +*/ create_vcluster().rank(), + /*r() +*/ create_vcluster().rank(), + /*r() +*/ create_vcluster().rank()}; std::mt19937 e2(seed2); typedef vector_dist_gpu<3,double,aggregate<double,double[3],double[3]>> vector_type; @@ -1055,6 +1053,7 @@ void vector_dist_dlb_on_cuda_impl(size_t k,double r_cut) vd.hostToDevicePos(); vd.map(RUN_ON_DEVICE); vd.template ghost_get<0>(RUN_ON_DEVICE); + vd.deviceToHostPos(); vd.template deviceToHostProp<0,1,2>(); @@ -1102,7 +1101,11 @@ void vector_dist_dlb_on_cuda_impl(size_t k,double r_cut) double load_f = load; double load_fc = loads.get(i); +#ifdef ENABLE_ASAN + BOOST_REQUIRE_CLOSE(load_f,load_fc,30.0); +#else BOOST_REQUIRE_CLOSE(load_f,load_fc,10.0); +#endif } } } @@ -1311,7 +1314,9 @@ BOOST_AUTO_TEST_CASE(vector_dist_dlb_on_cuda2) if (create_vcluster().size() <= 3) {return;}; + #ifndef CUDA_ON_CPU vector_dist_dlb_on_cuda_impl<CellList_gpu<3,double,CudaMemory,shift_only<3,double>,unsigned int,int,false>>(1000000,0.01); + #endif } BOOST_AUTO_TEST_CASE(vector_dist_dlb_on_cuda3) @@ -1319,7 +1324,9 @@ BOOST_AUTO_TEST_CASE(vector_dist_dlb_on_cuda3) if (create_vcluster().size() < 8) {return;} + #ifndef CUDA_ON_CPU vector_dist_dlb_on_cuda_impl<CellList_gpu<3,double,CudaMemory,shift_only<3,double>,unsigned 
int,int,false>>(15000000,0.005); + #endif } @@ -1847,6 +1854,7 @@ BOOST_AUTO_TEST_CASE(vector_dist_overflow_se_class1) } + BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu ) { Vcluster<> & v_cl = create_vcluster(); @@ -1878,7 +1886,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu ) // ghost Ghost<3,float> ghost(r_g); - typedef aggregate<float> part_prop; + typedef aggregate<float,float,float> part_prop; // Distributed vector vector_dist_gpu<3,float, part_prop > vd(0,box,bc,ghost); @@ -1899,16 +1907,18 @@ BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu ) vd.getLastPropWrite<0>() = 0.0; + vd.getLastPropWrite<2>() = 0.0; + ++it; } vd.map(); vd.hostToDevicePos(); - vd.template hostToDeviceProp<0>(); + vd.template hostToDeviceProp<0,2>(); // sync the ghost - vd.ghost_get<0>(RUN_ON_DEVICE); - vd.template deviceToHostProp<0>(); + vd.ghost_get<0,2>(RUN_ON_DEVICE); + vd.template deviceToHostProp<0,2>(); vd.deviceToHostPos(); { @@ -1937,7 +1947,10 @@ BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu ) float dist = xp.distance(xq); if (dist < r_cut) + { vd.getPropWrite<0>(q) += a*(-dist*dist+r_cut*r_cut); + vd.getPropWrite<2>(q) += a*(-dist*dist+r_cut*r_cut) / 2; + } ++Np; } @@ -1946,25 +1959,27 @@ BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu ) } vd.hostToDevicePos(); - vd.template hostToDeviceProp<0>(); - vd.template ghost_put<add_atomic_,0>(RUN_ON_DEVICE); - vd.template deviceToHostProp<0>(); + vd.template hostToDeviceProp<0,2>(); + vd.template ghost_put<add_atomic_,0,2>(RUN_ON_DEVICE); + vd.template deviceToHostProp<0,2>(); vd.deviceToHostPos(); bool ret = true; auto it3 = vd.getDomainIterator(); float constant = vd.getProp<0>(it3.get()); + float constanta = vd.getProp<2>(it3.get()); float eps = 0.001; while (it3.isNext()) { float constant2 = vd.getProp<0>(it3.get()); - if (fabs(constant - constant2)/constant > eps) + float constant3 = vd.getProp<2>(it3.get()); + if (fabs(constant - constant2)/constant > eps || fabs(constanta - constant3)/constanta > eps) { 
Point<3,float> p = vd.getPosRead(it3.get()); - std::cout << p.toString() << " " << constant2 << "/" << constant << " " << v_cl.getProcessUnitID() << std::endl; + std::cout << p.toString() << " " << constant2 << "/" << constant << "/" << constant3 << " " << v_cl.getProcessUnitID() << std::endl; ret = false; break; } @@ -1980,6 +1995,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu ) auto key = itp.get(); vd.getPropWrite<0>(key) = 0.0; + vd.getPropWrite<2>(key) = 0.0; ++itp; } @@ -2010,7 +2026,10 @@ BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu ) float dist = xp.distance(xq); if (dist < r_cut) + { vd.getPropWrite<0>(q) += a*(-dist*dist+r_cut*r_cut); + vd.getPropWrite<2>(q) += a*(-dist*dist+r_cut*r_cut); + } ++Np; } @@ -2019,25 +2038,28 @@ BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu ) } vd.hostToDevicePos(); - vd.template hostToDeviceProp<0>(); + vd.template hostToDeviceProp<0,2>(); vd.template ghost_put<add_atomic_,0>(RUN_ON_DEVICE); - vd.template deviceToHostProp<0>(); + vd.template ghost_put<add_atomic_,2>(RUN_ON_DEVICE); + vd.template deviceToHostProp<0,2>(); vd.deviceToHostPos(); bool ret = true; auto it3 = vd.getDomainIterator(); float constant = vd.getPropRead<0>(it3.get()); + float constanta = vd.getPropRead<2>(it3.get()); float eps = 0.001; while (it3.isNext()) { float constant2 = vd.getPropRead<0>(it3.get()); - if (fabs(constant - constant2)/constant > eps) + float constant3 = vd.getPropRead<2>(it3.get()); + if (fabs(constant - constant2)/constant > eps || fabs(constanta - constant3)/constanta > eps) { Point<3,float> p = vd.getPosRead(it3.get()); - std::cout << p.toString() << " " << constant2 << "/" << constant << " " << v_cl.getProcessUnitID() << std::endl; + std::cout << p.toString() << " " << constant2 << "/" << constant << "/" << constant3 << " " << v_cl.getProcessUnitID() << std::endl; ret = false; break; } diff --git a/src/Vector/tests/vector_dist_cell_list_tests.cpp b/src/Vector/tests/vector_dist_cell_list_tests.cpp index
028f5beeb55973533a3b72d4d331abedeb1c1893..b6c2e7789fd55f0623e1c44139ebd505ee76845c 100644 --- a/src/Vector/tests/vector_dist_cell_list_tests.cpp +++ b/src/Vector/tests/vector_dist_cell_list_tests.cpp @@ -607,8 +607,6 @@ BOOST_AUTO_TEST_CASE( vector_dist_symmetric_cell_list ) vd.ghost_put<add_,1>(); vd.ghost_put<merge_,4>(); - vd.write("DEBUG"); - auto p_it3 = vd.getDomainIterator(); bool ret = true; diff --git a/src/Vector/tests/vector_dist_dlb_test.hpp b/src/Vector/tests/vector_dist_dlb_test.hpp index 97b4dcee0f2ff3a773d1d4b2bc114bdb3edcc0b7..df936a3d164744edb7645b46bd27b4cdb60b6b34 100644 --- a/src/Vector/tests/vector_dist_dlb_test.hpp +++ b/src/Vector/tests/vector_dist_dlb_test.hpp @@ -251,7 +251,6 @@ template<typename vector_type> void test_dlb_vector() } vd.map(); - vd.template ghost_get<>(); // Get the neighborhood of each particles @@ -411,7 +410,7 @@ template<typename vector_type> void test_dlb_multi_phase_v_vector() mp_test_template(vd0,vd1,vd2,vd3); } -BOOST_AUTO_TEST_CASE( vector_dist_dlb_test_part ) +BOOST_AUTO_TEST_CASE( vector_dist_dlb ) { test_dlb_vector<vector_dist<3,double,aggregate<double>>>(); } diff --git a/src/Vector/util/vector_dist_funcs.hpp b/src/Vector/util/vector_dist_funcs.hpp index 35f001aeb675eb1a715f9bb39ba4501c96c0d496..6d86d44779dbd1c6d14900cd05f03d0f9f5da590 100644 --- a/src/Vector/util/vector_dist_funcs.hpp +++ b/src/Vector/util/vector_dist_funcs.hpp @@ -116,6 +116,7 @@ __device__ inline void process_map_device_particle(unsigned int i, unsigned int proc_class::proc(i,id,v_prp,m_prp); } + //! It process one particle template<typename Top, typename T2, typename T4, unsigned int ... 
prp> __device__ inline void process_ghost_device_particle_prp(unsigned int i, unsigned int offset, Top & g_opart, T2 & m_prp, T4 & v_prp) diff --git a/src/Vector/vector_dist.hpp b/src/Vector/vector_dist.hpp index df61e151a2a128d4e4d1b69629968b73dd859935..66ac010b09e1039abcd0c19974cb23f9c1c93a05 100644 --- a/src/Vector/vector_dist.hpp +++ b/src/Vector/vector_dist.hpp @@ -9,6 +9,7 @@ #define VECTOR_HPP_ #include "config.h" +#include "util/cuda_launch.hpp" #include "HDF5_wr/HDF5_wr.hpp" #include "VCluster/VCluster.hpp" #include "Space/Shape/Point.hpp" @@ -280,17 +281,17 @@ private: //! Particle position vector, (It has 2 elements) the first has real particles assigned to a processor //! the second element contain unassigned particles - openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> v_pos; + openfpm::vector<Point<dim, St>,Memory,layout_base> v_pos; //! Particle properties vector, (It has 2 elements) the first has real particles assigned to a processor //! the second element contain unassigned particles - openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> v_prp; + openfpm::vector<prop,Memory,layout_base> v_prp; //! reordered v_pos buffer - openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> v_prp_out; + openfpm::vector<prop,Memory,layout_base> v_prp_out; //! reordered v_prp buffer - openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> v_pos_out; + openfpm::vector<Point<dim, St>,Memory,layout_base> v_pos_out; //! option used to create this vector size_t opt = 0; @@ -550,6 +551,35 @@ public: #endif } + /*! \brief Constructor of a distributed vector + * + * \param np number of elements + * \param box domain where the vector of elements live + * \param bc boundary conditions + * \param g Ghost margins + * \param opt [Optional] additional options. BIND_DEC_TO_GHOST Bind the decomposition to be multiple of the + * ghost size. 
This is required if we want to use symmetric to eliminate + * ghost communications. + * \param gdist [Optional] override the default distribution grid + * + */ + vector_dist(size_t np, Box<dim, St> box, const size_t (&bc)[dim], const Ghost<dim, St> & g, const grid_sm<dim,void> & gdist) + :opt(0) SE_CLASS3_VDIST_CONSTRUCTOR + { + if (opt >> 32 != 0) + {this->setDecompositionGranularity(opt >> 32);} + + check_parameters(box); + + init_structures(np); + + this->init_decomposition_gr_cell(box,bc,g,opt,gdist); + + +#ifdef SE_CLASS3 + se3.Initialize(); +#endif + } /*! \brief Constructor of a distributed vector * @@ -2074,7 +2104,7 @@ public: * \return an iterator * */ - ite_gpu<1> getDomainIteratorGPU(size_t n_thr = 1024) const + ite_gpu<1> getDomainIteratorGPU(size_t n_thr = default_kernel_wg_threads_) const { #ifdef SE_CLASS3 se3.getIterator(); @@ -2088,7 +2118,7 @@ public: * \return an iterator * */ - ite_gpu<1> getDomainAndGhostIteratorGPU(size_t n_thr = 1024) const + ite_gpu<1> getDomainAndGhostIteratorGPU(size_t n_thr = default_kernel_wg_threads_) const { #ifdef SE_CLASS3 se3.getIterator(); @@ -2103,7 +2133,7 @@ public: * */ template<unsigned int ... prp,typename id_1, typename id_2, bool is_sparse> - void merge_sort(CellList_gpu<dim,St,CudaMemory,shift_only<dim, St>,id_1,id_2,is_sparse> & cl, size_t n_thr = 1024) + void merge_sort(CellList_gpu<dim,St,CudaMemory,shift_only<dim, St>,id_1,id_2,is_sparse> & cl, size_t n_thr = default_kernel_wg_threads_) { #if defined(__NVCC__) @@ -2190,7 +2220,7 @@ public: * \parameter Cell-list from which has been constructed the sorted vector * */ - template<unsigned int ... prp> void merge_sort_with_pos(CellList_gpu<dim,St,CudaMemory,shift_only<dim, St>> & cl, size_t n_thr = 1024) + template<unsigned int ... 
prp> void merge_sort_with_pos(CellList_gpu<dim,St,CudaMemory,shift_only<dim, St>> & cl, size_t n_thr = default_kernel_wg_threads_) { #if defined(__NVCC__) @@ -2212,7 +2242,7 @@ public: * \return an iterator * */ - auto getDomainIteratorDevice(size_t n_thr = 1024) const -> decltype(this->getDomainIteratorGPU(n_thr)) + auto getDomainIteratorDevice(size_t n_thr = default_kernel_wg_threads_) const -> decltype(this->getDomainIteratorGPU(n_thr)) { return this->getDomainIteratorGPU(n_thr); } @@ -2225,7 +2255,7 @@ public: * \return an iterator * */ - auto getDomainIteratorDevice(size_t n_thr = 1024) const -> decltype(this->getDomainIterator()) + auto getDomainIteratorDevice(size_t n_thr = default_kernel_wg_threads_) const -> decltype(this->getDomainIterator()) { return this->getDomainIterator(); } @@ -2657,8 +2687,8 @@ public: if ((opt & 0x0FFF0000) == CSV_WRITER) { // CSVWriter test - CSVWriter<openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base>, - openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> > csv_writer; + CSVWriter<openfpm::vector<Point<dim, St>,Memory,layout_base>, + openfpm::vector<prop,Memory,layout_base> > csv_writer; std::string output = std::to_string(out + "_" + std::to_string(v_cl.getProcessUnitID()) + std::to_string(".csv")); @@ -2673,8 +2703,8 @@ public: ft = file_type::BINARY; // VTKWriter for a set of points - VTKWriter<boost::mpl::pair<openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base>, - openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base>>, + VTKWriter<boost::mpl::pair<openfpm::vector<Point<dim, St>,Memory,layout_base>, + openfpm::vector<prop,Memory,layout_base>>, VECTOR_POINTS> vtk_writer; vtk_writer.add(v_pos,v_prp,g_m); @@ -2753,8 +2783,8 @@ public: if ((opt & 0x0FFF0000) == CSV_WRITER) { // CSVWriter test - CSVWriter<openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base>, - 
openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> > csv_writer; + CSVWriter<openfpm::vector<Point<dim, St>,Memory,layout_base>, + openfpm::vector<prop,Memory,layout_base> > csv_writer; std::string output = std::to_string(out + "_" + std::to_string(v_cl.getProcessUnitID()) + "_" + std::to_string(iteration) + std::to_string(".csv")); @@ -2769,8 +2799,8 @@ public: ft = file_type::BINARY; // VTKWriter for a set of points - VTKWriter<boost::mpl::pair<openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base>, - openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base>>, VECTOR_POINTS> vtk_writer; + VTKWriter<boost::mpl::pair<openfpm::vector<Point<dim, St>,Memory,layout_base>, + openfpm::vector<prop,Memory,layout_base>>, VECTOR_POINTS> vtk_writer; vtk_writer.add(v_pos,v_prp,g_m); std::string output = std::to_string(out + "_" + std::to_string(v_cl.getProcessUnitID()) + "_" + std::to_string(iteration) + std::to_string(".vtp")); @@ -2842,7 +2872,7 @@ public: * \return the particle position vector * */ - const openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & getPosVector() const + const openfpm::vector<Point<dim, St>,Memory,layout_base> & getPosVector() const { return v_pos; } @@ -2852,7 +2882,7 @@ public: * \return the particle position vector * */ - openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & getPosVector() + openfpm::vector<Point<dim, St>,Memory,layout_base> & getPosVector() { return v_pos; } @@ -2862,7 +2892,7 @@ public: * \return the particle property vector * */ - const openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & getPropVector() const + const openfpm::vector<prop,Memory,layout_base> & getPropVector() const { return v_prp; } @@ -2872,7 +2902,7 @@ public: * \return the particle property vector * */ - openfpm::vector<prop,Memory,typename 
layout_base<prop>::type,layout_base> & getPropVector() + openfpm::vector<prop,Memory,layout_base> & getPropVector() { return v_prp; } @@ -2882,7 +2912,7 @@ public: * \return the particle position vector * */ - const openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & getPosVectorSort() const + const openfpm::vector<Point<dim, St>,Memory,layout_base> & getPosVectorSort() const { return v_pos_out; } @@ -2892,7 +2922,7 @@ public: * \return the particle position vector * */ - openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & getPosVectorSort() + openfpm::vector<Point<dim, St>,Memory,layout_base> & getPosVectorSort() { return v_pos_out; } @@ -2902,7 +2932,7 @@ public: * \return the particle property vector * */ - const openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & getPropVectorSort() const + const openfpm::vector<prop,Memory,layout_base> & getPropVectorSort() const { return v_prp_out; } @@ -2912,7 +2942,7 @@ public: * \return the particle property vector * */ - openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & getPropVectorSort() + openfpm::vector<prop,Memory,layout_base> & getPropVectorSort() { return v_prp_out; } diff --git a/src/Vector/vector_dist_comm.hpp b/src/Vector/vector_dist_comm.hpp index 6eeb9179f5f77b06f80dc7d97ec9dbc53bac2a2c..f31f4538a454f85851b1a2c221a033e8190b6dfe 100644 --- a/src/Vector/vector_dist_comm.hpp +++ b/src/Vector/vector_dist_comm.hpp @@ -11,9 +11,7 @@ #define TEST1 #if defined(CUDA_GPU) && defined(__NVCC__) -#include "util/cuda/moderngpu/kernel_mergesort.hxx" #include "Vector/cuda/vector_dist_cuda_funcs.cuh" -#include "util/cuda/moderngpu/kernel_scan.hxx" #include "util/cuda/kernels.cuh" #endif @@ -351,7 +349,7 @@ class vector_dist_comm size_t v_sub_unit_factor = 64; //! 
definition of the send vector for position - typedef openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base,openfpm::grow_policy_identity> send_pos_vector; + typedef openfpm::vector<Point<dim, St>,Memory,layout_base,openfpm::grow_policy_identity> send_pos_vector; //! VCluster Vcluster<Memory> & v_cl; @@ -369,7 +367,6 @@ class vector_dist_comm //! third id is the processor id openfpm::vector<aggregate<int,int,int>, Memory, - typename layout_base<aggregate<int,int,int>>::type, layout_base > m_opart; //! Per processor ordered particles id for ghost_get (see prc_g_opart) @@ -380,14 +377,13 @@ class vector_dist_comm //! Same as g_opart but on device, the vector of vector is flatten into a single vector openfpm::vector<aggregate<unsigned int,unsigned long int>, CudaMemory, - typename memory_traits_inte<aggregate<unsigned int,unsigned long int>>::type, memory_traits_inte> g_opart_device; //! Helper buffer for computation (on GPU) of local particles (position) - openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> v_pos_tmp; + openfpm::vector<Point<dim, St>,Memory,layout_base> v_pos_tmp; //! Helper buffer for computation (on GPU) of local particles (properties) - openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> v_prp_tmp; + openfpm::vector<prop,Memory,layout_base> v_prp_tmp; //! Per processor number of particle g_opart_sz.get(i) = g_opart.get(i).size() openfpm::vector<size_t> g_opart_sz; @@ -426,17 +422,15 @@ class vector_dist_comm //! temporary buffer to processors ids openfpm::vector<aggregate<unsigned int>, Memory, - typename layout_base<aggregate<unsigned int>>::type, layout_base> proc_id_out; //! temporary buffer for the scan result openfpm::vector<aggregate<unsigned int>, Memory, - typename layout_base<aggregate<unsigned int>>::type, layout_base> starts; //! 
Processor communication size - openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,typename layout_base<aggregate<unsigned int, unsigned int>>::type,layout_base> prc_offset; + openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,layout_base> prc_offset; //! Temporary CudaMemory to do stuff @@ -512,7 +506,7 @@ class vector_dist_comm * \param prc_r processor ids * */ - inline void calc_send_buffers(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & prc_sz, + inline void calc_send_buffers(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_sz, openfpm::vector<size_t> & prc_sz_r, openfpm::vector<size_t> & prc_r, size_t opt) @@ -575,17 +569,17 @@ class vector_dist_comm openfpm::vector_std<openfpm::vector_std<Box<dim, St>>> box_f; //! The boxes touching the border of the domain + shift vector linearized from where they come from - openfpm::vector<Box<dim, St>,Memory,typename layout_base<Box<dim,St>>::type,layout_base> box_f_dev; - openfpm::vector<aggregate<unsigned int>,Memory,typename layout_base<aggregate<unsigned int>>::type,layout_base> box_f_sv; + openfpm::vector<Box<dim, St>,Memory,layout_base> box_f_dev; + openfpm::vector<aggregate<unsigned int>,Memory,layout_base> box_f_sv; //! Store the sector for each group (previous vector) openfpm::vector_std<comb<dim>> box_cmb; //! Id of the local particle to replicate for ghost_get - openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> o_part_loc; + openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> o_part_loc; //! Processor communication size - openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,typename layout_base<aggregate<unsigned int, unsigned int>>::type,layout_base> prc_sz; + openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,layout_base> prc_sz; /*! 
\brief For every internal ghost box we create a structure that order such internal local ghost box in * shift vectors @@ -680,12 +674,12 @@ class vector_dist_comm * \param opt options * */ - void local_ghost_from_opart(openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos, - openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp, + void local_ghost_from_opart(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, + openfpm::vector<prop,Memory,layout_base> & v_prp, size_t opt) { // get the shift vectors - const openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & shifts = dec.getShiftVectors(); + const openfpm::vector<Point<dim, St>,Memory,layout_base> & shifts = dec.getShiftVectors(); if (!(opt & NO_POSITION)) { @@ -737,14 +731,14 @@ class vector_dist_comm * \param g_m ghost marker * */ - void local_ghost_from_dec(openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos, - openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp, + void local_ghost_from_dec(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, + openfpm::vector<prop,Memory,layout_base> & v_prp, size_t g_m,size_t opt) { o_part_loc.clear(); // get the shift vectors - const openfpm::vector<Point<dim,St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & shifts = dec.getShiftVectors(); + const openfpm::vector<Point<dim,St>,Memory,layout_base> & shifts = dec.getShiftVectors(); if (opt & RUN_ON_DEVICE) { @@ -848,8 +842,8 @@ class vector_dist_comm * \param opt options * */ - void add_loc_particles_bc(openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos, - openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp , + void add_loc_particles_bc(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, + 
openfpm::vector<prop,Memory,layout_base> & v_prp , size_t & g_m, size_t opt) { @@ -876,14 +870,14 @@ class vector_dist_comm * \param g_pos_send Send buffer to fill * */ - void fill_send_ghost_pos_buf(openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos, + void fill_send_ghost_pos_buf(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, openfpm::vector<size_t> & prc_sz, openfpm::vector<send_pos_vector> & g_pos_send, size_t opt, bool async) { // get the shift vectors - const openfpm::vector<Point<dim,St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & shifts = dec.getShiftVectors(); + const openfpm::vector<Point<dim,St>,Memory,layout_base> & shifts = dec.getShiftVectors(); // create a number of send buffers equal to the near processors g_pos_send.resize(prc_sz.size()); @@ -965,7 +959,7 @@ class vector_dist_comm * */ template<typename send_vector, typename prp_object, int ... prp> - void fill_send_ghost_put_prp_buf(openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp, + void fill_send_ghost_put_prp_buf(openfpm::vector<prop,Memory,layout_base> & v_prp, openfpm::vector<send_vector> & g_send_prp, size_t & g_m, size_t opt) @@ -1040,9 +1034,9 @@ class vector_dist_comm for (size_t j = accum; j < accum + n_part_recv; j++) { // source object type - typedef encapc<1, prop, typename openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base>::layout_type> encap_src; + typedef encapc<1, prop, typename openfpm::vector<prop,Memory,layout_base>::layout_type> encap_src; // destination object type - typedef encapc<1, prp_object, typename openfpm::vector<prp_object,Memory,typename layout_base<prp_object>::type,layout_base>::layout_type> encap_dst; + typedef encapc<1, prp_object, typename openfpm::vector<prp_object,Memory,layout_base>::layout_type> encap_dst; // Copy only the selected properties object_si_d<encap_src, encap_dst, OBJ_ENCAP, prp...>(v_prp.get(j), 
g_send_prp.get(i).get(j2)); @@ -1154,7 +1148,7 @@ class vector_dist_comm * */ template<typename send_vector, typename prp_object, int ... prp> - void fill_send_ghost_prp_buf(openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp, + void fill_send_ghost_prp_buf(openfpm::vector<prop,Memory,layout_base> & v_prp, openfpm::vector<size_t> & prc_sz, openfpm::vector<send_vector> & g_send_prp, size_t opt) @@ -1245,13 +1239,13 @@ class vector_dist_comm * This parameter is used only in case of RUN_ON_DEVICE option * */ - void fill_send_map_buf(openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos, - openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp, + void fill_send_map_buf(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, + openfpm::vector<prop,Memory,layout_base> & v_prp, openfpm::vector<size_t> & prc_sz_r, openfpm::vector<size_t> & prc_r, - openfpm::vector<openfpm::vector<Point<dim,St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base,openfpm::grow_policy_identity>> & m_pos, - openfpm::vector<openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base,openfpm::grow_policy_identity>> & m_prp, - openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,typename layout_base<aggregate<unsigned int, unsigned int>>::type,layout_base> & prc_sz, + openfpm::vector<openfpm::vector<Point<dim,St>,Memory,layout_base,openfpm::grow_policy_identity>> & m_pos, + openfpm::vector<openfpm::vector<prop,Memory,layout_base,openfpm::grow_policy_identity>> & m_prp, + openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,layout_base> & prc_sz, size_t opt) { m_prp.resize(prc_sz_r.size()); @@ -1397,7 +1391,7 @@ class vector_dist_comm */ template<typename prp_object,int ... 
prp> void fill_send_map_buf_list(openfpm::vector<Point<dim, St>> & v_pos, - openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp, + openfpm::vector<prop,Memory,layout_base> & v_prp, openfpm::vector<size_t> & prc_sz_r, openfpm::vector<openfpm::vector<Point<dim,St>>> & m_pos, openfpm::vector<openfpm::vector<prp_object>> & m_prp) @@ -1438,12 +1432,11 @@ class vector_dist_comm * \param opt options * */ - template<typename obp> void labelParticleProcessor(openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos, + template<typename obp> void labelParticleProcessor(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, openfpm::vector<aggregate<int,int,int>, Memory, - typename layout_base<aggregate<int,int,int>>::type, layout_base> & lbl_p, - openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & prc_sz, + openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_sz, size_t opt) { if (opt == RUN_ON_DEVICE) @@ -1601,11 +1594,11 @@ class vector_dist_comm * \param opt ghost_get options * */ - void labelParticlesGhost(openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos, - openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp, + void labelParticlesGhost(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, + openfpm::vector<prop,Memory,layout_base> & v_prp, openfpm::vector<size_t> & prc, openfpm::vector<size_t> & prc_sz, - openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & prc_offset, + openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> & prc_offset, size_t & g_m, size_t opt) { @@ -1814,6 +1807,31 @@ public: dec.decompose(); } + /*! 
\brief Initialize the decomposition + * + * \param box domain + * \param bc boundary conditions + * \param g ghost extension + * \param opt additional options + * + */ + void init_decomposition_gr_cell(Box<dim,St> & box, + const size_t (& bc)[dim], + const Ghost<dim,St> & g, + size_t opt, + const grid_sm<dim,void> & gdist) + { + size_t div[dim]; + + for (size_t i = 0 ; i < dim ; i++) + {div[i] = gdist.size(i);} + + // Create the sub-domains + dec.setParameters(div, box, bc, g); + + dec.decompose(); + } + /*! \brief It synchronize the properties and position of the ghost particles * * \tparam prp list of properties to get synchronize @@ -1824,8 +1842,8 @@ public: * \param g_m marker between real and ghost particles * */ - template<unsigned int impl, int ... prp> inline void ghost_get_(openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos, - openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp, + template<unsigned int impl, int ... prp> inline void ghost_get_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, + openfpm::vector<prop,Memory,layout_base> & v_prp, size_t & g_m, size_t opt = WITH_POSITION) { @@ -1837,7 +1855,7 @@ public: typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object; // send vector for each processor - typedef openfpm::vector<prp_object,Memory,typename layout_base<prp_object>::type,layout_base,openfpm::grow_policy_identity> send_vector; + typedef openfpm::vector<prp_object,Memory,layout_base,openfpm::grow_policy_identity> send_vector; if (!(opt & NO_POSITION)) {v_pos.resize(g_m);} @@ -1911,8 +1929,8 @@ public: * \param g_m marker between real and ghost particles * */ - template<int ... prp> inline void ghost_wait_(openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos, - openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp, + template<int ... 
prp> inline void ghost_wait_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, + openfpm::vector<prop,Memory,layout_base> & v_prp, size_t & g_m, size_t opt = WITH_POSITION) { @@ -1920,7 +1938,7 @@ public: typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object; // send vector for each processor - typedef openfpm::vector<prp_object,Memory,typename layout_base<prp_object>::type,layout_base,openfpm::grow_policy_identity> send_vector; + typedef openfpm::vector<prp_object,Memory,layout_base,openfpm::grow_policy_identity> send_vector; // Send and receive ghost particle information openfpm::vector<send_vector> g_send_prp; @@ -1962,7 +1980,7 @@ public: typedef KillParticle obp; // Processor communication size - openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> prc_sz(v_cl.getProcessingUnits()); + openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,layout_base> prc_sz(v_cl.getProcessingUnits()); // map completely reset the ghost part v_pos.resize(g_m); @@ -2028,8 +2046,8 @@ public: * */ template<typename obp = KillParticle> - void map_(openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos, - openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp, size_t & g_m, + void map_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, + openfpm::vector<prop,Memory,layout_base> & v_prp, size_t & g_m, size_t opt) { #ifdef PROFILE_SCOREP @@ -2053,9 +2071,9 @@ public: calc_send_buffers(prc_sz,prc_sz_r,prc_r,opt); //! position vector - openfpm::vector<openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base,openfpm::grow_policy_identity>> m_pos; + openfpm::vector<openfpm::vector<Point<dim, St>,Memory,layout_base,openfpm::grow_policy_identity>> m_pos; //! 
properties vector - openfpm::vector<openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base,openfpm::grow_policy_identity>> m_prp; + openfpm::vector<openfpm::vector<prop,Memory,layout_base,openfpm::grow_policy_identity>> m_prp; fill_send_map_buf(v_pos,v_prp, prc_sz_r,prc_r, m_pos, m_prp,prc_sz,opt); @@ -2071,13 +2089,13 @@ public: #endif } - v_cl.template SSendRecv<openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base,openfpm::grow_policy_identity>, - openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base>, + v_cl.template SSendRecv<openfpm::vector<Point<dim, St>,Memory,layout_base,openfpm::grow_policy_identity>, + openfpm::vector<Point<dim, St>,Memory,layout_base>, layout_base> (m_pos,v_pos,prc_r,prc_recv_map,recv_sz_map,opt_); - v_cl.template SSendRecv<openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base,openfpm::grow_policy_identity>, - openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base>, + v_cl.template SSendRecv<openfpm::vector<prop,Memory,layout_base,openfpm::grow_policy_identity>, + openfpm::vector<prop,Memory,layout_base>, layout_base> (m_prp,v_prp,prc_r,prc_recv_map,recv_sz_map,opt_); @@ -2146,8 +2164,8 @@ public: * */ template<template<typename,typename> class op, int ... 
prp> - void ghost_put_(openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim, St>>::type,layout_base> & v_pos, - openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp, + void ghost_put_(openfpm::vector<Point<dim, St>,Memory,layout_base> & v_pos, + openfpm::vector<prop,Memory,layout_base> & v_prp, size_t & g_m, size_t opt) { @@ -2155,7 +2173,7 @@ public: typedef object<typename object_creator<typename prop::type, prp...>::type> prp_object; // send vector for each processor - typedef openfpm::vector<prp_object,Memory,typename layout_base<prp_object>::type,layout_base> send_vector; + typedef openfpm::vector<prp_object,Memory,layout_base> send_vector; openfpm::vector<send_vector> g_send_prp; fill_send_ghost_put_prp_buf<send_vector, prp_object, prp...>(v_prp,g_send_prp,g_m,opt); diff --git a/src/Vector/vector_dist_kernel.hpp b/src/Vector/vector_dist_kernel.hpp index 435515570da202555cb4e034bf2a1e2d16807b8e..9e49687b9a7ce2a5d426ca33de0bc5aabe61b244 100644 --- a/src/Vector/vector_dist_kernel.hpp +++ b/src/Vector/vector_dist_kernel.hpp @@ -257,7 +257,7 @@ public: * \return an iterator * */ - __host__ ite_gpu<1> getDomainIteratorGPU(size_t n_thr = 1024) const + __host__ ite_gpu<1> getDomainIteratorGPU(size_t n_thr = default_kernel_wg_threads_) const { return v_pos.getGPUIteratorTo(g_m,n_thr); } diff --git a/src/cmake/openfpmConfig-configure.cmake b/src/cmake/openfpmConfig-configure.cmake new file mode 100644 index 0000000000000000000000000000000000000000..c7b0f8a11888857afc304257b4fa54da81c57c88 --- /dev/null +++ b/src/cmake/openfpmConfig-configure.cmake @@ -0,0 +1,4 @@ +get_property(OPENFPM_INCLUDES TARGET openfpm::binary_config PROPERTY INTERFACE_INCLUDE_DIRECTORIES) +get_property(OPENFPM_DEFINITION TARGET openfpm::binary_config PROPERTY INTERFACE_COMPILE_DEFINITIONS) +get_property(OPENFPM_LIBS TARGET openfpm::binary_config PROPERTY INTERFACE_LINK_LIBRARIES) +get_property(OPENFPM_COMPILE_OPTIONS TARGET openfpm::binary_config 
PROPERTY INTERFACE_COMPILE_OPTIONS) diff --git a/src/config/config_cmake.h.in b/src/config/config_cmake.h.in index c38415e8e75b5cbc443ac8ed6eafc387f97559e2..ef3eb90ec3ed96273f9a864c41a8a404171b104e 100644 --- a/src/config/config_cmake.h.in +++ b/src/config/config_cmake.h.in @@ -4,6 +4,13 @@ ${DEFINE_COVERTY_SCAN} /* GPU support */ ${DEFINE_CUDA_GPU} +/* HIP GPU support */ +${DEFINE_HIP_GPU} + +/* HIP Cudify GPU support */ +${DEFINE_CUDIFY_USE_HIP} + + /* Debug */ ${DEFINE_DEBUG} /**/ @@ -32,6 +39,12 @@ ${DEFINE_HAVE_BOOST_PROGRAM_OPTIONS} /**/ /* define if the Boost::Unit_Test_Framework library is available */ ${DEFINE_HAVE_BOOST_UNIT_TEST_FRAMEWORK} /**/ +/* define if the Boost::Context library is available */ +${DEFINE_HAVE_BOOST_CONTEXT} /**/ + +/* define if the Boost::Fiber library is available */ +${DEFINE_HAVE_BOOST_FIBER} /**/ + /* Have clock time */ ${DEFINE_HAVE_CLOCK_GETTIME} /**/ @@ -116,6 +129,19 @@ ${DEFINE_ACTION_ON_ERROR} /* NVCC compiling */ ${DEFINE_NVCC} /**/ +/* Define if we have Alpaka */ +${DEFINE_HAVE_ALPAKA} + +/* Additional alpaka definitions */ +${ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE_DEF} +${ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE_DEF} +${ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE_DEF} +${ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE_DEF} +${ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE_DEF} +${ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE_DEF} +${ALPAKA_ACC_CPU_BT_OMP4_ENABLE_DEF} + + /* Name of package */ #define PACKAGE "openfpm_pdata" @@ -155,6 +181,12 @@ ${DEFINE_STDC_HEADERS} /* If an error occur stop the program */ ${DEFINE_STOP_ON_ERROR} +/* Garbage injector*/ +${DEFINE_GARBAGE_INJECTOR} + +/* VCluster Garbage injector*/ +${DEFINE_VCLUSTER_GARBAGE_INJECTOR} + /* Test coverage mode */ ${DEFINE_TEST_COVERAGE_MODE} diff --git a/src/isolation.cu b/src/isolation.cu new file mode 100644 index 0000000000000000000000000000000000000000..29b113100fc359acacfe4832602f3ce7a597f1ca --- /dev/null +++ b/src/isolation.cu @@ -0,0 +1,270 @@ +#include <iostream> +#include <thread> + +size_t 
debug_tot_call = 0; + +#define PRINT_STACKTRACE +#define CHECKFOR_POSNAN +#define CHECKFOR_POSINF +#define CHECKFOR_PROPNAN +#define CHECKFOR_PROPINF + +#define NO_WARNING +#include "Graph/CartesianGraphFactory.hpp" + +void timeout_cycle() +{ + // 900 seconds (15 minutes) + std::this_thread::sleep_for (std::chrono::seconds(900)); + + std::cout << "Time Out" << std::endl; + std::exit(1); +} + + +#define BOOST_DISABLE_ASSERTS + + +#include "config.h" +#undef VERSION + +#define BOOST_TEST_DYN_LINK +#include <boost/test/unit_test.hpp> +#include "VCluster/VCluster.hpp" +#include <Vector/vector_dist.hpp> +#include "Vector/tests/vector_dist_util_unit_tests.hpp" + +// initialization function: +bool init_unit_test() +{ +// std::thread to (timeout_cycle); +// to.detach(); + return true; +} + +// entry point +int main(int argc, char* argv[]) +{ + openfpm_init(&argc,&argv); + + return boost::unit_test::unit_test_main( &init_unit_test, argc, argv ); +} + + + +BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu ) +{ + + + Vcluster<> & v_cl = create_vcluster(); + + long int k = 25*25*25*create_vcluster().getProcessingUnits(); + k = std::pow(k, 1/3.); + + if (v_cl.getProcessingUnits() > 48) + return; + + BOOST_TEST_CHECKPOINT( "Testing 3D periodic ghost put k=" << k ); + + long int big_step = k / 30; + big_step = (big_step == 0)?1:big_step; + long int small_step = 21; + + // 3D test + for ( ; k >= 2 ; k-= (k > 2*big_step)?big_step:small_step ) + { + float r_cut = 1.3 / k; + float r_g = 1.5 / k; + + Box<3,float> box({0.0,0.0,0.0},{1.0,1.0,1.0}); + + // Boundary conditions + size_t bc[3]={PERIODIC,PERIODIC,PERIODIC}; + + // ghost + Ghost<3,float> ghost(r_g); + + typedef aggregate<float,float,float> part_prop; + + // Distributed vector + vector_dist_gpu<3,float, part_prop > vd(0,box,bc,ghost); + + auto it = vd.getGridIterator({(size_t)k,(size_t)k,(size_t)k}); + + while (it.isNext()) + { + auto key = it.get(); + + vd.add(); + + vd.getLastPosWrite()[0] = key.get(0)*it.getSpacing(0); +
vd.getLastPosWrite()[1] = key.get(1)*it.getSpacing(1); + vd.getLastPosWrite()[2] = key.get(2)*it.getSpacing(2); + + // Zero properties 0 and 2, the accumulators used below + + vd.getLastPropWrite<0>() = 0.0; + + vd.getLastPropWrite<2>() = 0.0; + + ++it; + } + + vd.map(); + + vd.hostToDevicePos(); + vd.template hostToDeviceProp<0,2>(); + // sync the ghost + vd.ghost_get<0,2>(RUN_ON_DEVICE); + vd.template deviceToHostProp<0,2>(); + vd.deviceToHostPos(); + + { + auto NN = vd.getCellList(r_cut); + float a = 1.0f*k*k; + + // run through all the particles + ghost + + auto it2 = vd.getDomainIterator(); + + while (it2.isNext()) + { + // particle p + auto p = it2.get(); + Point<3,float> xp = vd.getPos(p); + + // Get an iterator over the neighborhood particles of p + auto Np = NN.getNNIterator<NO_CHECK>(NN.getCell(xp)); + + // For each neighborhood particle ... + while (Np.isNext()) + { + auto q = Np.get(); + Point<3,float> xq = vd.getPosRead(q); + + float dist = xp.distance(xq); + + if (dist < r_cut) + { + vd.getPropWrite<0>(q) += a*(-dist*dist+r_cut*r_cut); + vd.getPropWrite<2>(q) += a*(-dist*dist+r_cut*r_cut); + } + + ++Np; + } + + ++it2; + } + + vd.hostToDevicePos(); + vd.template hostToDeviceProp<0,2>(); + vd.template ghost_put<add_atomic_,0,2>(RUN_ON_DEVICE); + vd.template deviceToHostProp<0,2>(); + vd.deviceToHostPos(); + + bool ret = true; + auto it3 = vd.getDomainIterator(); + + float constant = vd.getProp<0>(it3.get()); + float eps = 0.001; + + while (it3.isNext()) + { + float constant2 = vd.getProp<0>(it3.get()); + float constant3 = vd.getProp<2>(it3.get()); + if (fabs(constant - constant2)/constant > eps || fabs(constant - constant3)/constant > eps) + { + Point<3,float> p = vd.getPosRead(it3.get()); + + std::cout << p.toString() << " " << constant2 << "/" << constant << "/" << constant3 << " " << v_cl.getProcessUnitID() << std::endl; + ret = false; + break; + } + + ++it3; + } + BOOST_REQUIRE_EQUAL(ret,true); + } + + auto itp = vd.getDomainAndGhostIterator(); + while (itp.isNext()) + { +
auto key = itp.get(); + + vd.getPropWrite<0>(key) = 0.0; + vd.getPropWrite<2>(key) = 0.0; + + ++itp; + } + + { + auto NN = vd.getCellList(r_cut); + float a = 1.0f*k*k; + + // run trough all the particles + ghost + + auto it2 = vd.getDomainIterator(); + + while (it2.isNext()) + { + // particle p + auto p = it2.get(); + Point<3,float> xp = vd.getPosRead(p); + + // Get an iterator over the neighborhood particles of p + auto Np = NN.getNNIterator<NO_CHECK>(NN.getCell(xp)); + + // For each neighborhood particle ... + while (Np.isNext()) + { + auto q = Np.get(); + Point<3,float> xq = vd.getPosRead(q); + + float dist = xp.distance(xq); + + if (dist < r_cut) + { + vd.getPropWrite<0>(q) += a*(-dist*dist+r_cut*r_cut); + vd.getPropWrite<2>(q) += a*(-dist*dist+r_cut*r_cut); + } + + ++Np; + } + + ++it2; + } + + vd.hostToDevicePos(); + vd.template hostToDeviceProp<0,2>(); + vd.template ghost_put<add_atomic_,0>(RUN_ON_DEVICE); + vd.template ghost_put<add_atomic_,2>(RUN_ON_DEVICE); + vd.template deviceToHostProp<0,2>(); + vd.deviceToHostPos(); + + bool ret = true; + auto it3 = vd.getDomainIterator(); + + float constant = vd.getPropRead<0>(it3.get()); + float eps = 0.001; + + while (it3.isNext()) + { + float constant2 = vd.getPropRead<0>(it3.get()); + float constant3 = vd.getPropRead<0>(it3.get()); + if (fabs(constant - constant2)/constant > eps || fabs(constant - constant3)/constant > eps) + { + Point<3,float> p = vd.getPosRead(it3.get()); + + std::cout << p.toString() << " " << constant2 << "/" << constant << "/" << constant3 << " " << it3.get().getKey() << " " << v_cl.getProcessUnitID() << std::endl; + ret = false; + break; + } + + ++it3; + } + BOOST_REQUIRE_EQUAL(ret,true); + } + } + + openfpm_finalize(); +} \ No newline at end of file diff --git a/src/lib/pdata.cpp b/src/lib/pdata.cpp index aa5ceb8b2d4af36b719d87b7c190ceae6afd43cd..731f562cd0f64d8fec424cfb7f2843c187ebe654 100644 --- a/src/lib/pdata.cpp +++ b/src/lib/pdata.cpp @@ -23,3 +23,7 @@ const std::string 
nm_e::attributes::name[] = {"communication","srcgid","dstgid"} const std::string nm_part_v::attributes::name[] = {"id","sub_id"}; const std::string nm_part_e::attributes::name[] = {"id"}; +double tot_merge = 0.0; /* NOTE(review): the four tot_* globals are defined here, zero-initialised, and declared extern in pdata.hpp; presumably running totals for the merge, local-merge, send/receive and pack phases - confirm at the use sites */ +double tot_loc_merge = 0.0; +double tot_sendrecv = 0.0; +double tot_pack = 0.0; diff --git a/src/lib/pdata.hpp b/src/lib/pdata.hpp index 919f9d6454cbd2f1668ec01fe77f799cb7d963ef..5418375628606322744fb42d3f801475e2eaedf6 100644 --- a/src/lib/pdata.hpp +++ b/src/lib/pdata.hpp @@ -6,5 +6,9 @@ constexpr int comp_host = 1; constexpr int comp_dev = 2; +extern double tot_merge; /* extern declarations matching the tot_* definitions added to pdata.cpp */ +extern double tot_loc_merge; +extern double tot_sendrecv; +extern double tot_pack; #endif diff --git a/src/scripts/postflight b/src/scripts/postflight new file mode 100644 index 0000000000000000000000000000000000000000..0a932ae788699b4fb700e0a6fb005b8608d3535c --- /dev/null +++ b/src/scripts/postflight @@ -0,0 +1,13 @@ +#! /bin/bash + +sed -i '' -e 's/Users\/rundeck/usr\/local\/openfpm\/dependencies/g' /usr/local/openfpm/source/openfpm_vars +sed -i '' -e 's/openfpm_pre/openfpm/g' /usr/local/openfpm/source/openfpm_vars +sed -i '' -e 's/dependencies\/openfpm_dependencies/dependencies/g' /usr/local/openfpm/source/openfpm_vars +sed -i '' -e 's/Users\/rundeck/usr\/local\/openfpm\/dependencies/g' /usr/local/openfpm/openfpm_pdata/include/example.mk +sed -i '' -e 's/openfpm_pre/openfpm/g' /usr/local/openfpm/openfpm_pdata/include/example.mk +sed -i '' -e 's/dependencies\/openfpm_dependencies/dependencies/g' /usr/local/openfpm/openfpm_pdata/include/example.mk + +chmod a+x /usr/local/openfpm/dependencies/MPI/bin/* + +echo "export OPAL_PREFIX=/usr/local/openfpm/dependencies/MPI" >> /usr/local/openfpm/source/openfpm_vars + diff --git a/src/scripts/postinst b/src/scripts/postinst new file mode 100644 index 0000000000000000000000000000000000000000..763d3ed561a4529800f9222de37392223c86c1b1 --- /dev/null +++ b/src/scripts/postinst @@ -0,0 +1,9 @@ +#!
/bin/bash
+# Post-install step: rewrite the dependency prefix recorded at build time to /usr/local/openfpm/dependencies in openfpm_vars and example.mk, then append OPAL_PREFIX to openfpm_vars.
+# Build-time prefix being replaced: /projects/ppm/rundeck/openfpm_super_bundles/$1/openfpm_dep_$1 (kept as a comment; as a bare line the shell tried to execute it as a command).
+# NOTE(review): $1 sits inside single quotes in the sed expressions below, so the shell does not expand it - confirm the placeholder is filled in before packaging.
+sed -i -e 's/projects\/ppm\/rundeck\/openfpm_super_bundles\/$1\/openfpm_dep_$1/usr\/local\/openfpm\/dependencies/g' /usr/local/openfpm/source/openfpm_vars
+sed -i -e 's/projects\/ppm\/rundeck\/openfpm_super_bundles\/$1\/openfpm_dep_$1/usr\/local\/openfpm\/dependencies/g' /usr/local/openfpm/openfpm_pdata/include/example.mk
+echo "export OPAL_PREFIX=/usr/local/openfpm/dependencies/MPI" >> /usr/local/openfpm/source/openfpm_vars
+
+
diff --git a/test_data/sgrid_gpu_output_1_0.vtk b/test_data/sgrid_gpu_output_1_0.vtk
index 4435178f5d4ab01c3c81f53f813db628a5871d4c..72ec5aa822068e2094132628121214e86ebbd211 100644
Binary files a/test_data/sgrid_gpu_output_1_0.vtk and b/test_data/sgrid_gpu_output_1_0.vtk differ
diff --git a/test_data/sgrid_gpu_output_2_0.vtk b/test_data/sgrid_gpu_output_2_0.vtk
index 4a2881cb496b475a232872cbff048fbcf40b56ab..5eec6e0e85213e0d8931fbf7b0b1fe6bf2b2e75d 100644
Binary files a/test_data/sgrid_gpu_output_2_0.vtk and b/test_data/sgrid_gpu_output_2_0.vtk differ
diff --git a/test_data/sgrid_gpu_output_2_1.vtk b/test_data/sgrid_gpu_output_2_1.vtk
index e9d5c3f52ebe3c43ab7ff2ba66048c893ad4145a..aa60b77a4dff9c1b28009fd600edb99fe7fd24e4 100644
Binary files a/test_data/sgrid_gpu_output_2_1.vtk and b/test_data/sgrid_gpu_output_2_1.vtk differ
diff --git a/test_data/sgrid_gpu_output_3_0.vtk b/test_data/sgrid_gpu_output_3_0.vtk
index 4a5b5abc09ecf5e98c38429a10c315ae6e69548b..6c3cd35349fa41292454b1fcac5b152b0642192b 100644
Binary files a/test_data/sgrid_gpu_output_3_0.vtk and b/test_data/sgrid_gpu_output_3_0.vtk differ
diff --git a/test_data/sgrid_gpu_output_3_1.vtk b/test_data/sgrid_gpu_output_3_1.vtk
index a56ffc0cf3af612d82623b60f907b98df7419474..1225f50b8007222b6cce5a4b01faf3b639341f01 100644
Binary files a/test_data/sgrid_gpu_output_3_1.vtk and b/test_data/sgrid_gpu_output_3_1.vtk differ
diff --git a/test_data/sgrid_gpu_output_3_2.vtk b/test_data/sgrid_gpu_output_3_2.vtk
index bcd4d9291097691367c5b6251c3b4ebb7e478ab8..3d0eebb7e615f16e7862a62df2d60df61879672d 100644 Binary files a/test_data/sgrid_gpu_output_3_2.vtk and b/test_data/sgrid_gpu_output_3_2.vtk differ diff --git a/test_data/test_data_three.h5 b/test_data/test_data_three.h5 new file mode 100644 index 0000000000000000000000000000000000000000..95c9f16dc62de0e39d558c96005849c52d9cee88 Binary files /dev/null and b/test_data/test_data_three.h5 differ